Structured Data Extraction

summ.structure.Structurer

Bases: Chain

Structurers infer a set of structured data to extract based on the query, then extract that data from every source document.
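
A minimal usage sketch (illustrative only: the import paths follow the module headings on this page, and it assumes the Chain base class supplies a configured LLM and cache):

from langchain.schema import Document
from summ.structure.structurer import Structurer

docs = [
    Document(page_content="Over in sales, we mostly use Google."),
    Document(page_content="Engineering switched to Bing last year."),
]

structurer = Structurer(
    query="In each department, how many times did people prefer Google over Bing?"
)
results = structurer.extract(docs)
# e.g. {"department": [...], "preferred": {...}} -- the exact shape depends
# on the Metric specs inferred from the query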

Source code in summ/structure/structurer.py
class Structurer(Chain):
    """Structurers infer a set of structured data to extract based on the query, then extract that data from every source document."""

    def __init__(self, query: str, **kwargs):
        super().__init__(**kwargs)
        self.query = query

    def metrics_template(self) -> PromptTemplate:
        """The template to transform a query into a list of metrics."""
        return PromptTemplate(
            template=dedent(
                f"""
                Use the query to determine which structured data is needed, and for each, write a specification which will extract and collect the data.
                If the query is qualitative, you can return an empty list.
                Your response must be in valid JSON format. Do not extract the information yet, just describe how to do so.
                The options for type are: {', '.join(list(MetricType))}.
                The options for collect are: {', '.join(list(MetricCollect))}.
                The prompt should minimize variance in the response.

                For example:
                Prompt: In each department, how many times did people prefer Google over Bing?
                Response:
                ```
                [
                    {{"metric": "department", "prompt": "Extract the company department that the user of this interview works in.", "type": "string", "collect": "list"}},
                    {{"metric": "preferred", "prompt": "Which of the following options best represents which search engine was preferred?", "type": "enum", "options": ["GOOGLE", "BING", "OTHER"], "collect": "count_unique"}},
                ]
                ```

                Prompt: {{{{ query }}}}
                Response:
                ```
                """
            ),
            input_variables=["query"],
            template_format="jinja2",
        )

    def doc_template(self) -> PromptTemplate:
        """The template to transform a document into a list of metrics."""

        return PromptTemplate(
            template=dedent(
                """
                Your task is to take a spec describing how to extract structured data, and apply it to a document.
                You must follow the spec exactly. For example, if the spec specifies an enum, your response must be one of the options.

                For example:
                Document:
                ```
                Yea over in sales, we prefer Google. Rest of the company likes DuckDuckGo.
                ```
                Spec:
                ```
                [
                    {"metric": "department", "prompt": "Extract the company department that the user of this interview works in.", "type": "string", "collect": "list"},
                    {"metric": "preferred", "prompt": "Which of the following options best represents which search engine was preferred?", "type": "enum", "options": ["GOOGLE", "BING", "OTHER"], "collect": "count_unique"},
                ]
                ```
                Response:
                ```
                {"department": "Engineering", "preferred": "GOOGLE"}
                ```

                Document:
                ```
                {{ text }}
                ```
                Spec:
                ```
                {{ spec }}
                ```
                Response:
                ```
                """
            ),
            input_variables=["text", "spec"],
            template_format="jinja2",
        )

    def clean_template(self) -> PromptTemplate:
        """The template to take a set of extracted metrics and clean them up."""

        return PromptTemplate(
            template=dedent(
                """
                Your task is to take a set of extracted metrics and clean them up.
                Each metric will have a spec. You can use the spec to determine how to clean the metric.
                The goal is to minimize variance and make this data useful for aggregation.
                If elements are semantically equivalent, they should be combined.

                For example:
                Query: In each department, how many times did people prefer Google over Bing?
                Spec:
                ```
                [
                    {"metric": "department", "prompt": "Extract the company department that the user of this interview works in.", "type": "string", "collect": "list"},
                    {"metric": "preferred", "prompt": "Which of the following options best represents which search engine was preferred?", "type": "enum", "options": ["GOOGLE", "BING", "OTHER"], "collect": "count_unique"},
                    {"metric": "feelings", "prompt": "How did the individual feel about the search engine?", "type": "string", "collect": "list"},
                ]
                ```
                Data:
                ```
                [
                    {
                        "department": ["Engineering", "eng", "engineering", "sales", "marketing", "markting"],
                        "preferred": {"GOOGLE": 3, "BING": 1, "OTHER": 1}
                        "feelings": ["it was truly awesome", "I really liked it", "awesome for all the months I used it", "sweet and liked it quite a bit"]
                    },
                ]
                ```
                Cleaned:
                ```
                [
                    {
                        "department": ["Engineering", "Sales", "Marketing"],
                        "preferred": {"GOOGLE": 3, "BING": 1}
                        "feelings": ["Awesome", "Liked It", "Sweet"],
                    },
                ]
                ```

                Spec:
                ```
                {{ spec }}
                ```
                Data:
                ```
                {{ data }}
                ```
                Cleaned:
                ```
                """
            ),
            input_variables=["spec", "data"],
            template_format="jinja2",
        )

    @cached_property
    def metrics(self) -> list[Metric]:
        """Extract the metrics from the query"""
        chain = LLMChain(llm=self.llm, prompt=self.metrics_template())
        results = self.cached(
            "metrics",
            chain,
            Document(page_content=self.query),
            lambda d: {"query": d.page_content, "stop": "```"},
        )
        try:
            return [
                m
                for o in cast(list, dirtyjson.loads(results))
                for m in [Metric.safe_parse(o)]
                if m is not None
            ]
        except Exception as e:
            logging.info(e)
            return []

    def metric(self, metric: str) -> Metric:
        return next(m for m in self.metrics if m.metric == metric)

    @property
    def spec(self) -> str:
        return f"[{', '.join([m.json() for m in self.metrics])}]"

    def extract_metrics(self, doc: Document) -> dict[str, MetricValue]:
        """Extract the metrics from the document"""
        results = self.cached(
            "extract_metrics",
            LLMChain(llm=self.llm, prompt=self.doc_template()),
            doc,
            lambda d: {
                "text": d.page_content,
                "spec": self.spec,
                "stop": "```",
            },
        )
        try:
            metrics = {
                k: m
                for k, v in cast(dict[str, TVal], dirtyjson.loads(results)).items()
                for m in [
                    MetricValue.safe_parse({"metric": self.metric(k), "value": v})
                ]
                if m and m.value is not None
            }
        except Exception as e:
            traceback.print_exc()
            metrics = {}

        return metrics

    def clean(self, metrics: dict[str, TVal_]) -> dict[str, TVal_]:
        """Clean up a set of extracted metrics."""

        results = self.cached(
            "clean_metrics",
            LLMChain(llm=self.llm, prompt=self.clean_template()),
            cast(list[Document], []),
            lambda _: {
                "spec": self.spec,
                "data": json.dumps(metrics),
                "stop": "```",
            },
        )

        try:
            return dirtyjson.loads(results)
        except Exception as e:
            logging.info(e)
            return metrics

    def _extract(self, docs: list[Document]) -> dict[str, TVal_]:
        metrics = self._pmap(self.extract_metrics, docs)
        formatted: dict[str, TVal_] = {
            m.metric: m.collect_fn(cs)
            for m in self.metrics
            for cs in [[x for v in metrics for x in [v.get(m.metric, None)] if x]]
        }
        if cleaned := self.clean(formatted):
            for k, v in cleaned.items():
                self.dprint("Metric", k, color="yellow")
                self.dprint("", v)
        return cleaned

    def extract(self, docs: list[Document]) -> dict[str, TVal_]:
        """Extract metrics from all documents"""
        try:
            return self._extract(docs)
        except Exception as e:
            logging.info(e)
            return {}

metrics: list[Metric] cached property

Extract the metrics from the query
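
For example (an illustrative sketch; the Metric field names follow the JSON spec produced by metrics_template, and the actual output depends on the LLM):

structurer = Structurer(
    query="In each department, how many times did people prefer Google over Bing?"
)
for m in structurer.metrics:  # the first access makes one cached LLM call
    print(m.metric, m.collect)
# might print:
# department list
# preferred count_unique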

metrics_template() -> PromptTemplate

The template to transform a query into a list of metrics.
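
The rendered prompt can be inspected with the standard langchain PromptTemplate API, for instance:

prompt = structurer.metrics_template().format(
    query="Which search engine does each department prefer?"
)
print(prompt)  # the instructions and few-shot example, with the query filled in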

doc_template() -> PromptTemplate

The template to transform a document into a list of metrics.

clean_template() -> PromptTemplate

The template to take a set of extracted metrics and clean them up.

extract_metrics(doc: Document) -> dict[str, MetricValue]

Extract the metrics from the document
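
A sketch of per-document extraction, continuing the usage example under Structurer above (the MetricValue fields are assumed from the safe_parse call in the source):

doc = Document(page_content="Yea over in sales, we prefer Google.")
values = structurer.extract_metrics(doc)
# e.g. {"department": <MetricValue value="Sales">,
#       "preferred": <MetricValue value="GOOGLE">}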

clean(metrics: dict[str, TVal_]) -> dict[str, TVal_]

Clean up a set of extracted metrics.
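
The input and output shapes mirror the few-shot example in clean_template; a sketch:

raw = {
    "department": ["Engineering", "eng", "engineering", "sales"],
    "preferred": {"GOOGLE": 3, "BING": 1},
}
cleaned = structurer.clean(raw)
# e.g. {"department": ["Engineering", "Sales"], "preferred": {"GOOGLE": 3, "BING": 1}}
# if the LLM response fails to parse, the input is returned unchanged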

extract(docs: list[Document]) -> dict[str, TVal_]

Extract metrics from all documents

summ.structure.SQLStructurer

Bases: Structurer

Constructs an in-memory SQLite database to store and query the structured data.
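
A minimal usage sketch (illustrative; note that _extract wraps the cleaned rows in a {"data": ...} dict, and the row keys below are hypothetical):

from summ.structure.sql_structurer import SQLStructurer

structurer = SQLStructurer(
    query="In each department, how many times did people prefer Google over Bing?"
)
results = structurer.extract(docs)  # docs as in the Structurer example above
# e.g. {"data": [{"Department": "Sales", "Responses": "...", "AvgConfidenceScore": 92.0}, ...]}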

Source code in summ/structure/sql_structurer.py
class SQLStructurer(Structurer):
    """Constructs an in-memory SQLite database to store and query the structured data."""

    def __init__(self, query: str, **kwargs):
        super().__init__(query, **kwargs)
        self.llm.max_tokens = 1024
        self.conn = sqlite3.connect(":memory:")
        self.conn.row_factory = sqlite3.Row

    def metrics_template(self) -> PromptTemplate:
        """The template to transform a query into a list of metrics."""

        return PromptTemplate(
            template=dedent(
                f"""
                Use the query to determine which structured data is needed, and use this to create a SQL table DDL.
                Include a confidence score column with values from 0 to 100.
                If the query is qualitative, you can return an empty table.
                Your response must be valid and complete SQL.

                Prompt: {{{{ query }}}}
                DDL:
                ```
                """
            ),
            input_variables=["query"],
            template_format="jinja2",
        )

    def doc_template(self) -> PromptTemplate:
        """The template to transform a document into a list of metrics."""

        return PromptTemplate(
            template=dedent(
                """
                You will be provided with the schema for a SQL table.
                Write between zero and three SQL statements which will insert data into the table.
                You do not need to use all three statements.
                Do not insert data which is not relevant to the query. Do not insert data which is ambiguous. Do not insert data which is noisy or too long.
                Only insert data that is derived from the document provided. Do not guess or make up data.
                For each row, record your confidence that the data is relevant to the query as a number from 0 to 100, using the confidence score column.
                Your response must be valid and complete SQL.

                Query: {{ query }}
                Document:
                ```
                {{ text }}
                ```
                Schema:
                ```
                {{ schema }}
                ```
                Response:
                ```
                """
            ),
            input_variables=["query", "text", "schema"],
            template_format="jinja2",
        )

    def clean_template(self) -> PromptTemplate:
        """The template to extract the relevant rows."""

        return PromptTemplate(
            template=dedent(
                """
                Write a SQLite statement which will clean and extract rows from the table.
                Use CTEs to process and clean the data. Apply a WHERE clause to extract the relevant rows.
                The cleaning rules must be short and simple. They do not need to be comprehensive.
                Your response must be valid and complete SQLite.
                No string literal in the response may be longer than 200 characters. The response must be less than 512 tokens.
                The sample data provided is not comprehensive.

                --
                This is an example. Use it as a guide, but do not copy it.
                The content is not relevant to the task.
                Your query can take a different form.
                ```
                WITH CleanedData AS (
                SELECT
                    CASE
                    WHEN Department LIKE '%eng%' THEN 'Engineering'
                    WHEN Department = 'sales' THEN 'Sales & Marketing'
                    WHEN Department = 'marketing' THEN 'Sales & Marketing'
                    ELSE NULL
                    END AS Department,
                    CASE
                    WHEN (Answer = 'yup' OR Answer = 'yes') THEN 'Yes'
                    WHEN Answer LIKE 'no%' THEN 'No'
                    ELSE Answer
                    END AS Response,
                    ConfidenceScore
                FROM MyTable
                )
                SELECT
                Department,
                GROUP_CONCAT(Response, ', ') AS Responses,
                AVG(ConfidenceScore) AS AvgConfidenceScore
                FROM CleanedData
                WHERE ConfidenceScore > 80
                GROUP BY Department
                HAVING COUNT(Response) > 2
                ORDER BY AVG(ConfidenceScore) DESC
                LIMIT 10;
                ```
                --

                Now with the real input.

                Query: {{ query }}
                Schema:
                ```
                {{ schema }}
                ```
                Sample Data:
                ```
                {{ data }}
                ```
                Response:
                ```
                """
            ),
            input_variables=["query", "schema", "data"],
            template_format="jinja2",
        )

    @cached_property
    def schema(self) -> str:
        """Extract the DDL for a table from the query."""

        chain = LLMChain(llm=self.llm, prompt=self.metrics_template())
        results = self.cached(
            "sql",
            chain,
            Document(page_content=self.query),
            lambda d: {"query": d.page_content, "stop": "```"},
        )

        return results

    @cached_property
    def table_name(self) -> str:
        curr = self.conn.cursor()
        curr.execute("SELECT * FROM sqlite_schema WHERE type='table';")
        res = curr.fetchone()
        return res["name"]

    def extract_metrics(self, doc: Document):
        """Extract the metrics from the document"""
        results = self.cached(
            "extract_metrics",
            LLMChain(llm=self.llm, prompt=self.doc_template()),
            doc,
            lambda d: {
                "query": self.query,
                "text": d.page_content,
                "schema": self.schema,
                "stop": "```",
            },
        )
        return results

    def clean(self, metrics: list[sqlite3.Row]) -> list[dict]:
        """Extract cleaned metrics from all documents"""
        stmnt = self.cached(
            "clean_metrics",
            LLMChain(llm=self.llm, prompt=self.clean_template()),
            cast(list[Document], []),
            lambda _: {
                "query": self.query,
                "schema": self.schema,
                "data": "\n".join([str(tuple(dict(x).values())) for x in metrics]),
                "stop": "```",
            },
        )

        try:
            return [dict(x) for x in self.conn.cursor().execute(stmnt).fetchall()]
        except Exception as e:
            logging.info(e)
            return [dict(x) for x in metrics]

    def _extract(self, docs: list[Document]) -> dict[str, TVal_]:
        self.conn.cursor().executescript(self.schema)

        metrics = self._pmap(self.extract_metrics, docs)
        for metric in metrics:
            try:
                self.conn.cursor().executescript(metric)
            except Exception as e:
                logging.info(e)

        metrics = (
            self.conn.cursor()
            .execute(f"SELECT * FROM {self.table_name} ORDER BY RANDOM() LIMIT 50;")
            .fetchall()
        )
        if cleaned := self.clean(metrics):
            for metric in cleaned[0].keys():
                self.dprint("Metric", metric, color="yellow")
                self.dprint("", "\n--\n".join([str(x[metric]) for x in cleaned[:5]]))

        return {"data": cleaned}

schema: str cached property

Extract the DDL for a table from the query.
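
The first access makes one cached LLM call; a hypothetical result (the table and column names are whatever the LLM chooses):

print(structurer.schema)
# CREATE TABLE SearchEnginePreferences (
#     Department TEXT,
#     PreferredEngine TEXT,
#     ConfidenceScore INTEGER
# );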

metrics_template() -> PromptTemplate

The template to transform a query into a list of metrics.

doc_template() -> PromptTemplate

The template to transform a document into a list of metrics.

clean_template() -> PromptTemplate

The template to extract the relevant rows.

extract_metrics(doc: Document)

Extract the metrics from the document
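
Unlike the base-class version, this returns raw SQL text rather than parsed values; a sketch with hypothetical output:

sql = structurer.extract_metrics(doc)
# INSERT INTO SearchEnginePreferences (Department, PreferredEngine, ConfidenceScore)
# VALUES ('Sales', 'Google', 90);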

clean(metrics: list[sqlite3.Row]) -> list[dict]

Extract cleaned metrics from all documents
