Pipeline¶

`summ.pipeline` ¶

`Pipeline` ¶

Bases: Chain, Generic[C]

The end-to-end population pipeline.

This class will:

- Take an Importer and yield a set of file-like objects.
- Split the file-like objects into a set of chunks with a Splitter.
- Extract facts from each chunk with a Factifier.
- Extract a summary from each chunk with a Summarizer.
- Embed, and optionally persist, each chunk with an Embedder.

Source code in summ/pipeline.py

class Pipeline(Chain, Generic[C]):
    """The end-to-end population pipeline.

    This class will:
    - Take an Importer and yield a set of file-like objects.
    - Split the file-like objects into a set of chunks with a Splitter.
    - Extract facts from each chunk with a Factifier.
    - Extract a summary from each chunk with a Summarizer.
    - Embed, and optionally persist, each chunk with an Embedder.
    """

    importer: Importer
    embedder: Embedder

    @classmethod
    def default(cls, path: Path, index: str) -> Self:
        """Build a pipeline with persistence and verbose output enabled.

        Args:
            path: Passed to `Importer` to locate the source files.
            index: Passed to `Embedder` to select the target index.
        """
        return cls(
            importer=Importer(path),
            embedder=Embedder(index),
            persist=True,
            verbose=True,
        )

    def __init__(
        self,
        importer: Importer,
        embedder: Embedder,
        persist: bool = False,
        verbose: bool = False,
    ):
        """Wire up the pipeline stages.

        Args:
            importer: Source of the file-like blobs to process.
            embedder: Used to embed (or persist) every processed document.
            persist: When true, `embedder.persist` is called instead of
                `embedder.embed`.
            verbose: Forwarded to the `Chain` base class.
        """
        super().__init__(verbose=verbose)
        self.splitter = Splitter()
        self.factifier = Factifier()
        # The parametrised class itself is stored (no instance is created);
        # `classify_all` is invoked directly on it.
        self.classifier = Classifier[C]
        self.summarizer = Summarizer()
        self.embedder = embedder
        self.importer = importer
        self.persist = persist

    @retry(
        exceptions=RateLimitError,
        tries=5,
        delay=10,
        backoff=2,
        max_delay=120,
        jitter=(0, 10),
    )
    def _run_doc_steps(self, doc: Document, classes: dict[str, list[C]]) -> None:
        """Fill in the missing metadata entries of `doc`, retrying on rate limits.

        Each step is skipped when its key is already present in
        `doc.metadata`, so a retry after a `RateLimitError` resumes from the
        step that failed instead of redoing completed work.
        """
        if "classes" not in doc.metadata:
            doc.metadata["classes"] = classes

        self.dprint("Factify", color="yellow")
        if "facts" not in doc.metadata:
            doc.metadata["facts"] = self.factifier.factify(doc)
        self.dprint("", doc.metadata["facts"])

        self.dprint("Summarize", color="yellow")
        if "summary" not in doc.metadata:
            doc.metadata["summary"] = self.summarizer.summarize_doc(doc)
        self.dprint("", doc.metadata["summary"])

        if "embeddings" not in doc.metadata:
            doc.metadata["embeddings"] = (
                self.embedder.persist(doc)
                if self.persist
                else self.embedder.embed(doc)
            )

    def _process_doc(self, doc: Document, classes: dict[str, list[C]]) -> Document:
        """Classify, factify, summarize and embed a single document in place.

        Processing is best-effort: any `Exception` is logged and the
        (possibly partially populated) document is still returned. When
        running under pytest (`PYTEST_CURRENT_TEST` is set) the exception is
        re-raised so that tests fail loudly.

        Args:
            doc: The document to enrich; its `metadata` dict is mutated.
            classes: Classification result stored under `metadata["classes"]`.

        Returns:
            The same `doc` object.
        """
        self.dprint(
            f"Document {self._ppprogress()}",
            metrohash.hash64(doc.page_content).hex()[:5],
            color="magenta",
        )
        with self.dprint.indent_children():
            try:
                # Rate-limit retries happen inside the helper, *before* the
                # catch-all handler below gets a chance to swallow the error.
                self._run_doc_steps(doc, classes)
            except Exception as e:
                # `.get` keeps the error path itself from raising KeyError.
                logging.error(
                    f"Error processing {doc.metadata.get('file', '<unknown>')}"
                )
                traceback.print_exception(e)
                if "PYTEST_CURRENT_TEST" in os.environ:
                    raise
        # Deliberately not `finally: return doc` -- a return inside `finally`
        # discards the in-flight exception, defeating the re-raise above.
        return doc

    def _split_blob(self, blob: TextIO) -> list[Document]:
        """Split one blob, passing its file stem and full text to the splitter."""
        return self.splitter.split(Path(blob.name).stem, blob.read())

    def _process_blob(self, blob: TextIO) -> Iterable[Document]:
        """Split a blob, classify its documents, and lazily process each one."""
        docs = self._split_blob(blob)
        classes = self.classifier.classify_all(docs)
        return map(partial(self._process_doc, classes=classes), docs)

    def _rung(self, blobs: Iterable[TextIO]) -> Generator[Document, None, None]:
        """Sequentially process `blobs`, yielding documents one at a time."""
        yield from chain.from_iterable(map(self._process_blob, blobs))

    def _runpg(self, blobs: Iterable[TextIO]) -> Generator[Document, None, None]:
        """Process `blobs` file by file via `_pmap`, yielding processed documents."""
        all_docs = self._pmap(self._split_blob, blobs)
        # Count from 1 so the final file is reported as [N/N].
        for i, docs in enumerate(all_docs, start=1):
            if not docs:
                # A blob that produced no documents has nothing to label,
                # classify or process; `docs[0]` below would raise IndexError.
                continue
            self.dprint(
                f"File [{i}/{len(all_docs)}]",
                docs[0].metadata["file"][:5],
                color="green",
            )
            with self.dprint.indent_children():
                self.dprint("Classify", color="cyan")
                classes = self.classifier.classify_all(docs)
                self.dprint("", {k: [x.name for x in v] for k, v in classes.items()})
                yield from self._pmap(self._process_doc, docs, classes)

    def _runp(self, blobs: Iterable[TextIO]) -> list[Document]:
        """Collect the output of `_runpg` into a list."""
        return list(self._runpg(blobs))

    def corpus(self) -> Generator[Document, None, None]:
        """Yields the extracted source corpus"""
        # NOTE(review): the splitter is re-wrapped on every call to corpus();
        # confirm that repeated wrapping is harmless.
        self.splitter = GPTSplitter.wrap(self.splitter)
        for docs in self._pmap(self._split_blob, self.importer.blobs):
            yield from docs

    def rung(self) -> Generator[Document, None, None]:
        """Yields one processed Document at a time.

        Helpful for when you want to test only a small part of your pipeline.
        """

        yield from self._rung(self.importer.blobs)

    def runp(self) -> list[Document]:
        """Calculates all embeddings in parallel. Very fast!"""

        return self._runp(self.importer.blobs)

    def run(self, parallel: bool = True) -> list[Document]:
        """Run the whole pipeline and return every processed document.

        Args:
            parallel: Use `runp` when true, otherwise exhaust `rung`.
        """
        return self.runp() if parallel else list(self.rung())

`corpus() -> Generator[Document, None, None]` ¶

Yields the extracted source corpus

Source code in summ/pipeline.py

def corpus(self) -> Generator[Document, None, None]:
    """Yield every document obtained by splitting the imported blobs.

    The current splitter is first replaced by the result of
    `GPTSplitter.wrap`; each blob is then split and the resulting
    documents are yielded blob by blob.
    """
    wrapped_splitter = GPTSplitter.wrap(self.splitter)
    self.splitter = wrapped_splitter
    split_results = self._pmap(self._split_blob, self.importer.blobs)
    for blob_documents in split_results:
        yield from blob_documents

`rung() -> Generator[Document, None, None]` ¶

Yields one Embedding at a time.

Helpful for when you want to test only a small part of your pipeline.

Source code in summ/pipeline.py

def rung(self) -> Generator[Document, None, None]:
    """Lazily produce the pipeline's output, one item at a time.

    Delegates to `_rung` with the importer's blobs. Handy for exercising
    only a small part of the pipeline.
    """
    source_blobs = self.importer.blobs
    yield from self._rung(source_blobs)

`runp() -> list[Document]` ¶

Calculates all embeddings in parallel. Very fast!

Source code in summ/pipeline.py

def runp(self) -> list[Document]:
    """Run the parallel path over the importer's blobs and return its result."""
    source_blobs = self.importer.blobs
    return self._runp(source_blobs)

Pipeline¶

summ.pipeline ¶

Pipeline ¶

corpus() -> Generator[Document, None, None] ¶

rung() -> Generator[Document, None, None] ¶

runp() -> list[Document] ¶

`summ.pipeline` ¶

`Pipeline` ¶

`corpus() -> Generator[Document, None, None]` ¶

`rung() -> Generator[Document, None, None]` ¶

`runp() -> list[Document]` ¶