Splitting

summ.splitter.Splitter

Splitters are responsible for taking a file and splitting it into a list of documents (chunks).

By default, we just split on double newlines (paragraphs).
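
A minimal usage sketch (the file name and text here are hypothetical; each resulting document carries the source file name in its metadata, and long paragraphs are further re-chunked to roughly 100 tokens):

from summ.splitter import Splitter

text = "First paragraph.\n\nSecond paragraph."
docs = Splitter().split("interview-01.txt", text)
# Each document's metadata records its origin, e.g. {"file": "interview-01.txt"}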

Source code in summ/splitter/splitter.py
class Splitter:
    """Splitters are responsible for taking a file and splitting it into a list of documents (chunks).

    By default, we just split on double newlines (paragraphs).
    """

    @classmethod
    def wrap(cls: Type[Self], other: "Splitter") -> "Splitter":
        """Wrap an existing splitter to chain processing."""

        class WrappedSplitter(cls):  # type: ignore
            def split(self, title: str, text: str):
                docs = other.split(title, text)
                prefix = os.path.commonprefix([doc.page_content for doc in docs])
                return super().split(
                    title,
                    "\n\n".join(doc.page_content.removesuffix(prefix) for doc in docs),
                )

        return WrappedSplitter()

    def __init__(self) -> None:
        self.splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=100,
            chunk_overlap=0,
        )

    def get_chunks(self, title: str, text: str) -> list[str]:
        return text.split("\n\n")

    def split(self, title: str, text: str):
        chunks = self.get_chunks(title, text)
        return self.splitter.create_documents(
            chunks, cast(list, UnsharedDictList({"file": title}, len(chunks)))
        )
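
Custom strategies only need to override get_chunks; split then re-chunks each piece to roughly 100 tokens (via the tiktoken-based splitter above) and attaches the file name as metadata. A small sketch, where the Markdown-heading rule is purely illustrative and not part of the library:

class MarkdownSplitter(Splitter):
    def get_chunks(self, title: str, text: str) -> list[str]:
        # Split on top-level Markdown headings instead of blank lines.
        return [chunk for chunk in text.split("\n# ") if chunk.strip()]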

wrap(other: Splitter) -> Splitter classmethod

Wrap an existing splitter to chain processing.

Source code in summ/splitter/splitter.py
@classmethod
def wrap(cls: Type[Self], other: "Splitter") -> "Splitter":
    """Wrap an existing splitter to chain processing."""

    class WrappedSplitter(cls):  # type: ignore
        def split(self, title: str, text: str):
            docs = other.split(title, text)
            prefix = os.path.commonprefix([doc.page_content for doc in docs])
            return super().split(
                title,
                "\n\n".join(doc.page_content.removesuffix(prefix) for doc in docs),
            )

    return WrappedSplitter()
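
A sketch of how this might be used (the OtterSplitter documented below runs first, and the wrapping base Splitter then joins its chunks with double newlines and re-splits them; file and speaker names are hypothetical):

chained = Splitter.wrap(OtterSplitter(speakers_to_exclude=["alice"]))
docs = chained.split("call-notes.txt", otter_transcript)
# other.split() runs first; the outer Splitter.split() then re-chunks the joined text.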

summ.splitter.OtterSplitter

Bases: Splitter

Adds Splitter support for transcripts exported from otter.ai.

To filter out your own remarks, pass a list of speakers to exclude.
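
A minimal sketch (speaker and file names are hypothetical; the speaker line is lowercased before matching, so pass lowercase names):

from summ.splitter import OtterSplitter

splitter = OtterSplitter(speakers_to_exclude=["alice"])
docs = splitter.split("user-interview.txt", otter_export_text)
# Keeps only utterances whose speaker line does not contain an excluded name.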

Source code in summ/splitter/otter.py
class OtterSplitter(Splitter):
    """Adds Splitter support for transcripts exported from otter.ai.

    To filter out your own remarks, pass a list of speakers to exclude.
    """

    def __init__(self, speakers_to_exclude: list[str] = []) -> None:
        super().__init__()
        self.exclude = speakers_to_exclude

    def get_chunks(self, title: str, text: str) -> list[str]:
        return [
            speaker_chunk[1]
            for utterance in text.split("\n\n")
            for speaker_chunk in [utterance.split("\n")]
            if "\n" in utterance
            and not any(sp in speaker_chunk[0].lower() for sp in self.exclude)
        ]