Splitting

summ.splitter.Splitter

Splitters are responsible for taking a file and splitting it into a list of documents (chunks).

By default, we just split on double newlines (paragraphs).
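
A minimal usage sketch (the file name and text here are hypothetical; each resulting document carries the source file name in its metadata, and long paragraphs are further re-chunked to roughly 100 tokens):

from summ.splitter import Splitter

text = "First paragraph.\n\nSecond paragraph."
docs = Splitter().split("interview-01.txt", text)
# Each document's metadata records its origin, e.g. {"file": "interview-01.txt"}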

Source code in summ/splitter/splitter.py
class Splitter:
    """Splitters are responsible for taking a file and splitting it into a list of documents (chunks).

    By default, we just split on double newlines (paragraphs).
    """

    @classmethod
    def wrap(cls: Type[Self], other: "Splitter") -> "Splitter":
        """Wrap an existing splitter to chain processing."""

        class WrappedSplitter(cls):  # type: ignore
            def split(self, title: str, text: str):
                docs = other.split(title, text)
                prefix = os.path.commonprefix([doc.page_content for doc in docs])
                return super().split(
                    title,
                    "\n\n".join(doc.page_content.removesuffix(prefix) for doc in docs),
                )

        return WrappedSplitter()

    def __init__(self) -> None:
        self.splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=100,
            chunk_overlap=0,
        )

    def get_chunks(self, title: str, text: str) -> list[str]:
        return text.split("\n\n")

    def split(self, title: str, text: str):
        chunks = self.get_chunks(title, text)
        return self.splitter.create_documents(
            chunks, cast(list, UnsharedDictList({"file": title}, len(chunks)))
        )
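
Custom strategies only need to override get_chunks; split then re-chunks each piece to roughly 100 tokens (via the tiktoken-based splitter above) and attaches the file name as metadata. A small sketch, where the Markdown-heading rule is purely illustrative and not part of the library:

class MarkdownSplitter(Splitter):
    def get_chunks(self, title: str, text: str) -> list[str]:
        # Split on top-level Markdown headings instead of blank lines.
        return [chunk for chunk in text.split("\n# ") if chunk.strip()]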

wrap(other: Splitter) -> Splitter classmethod

Wrap an existing splitter to chain processing.

Source code in summ/splitter/splitter.py
@classmethod
def wrap(cls: Type[Self], other: "Splitter") -> "Splitter":
    """Wrap an existing splitter to chain processing."""

    class WrappedSplitter(cls):  # type: ignore
        def split(self, title: str, text: str):
            docs = other.split(title, text)
            prefix = os.path.commonprefix([doc.page_content for doc in docs])
            return super().split(
                title,
                "\n\n".join(doc.page_content.removesuffix(prefix) for doc in docs),
            )

    return WrappedSplitter()
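
A sketch of how this might be used (the OtterSplitter documented below runs first, and the wrapping base Splitter then joins its chunks with double newlines and re-splits them; file and speaker names are hypothetical):

chained = Splitter.wrap(OtterSplitter(speakers_to_exclude=["alice"]))
docs = chained.split("call-notes.txt", otter_transcript)
# other.split() runs first; the outer Splitter.split() then re-chunks the joined text.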

summ.splitter.OtterSplitter

Bases: Splitter

Adds Splitter support for transcripts exported from otter.ai.

To filter out your own remarks, pass a list of speakers to exclude.
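
A minimal sketch (speaker and file names are hypothetical; the speaker line is lowercased before matching, so pass lowercase names):

from summ.splitter import OtterSplitter

splitter = OtterSplitter(speakers_to_exclude=["alice"])
docs = splitter.split("user-interview.txt", otter_export_text)
# Keeps only utterances whose speaker line does not contain an excluded name.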

Source code in summ/splitter/otter.py
class OtterSplitter(Splitter):
    """Adds Splitter support for transcripts exported from otter.ai.

    To filter out your own remarks, pass a list of speakers to exclude.
    """

    def __init__(self, speakers_to_exclude: list[str] = []) -> None:
        super().__init__()
        self.exclude = speakers_to_exclude

    def get_chunks(self, title: str, text: str) -> list[str]:
        return [
            speaker_chunk[1]
            for utterance in text.split("\n\n")
            for speaker_chunk in [utterance.split("\n")]
            if "\n" in utterance
            and not any(sp in speaker_chunk[0].lower() for sp in self.exclude)
        ]