Embedding

summ.embed.Embedding

Bases: CacheItem

A serializable embedding vector, representing a query.

Always has an associated fact.

Source code in summ/embed/embedder.py
class Embedding(CacheItem):
    """A serializable embedding vector, representing a query.

    Always has an associated fact."""

    document: CacheDocument
    query: str
    fact: str
    embedding: list[float]

    @classmethod
    def make_pk(cls, instance: Self) -> str:
        return cls._hash(instance.query)
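
A minimal usage sketch (the query, fact, and vector values are hypothetical; passthrough and save come from the CacheItem base class):

emb = Embedding.passthrough(query="What is the user's favorite feature?")
emb.fact = "The user's favorite feature is search."
emb.embedding = [0.0] * 1536  # placeholder vector, not a real embedding
print(Embedding.make_pk(emb))  # the primary key is a hash of the query string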

summ.embed.Embedder

Embedders are responsible for taking fully-populated Documents and embedding them, optionally persisting them to a vector store in the process.

Currently, only Pinecone is supported.
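
A rough end-to-end sketch of the intended flow (index name, credentials, and document contents are hypothetical; Document is assumed to be the langchain Document used throughout summ):

import pinecone
from langchain.docstore.document import Document
from summ.embed.embedder import Embedder

pinecone.init(api_key="YOUR_API_KEY", environment="YOUR_ENV")

embedder = Embedder(index="interviews")
if not embedder.has_index():
    embedder.create_index()

# A fully-populated Document carries "facts", "summary", and "classes" metadata.
doc = Document(
    page_content="Interview transcript.",
    metadata={
        "facts": ["The user prefers dark mode."],
        "summary": "An interview about UI preferences.",
        "classes": {"role": ["designer"]},
    },
)

embeddings = embedder.persist(doc)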

Source code in summ/embed/embedder.py
class Embedder:
    """Embedders are responsible for taking fully-populated Documents and embedding them,
    optionally persisting them to a vector store in the process.

    Currently, only Pinecone is supported.
    """

    GPT3_DIMS = 1536  # dimensionality of OpenAI embedding vectors (e.g. text-embedding-ada-002)

    QUERIES = 1
    """The number of extra queries to generate per fact."""

    QUERY_TEMPLATE = PromptTemplate(
        input_variables=["fact", "context"],
        template=dedent(
            """
            A user was interviewed, and stated a fact. Given this fact and the context of the interview, create a question that this fact is the answer to. The question should be specific to this fact.

            Fact: {fact}
            Context: {context}
            Question:
            """
        ),
    )

    def create_index(self):
        """Creates the named index in Pinecone."""

        pinecone.create_index(
            self.index_name,
            dimension=self.dims,
            metadata_config={"indexed": ["classes"]},
        )

    def has_index(self):
        """Checks if the named index in Pinecone exists."""

        try:
            pinecone.describe_index(self.index_name)
            return True
        except pinecone.exceptions.NotFoundException:
            return False

    def __init__(self, index: str, dims: int = GPT3_DIMS):
        """Creates a new Embedder.

        Args:
            index: The name of the vector db index to use.
            dims: The number of dimensions of the vector db index.
        """
        super().__init__()
        self.index_name = index
        self.dims = dims
        self.embeddings = OpenAIEmbeddings()
        self.index = pinecone.Index(index)

    def _embed(self, query: str, fact: str, doc: Document) -> Embedding:
        embedding = Embedding.passthrough(query=query)

        if not embedding.embedding:
            embedding.fact = fact
            embedding.document = CacheDocument.from_doc(doc)
            embedding.embedding = self.embeddings.embed_documents([query])[0]
            embedding.save()

        return embedding

    @cached_property
    def query_chain(self):
        return LLMChain(
            llm=OpenAI(temperature=0.7, cache=False), prompt=self.QUERY_TEMPLATE
        )

    @retry(exceptions=RateLimitError, tries=5, delay=6, jitter=(0, 4))
    def _generate_query(self, fact: str, doc: Document) -> str:
        return self.query_chain.run(fact=fact, context=doc.metadata["summary"])

    def embed(
        self, doc: Document, gen_queries: bool = False
    ) -> Generator[Embedding, None, None]:
        """Yields a set of embeddings for a given document."""

        for fact in doc.metadata["facts"]:
            yield self._embed(query=fact, fact=fact, doc=doc)
            if gen_queries:
                for _ in range(self.QUERIES):
                    yield self._embed(
                        query=self._generate_query(fact, doc),
                        fact=fact,
                        doc=doc,
                    )

    def persist(self, doc: Document) -> list[Embedding]:
        """Collects the set of embeddings for a Document,
        and persists them to the vector store."""

        embeddings = list(self.embed(doc, gen_queries=True))
        vectors = [
            (
                e.pk,
                e.embedding,
                {
                    "classes": list(
                        itertools.chain.from_iterable(
                            e.document.metadata["classes"].values()
                        )
                    ),
                },
            )
            for e in embeddings
        ]
        if vectors:
            self.index.upsert(vectors)
        return embeddings

QUERIES = 1 class-attribute

The number of extra queries to generate per fact.
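
Since embed reads QUERIES at runtime, one way to generate more synthetic queries per fact is to override it in a subclass (a sketch, not an official API):

class ChattyEmbedder(Embedder):
    QUERIES = 3  # three extra generated queries per fact instead of one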

create_index()

Creates the named index in Pinecone.

Source code in summ/embed/embedder.py
def create_index(self):
    """Creates the named index in Pinecone."""

    pinecone.create_index(
        self.index_name,
        dimension=self.dims,
        metadata_config={"indexed": ["classes"]},
    )

has_index()

Checks if the named index in Pinecone exists.

Source code in summ/embed/embedder.py
def has_index(self):
    """Checks if the named index in Pinecone exists."""

    try:
        pinecone.describe_index(self.index_name)
        return True
    except pinecone.exceptions.NotFoundException:
        return False
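
Together with create_index, this enables an idempotent setup pattern (index name hypothetical):

embedder = Embedder(index="interviews")
if not embedder.has_index():
    embedder.create_index()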

__init__(index: str, dims: int = GPT3_DIMS)

Creates a new Embedder.

PARAMETERS

index (str): The name of the vector db index to use.

dims (int, default GPT3_DIMS): The number of dimensions of the vector db index.

Source code in summ/embed/embedder.py
def __init__(self, index: str, dims: int = GPT3_DIMS):
    """Creates a new Embedder.

    Args:
        index: The name of the vector db index to use.
        dims: The number of dimensions of the vector db index.
    """
    super().__init__()
    self.index_name = index
    self.dims = dims
    self.embeddings = OpenAIEmbeddings()
    self.index = pinecone.Index(index)
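
For example (assuming pinecone.init has already been called with valid credentials; names are hypothetical):

embedder = Embedder(index="interviews")  # default 1536-dim index for OpenAI embeddings
small = Embedder(index="interviews-small", dims=768)  # index with non-default dimensionality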

embed(doc: Document, gen_queries: bool = False) -> Generator[Embedding, None, None]

Yields a set of embeddings for a given document.

Source code in summ/embed/embedder.py
def embed(
    self, doc: Document, gen_queries: bool = False
) -> Generator[Embedding, None, None]:
    """Yields a set of embeddings for a given document."""

    for fact in doc.metadata["facts"]:
        yield self._embed(query=fact, fact=fact, doc=doc)
        if gen_queries:
            for _ in range(self.QUERIES):
                yield self._embed(
                    query=self._generate_query(fact, doc),
                    fact=fact,
                    doc=doc,
                )
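
A sketch of consuming the generator (doc is a populated Document as above; gen_queries=True additionally calls OpenAI to synthesize one question per fact):

for emb in embedder.embed(doc, gen_queries=True):
    print(emb.pk, repr(emb.query), "->", repr(emb.fact))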

persist(doc: Document) -> list[Embedding]

Collects the set of embeddings for a Document, and persists them to the vector store.

Source code in summ/embed/embedder.py
def persist(self, doc: Document) -> list[Embedding]:
    """Collects the set of embeddings for a Document,
    and persists them to the vector store."""

    embeddings = list(self.embed(doc, gen_queries=True))
    vectors = [
        (
            e.pk,
            e.embedding,
            {
                "classes": list(
                    itertools.chain.from_iterable(
                        e.document.metadata["classes"].values()
                    )
                ),
            },
        )
        for e in embeddings
    ]
    if vectors:
        self.index.upsert(vectors)
    return embeddings
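
A final sketch tying it together (doc as above; assumes the index already exists):

embeddings = embedder.persist(doc)
print(f"Upserted {len(embeddings)} vectors to index {embedder.index_name!r}")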