class MyEmbeddingFunction_v3
Maturity: 22
A ChromaDB-style embedding function class named MyEmbeddingFunction that sanitizes, truncates, and, when necessary, summarizes documents before embedding them with the OpenAI embeddings API
File:
/tf/active/vicechatdev/offline_docstore_multi.py
Lines:
127 - 187
Complexity:
moderate
Purpose
Implements a callable embedding function for a vector store. Each input document is sanitized to valid UTF-8, hard-truncated if it exceeds roughly 1,000,000 characters, trimmed until it fits within about 110,000 tokens, and summarized with the configured chat model when it still exceeds the 8,192-token embedding limit. The prepared text is then embedded with the OpenAI embeddings API using the configured embedding model.
Source Code
class MyEmbeddingFunction(EmbeddingFunction):
    def __init__(self, model_name: str, embed_model_name: str, api_key: str):
        self.model_name = model_name
        self.api_key = api_key
        self.llm = ChatOpenAI(model_name=model_name, temperature=0, api_key=api_key)
        self.embed_model_name = embed_model_name

    def summarize_text(self, text, max_tokens_summary=8192):
        """
        Summarize the input text using the configured chat model (e.g. GPT-4o-mini).
        The summary will be limited to under max_tokens_summary tokens.
        """
        # Prepare the summarization prompt
        text = self.sanitize_text(text)
        prompt = (
            f"Please summarize the following text such that the summary is under {max_tokens_summary} tokens:\n\n{text}"
        )
        # Call the chat model and return the stripped summary text
        response = self.llm.invoke(prompt)
        summary = response.content.strip()
        return summary

    def sanitize_text(self, text):
        """
        Sanitize text by encoding to UTF-8 with error replacement and decoding back.
        This replaces any characters that might cause ASCII encoding errors.
        """
        return text.encode("utf-8", errors="replace").decode("utf-8")

    def count_tokens(self, text):
        # Count tokens with the cl100k_base encoding used by OpenAI embedding models
        encoding = tiktoken.get_encoding("cl100k_base")
        return len(encoding.encode(text))

    def __call__(self, input: Documents) -> Embeddings:
        # Embed the documents: expects a list of str and returns a list of embeddings
        embeddings = []
        for content in input:
            # Hard cap on raw length, then trim until the text fits the context budget
            if len(content) > 1000000:
                content = content[:1000000]
                logger.warning("Shrinking content due to token limit")
            while self.count_tokens(content) > 110000:
                content = content[:-1000]
            # Summarize if the text still exceeds the embedding model's token limit
            if self.count_tokens(content) > 8192:
                logger.warning("Summarizing text due to token limit")
                content = self.summarize_text(content)
            # Create the embedding via the OpenAI embeddings API
            response = openai.embeddings.create(
                model=self.embed_model_name,
                input=content,
            )
            embedding = response.data[0].embedding
            embeddings.append(embedding)
        return embeddings
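A minimal sketch of direct use, assuming an OpenAI API key is available in the environment (the module-level openai client used by __call__ reads OPENAI_API_KEY by default; the model names below are illustrative, not taken from the source):

import os

embed_fn = MyEmbeddingFunction(
    model_name="gpt-4o-mini",                    # illustrative chat model for summarization
    embed_model_name="text-embedding-3-small",   # illustrative embedding model
    api_key=os.environ["OPENAI_API_KEY"],
)

# __call__ accepts a list of document strings and returns one vector per document.
vectors = embed_fn(["First document to embed.", "Second document to embed."])
print(len(vectors), len(vectors[0]))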
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| bases | EmbeddingFunction | - | - |
Parameter Details
bases: EmbeddingFunction, the base class this component extends; instances are therefore callable embedding functions.
Return Value
Instances are callable; __call__ returns Embeddings, a list of embedding vectors (one list of floats per input document).
Class Interface
Methods
__init__(self, model_name, embed_model_name, api_key)
Purpose: Store the model configuration and API key and build the ChatOpenAI client used for summarization.
Parameters:
model_name: str; embed_model_name: str; api_key: str
Returns: None
summarize_text(self, text, max_tokens_summary)
Purpose: Summarize the input text with the configured chat model (e.g. GPT-4o-mini) so the summary stays under max_tokens_summary tokens.
Parameters:
text: the text to summarize; max_tokens_summary: token budget for the summary (default 8192)
Returns: str (the summary text)
sanitize_text(self, text)
Purpose: Sanitize text by encoding to UTF-8 with error replacement and decoding back, replacing any characters that might cause encoding errors.
Parameters:
text: the text to sanitize
Returns: str (the sanitized text)
count_tokens(self, text)
Purpose: Count the tokens in a text using tiktoken's cl100k_base encoding.
Parameters:
text: the text to count tokens for
Returns: int (the token count)
__call__(self, input) -> Embeddings
Purpose: Embed a list of documents, truncating and summarizing each one as needed before calling the OpenAI embeddings API.
Parameters:
input: Type: Documents (a list of document strings)
Returns: Embeddings (one embedding vector per input document)
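To make the token thresholds used by __call__ concrete, the following standalone sketch performs the same cl100k_base counting as count_tokens and applies the trimming and summarization thresholds from the source:

import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")

text = "An example document that may need truncation or summarization."
n_tokens = len(encoding.encode(text))

# Mirrors the checks in __call__: trim while above 110,000 tokens,
# summarize when still above 8,192 tokens before embedding.
needs_trimming = n_tokens > 110000
needs_summary = n_tokens > 8192
print(n_tokens, needs_trimming, needs_summary)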
Required Imports
import logging
import openai
import tiktoken
from chromadb import Documents, EmbeddingFunction, Embeddings
from langchain_openai import ChatOpenAI

logger = logging.getLogger(__name__)
# Note: the exact source modules for EmbeddingFunction/Documents/Embeddings, ChatOpenAI,
# and the logger are assumed; the class body only shows that these names must be in scope.
Usage Example
# Example usage (model names are illustrative):
# embed_fn = MyEmbeddingFunction(model_name="gpt-4o-mini",
#                                embed_model_name="text-embedding-3-small",
#                                api_key=api_key)
# embeddings = embed_fn(["document one", "document two"])
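Because the class appears to implement ChromaDB's EmbeddingFunction interface (Documents and Embeddings are ChromaDB type aliases), it can be attached to a collection so embeddings are computed automatically on add and query. A minimal sketch, assuming ChromaDB is the target store; the collection and model names are illustrative:

import os
import chromadb

embed_fn = MyEmbeddingFunction(
    model_name="gpt-4o-mini",
    embed_model_name="text-embedding-3-small",
    api_key=os.environ["OPENAI_API_KEY"],
)

client = chromadb.Client()
collection = client.create_collection(name="offline_docs", embedding_function=embed_fn)

# The collection calls embed_fn on stored documents and on query texts.
collection.add(ids=["doc-1"], documents=["Some long document text ..."])
results = collection.query(query_texts=["search phrase"], n_results=1)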
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
- class MyEmbeddingFunction_v2 (96.5% similar)
- class MyEmbeddingFunction_v1 (95.0% similar)
- class MyEmbeddingFunction (59.9% similar)
- class SPMachineLearningHub (48.5% similar)
- class SharingLinkInfo (46.8% similar)