class MyEmbeddingFunction_v1
A class named MyEmbeddingFunction
/tf/active/vicechatdev/OneCo_hybrid_RAG copy.py
589 - 683
moderate
Purpose
Chroma-compatible embedding function: documents that exceed the embedding model's token limit are first summarized with an LLM, then embedded via the OpenAI embeddings API.
Source Code
class MyEmbeddingFunction(EmbeddingFunction):
    """
    Chroma-compatible embedding function.

    Documents that exceed the embedding model's token limit are first
    summarized with an LLM; the (possibly summarized) text is then embedded
    via the OpenAI embeddings API.
    """

    # Limits applied in __call__ before a document is embedded.
    MAX_CHARS = 1000000            # hard character cap on a single document
    MAX_CONTEXT_TOKENS = 110000    # token cap before summarization is attempted
    EMBED_TOKEN_LIMIT = 8192       # embedding-model limit; summarize above this

    def __init__(self, model_name: str, embed_model_name: str, api_key: str):
        """
        Initialize the embedding function with specific models and API key.

        Args:
            model_name: Model name for the LLM summarizer
            embed_model_name: Model name for embeddings
            api_key: OpenAI API key
        """
        self.model_name = model_name
        self.api_key = api_key
        self.embed_model_name = embed_model_name

        # Set up the OpenAI client directly (used for the embeddings endpoint).
        import openai
        self.client = openai.OpenAI(api_key=api_key)

        # Set up the LLM for summarization.
        self.llm = ChatOpenAI(
            model_name=model_name,
            temperature=0,
            api_key=api_key,
        )

        # Build the tokenizer once; the original rebuilt it on every
        # count_tokens call, which is expensive inside the trimming loop.
        self._encoding = tiktoken.get_encoding("cl100k_base")

        # Ensure we're using standard OpenAI (not Azure).
        # This is critical for avoiding the ambiguous API error.
        import os
        os.environ["OPENAI_API_TYPE"] = "openai"
        os.environ["OPENAI_API_KEY"] = api_key

    def summarize_text(self, text: str, max_tokens_summary: int = 8192) -> str:
        """
        Summarize *text* with the configured LLM (self.model_name).

        Args:
            text: Text to summarize; sanitized to valid UTF-8 first.
            max_tokens_summary: Requested upper bound (in tokens) for the
                summary. NOTE: only requested in the prompt, not enforced.

        Returns:
            The stripped summary text.
        """
        text = self.sanitize_text(text)
        prompt = (
            f"Please summarize the following text such that the summary is under {max_tokens_summary} tokens:\n\n{text}"
        )
        response = self.llm.invoke(prompt)
        return response.content.strip()

    def sanitize_text(self, text: str) -> str:
        """
        Sanitize text by encoding to UTF-8 with error replacement and decoding
        back, replacing any characters that might cause encoding errors.

        Returns:
            The sanitized text.
        """
        return text.encode("utf-8", errors="replace").decode("utf-8")

    def count_tokens(self, text: str) -> int:
        """Return the number of cl100k_base tokens in *text*."""
        return len(self._encoding.encode(text))

    def __call__(self, input: Documents) -> Embeddings:
        """
        Generate embeddings for the input documents.

        Args:
            input: List of document strings to embed

        Returns:
            List of embeddings, one vector per document.
        """
        embeddings = []
        for content in input:
            # Cheap character-level cap on pathological inputs.
            if len(content) > self.MAX_CHARS:
                content = content[:self.MAX_CHARS]
            # Tokenize once and truncate by tokens. The original chopped
            # 1000 characters at a time and re-tokenized the whole document
            # each iteration (quadratic on very long inputs).
            tokens = self._encoding.encode(content)
            if len(tokens) > self.MAX_CONTEXT_TOKENS:
                tokens = tokens[:self.MAX_CONTEXT_TOKENS]
                content = self._encoding.decode(tokens)
            # Anything still over the embedding model's limit gets summarized.
            if len(tokens) > self.EMBED_TOKEN_LIMIT:
                content = self.summarize_text(content)
            # Use the direct client instead of the module-level API.
            response = self.client.embeddings.create(
                model=self.embed_model_name,
                input=content,
            )
            embeddings.append(response.data[0].embedding)
        return embeddings
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
bases |
EmbeddingFunction | - |
Parameter Details
bases: EmbeddingFunction is the base class the component inherits from, not a constructor parameter; the constructor takes model_name, embed_model_name, and api_key.
Return Value
Returns unspecified type
Class Interface
Methods
__init__(self, model_name, embed_model_name, api_key)
Purpose: Initialize the embedding function with specific models and API key. Args: model_name: Model name for the LLM summarizer embed_model_name: Model name for embeddings api_key: OpenAI API key
Parameters:
model_name: Type: strembed_model_name: Type: strapi_key: Type: str
Returns: None
summarize_text(self, text, max_tokens_summary)
Purpose: Summarize the input text using the configured LLM summarizer. The summary is requested (via the prompt) to stay under max_tokens_summary tokens.
Parameters:
text: Text to summarize; max_tokens_summary: Requested token limit for the summary (default 8192)
Returns: str — the stripped summary text
sanitize_text(self, text)
Purpose: Sanitize text by encoding to UTF-8 with error replacement and decoding back. This replaces any characters that might cause encoding errors.
Parameters:
text: Text to sanitize
Returns: str — the sanitized text
count_tokens(self, text)
Purpose: Counts the tokens in the text using the cl100k_base tiktoken encoding.
Parameters:
text: Text to tokenize
Returns: int — the number of tokens
__call__(self, input) -> Embeddings
Purpose: Generate embeddings for the input documents. Args: input: List of document strings to embed Returns: List of embeddings for each document
Parameters:
input: Type: Documents
Returns: Returns Embeddings
Required Imports
import os
import openai
import tiktoken
# Also required (sources presumed from typical usage — verify against the full file):
# Documents, EmbeddingFunction, Embeddings from chromadb; ChatOpenAI from langchain_openai
Usage Example
# Example usage:
# embed_fn = MyEmbeddingFunction(model_name="gpt-4o-mini", embed_model_name="text-embedding-3-small", api_key="sk-...")
# vectors = embed_fn(["document text to embed"])
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
-
class MyEmbeddingFunction_v2 96.8% similar
-
class MyEmbeddingFunction_v3 95.0% similar
-
class MyEmbeddingFunction 60.1% similar
-
class SPMachineLearningHub 48.1% similar
-
class EnhancedMeetingMinutesGenerator 48.1% similar