Closed whysirier closed 1 week ago
@whysirier unless you explicitly tell Chroma to persist, the next iteration should not have any of the previous documents. Can you share an MRE?
How can I solve this problem? My detailed code is:

from pprint import pprint
import os
from copy import deepcopy
import argparse
from PIL import Image
import cv2
import numpy as np
import re
from paddleocr import PaddleOCR
import paddle
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from langchain.document_loaders import UnstructuredFileLoader
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Chroma
import io
from PyPDF2 import PdfReader, PdfWriter
import base64
from io import BytesIO
from typing import Any, List, Optional
from langchain.llms.base import LLM
from transformers import GenerationConfig
from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
import fitz
import torch
from accelerate import init_empty_weights, load_checkpoint_and_dispatch, infer_auto_device_map
from openai import OpenAI
import configparser
from flask import Flask, request, jsonify
from flask_cors import CORS
import json
import pytz
from datetime import datetime
import pymysql
import jwt

app = Flask(__name__)
CORS(app, resources={r'/*': {'origins': '*'}}, supports_credentials=True)
class TeleChat(LLM):
    tokenizer: AutoTokenizer = None
    model: AutoModelForCausalLM = None
    generate_config: GenerationConfig = None

    def __init__(self, model_path: str):
        super().__init__()
        model_dir = '/mnt/data/spdi-code/SpdiChat/dataroot/models/TeleChat-12B'
        self.tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(model_dir, trust_remote_code=True, device_map="auto", torch_dtype=torch.float16)
        # self.generation_config = GenerationConfig.from_pretrained(model_dir, trust_remote_code=True)
        print("-------------Model loading complete--------------------")

    def _call(self, question: str, stop: Optional[List[str]] = None,
              run_manager: Optional[CallbackManagerForLLMRun] = None,
              **kwargs: Any):
        generation_config = GenerationConfig.from_pretrained("/mnt/data/spdi-code/SpdiChat/dataroot/models/TeleChat-12B", trust_remote_code=True)
        answer, history = self.model.chat(tokenizer=self.tokenizer, question=question, history=[], generation_config=generation_config, stream=False)
        return answer

    @property
    def _llm_type(self) -> str:
        return "TeleChat"
class QwenLM(LLM):
    tokenizer: AutoTokenizer = None
    model: AutoModelForCausalLM = None

    def __init__(self, model_path: str):
        # model_path: path to the Qwen model
        # Initialize the model from the local path
        super().__init__()
        print("Loading the model from the local path...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        print("self.tokenizer", type(self.tokenizer))
        # num_gpus = torch.cuda.device_count()
        # device_map = infer_auto_device_map(model=self.model, max_memory={i: "24GiB" for i in range(num_gpus)})
        self.model = AutoModelForCausalLM.from_pretrained(model_path, device_map='auto', trust_remote_code=True).eval()
        # Specify hyperparameters for generation (generation length, top_p, and other hyperparameters can be adjusted here)
        self.model.generation_config = GenerationConfig.from_pretrained(model_path, trust_remote_code=True, top_k=3,
                                                                        temperature=0.8)
        print("Finished loading the local model")

    def _call(self, prompt: str, stop: Optional[List[str]] = None,
              run_manager: Optional[CallbackManagerForLLMRun] = None,
              **kwargs: Any):
        # Override the call method
        response, history = self.model.chat(self.tokenizer, prompt, history=[])
        torch.cuda.empty_cache()
        return response

    @property
    def _llm_type(self) -> str:
        return "QwenLM"
model_path = "/mnt/data/spdi-code/SpdiChat/dataroot/models/Qwen/Qwen-14B-Chat-Int8"
llm = QwenLM(model_path=model_path)
bge_embeddings = HuggingFaceBgeEmbeddings(model_name="/mnt/data/spdi-code/paddlechat/bge-large-zh-v1.5")

def call_qwen(question, contents, model_id=None):  # model_id defaults to None; it is not used below
    print("----------------------------------------question---------------------------------", question)
    print("contents:", contents)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=150)
    split_docs = text_splitter.create_documents([contents])
    print("split docs:", split_docs)
    vectordb = Chroma.from_documents(documents=split_docs, embedding=bge_embeddings)
    # Chinese prompt template: "Use the following context to answer the final question.
    # Do not answer with words like '是的' (yes) or '不好意思' (sorry), and do not try to make up an answer.
    # Keep the answer concise."
    template = """使用以下上下文来回答最后的问题。不要回答"是的"、"不好意思"这些词,不要试图编造答案。尽量使答案简明扼要。
{context}
问题: {question}
有用的回答:"""
    # Build the prompt object
    QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context", "question"], template=template)
    qa_chain = RetrievalQA.from_chain_type(llm,
                                           retriever=vectordb.as_retriever(),
                                           chain_type='stuff',
                                           return_source_documents=True,
                                           chain_type_kwargs={"prompt": QA_CHAIN_PROMPT})
    print("qa_chain", qa_chain)
    answer = ''
    for que in question:
        result = qa_chain({"query": que})
        torch.cuda.empty_cache()
        print("Vector store answer: {}".format(result["result"]))
        print("Documents retrieved from the vector store:", result["source_documents"])
        answer += result["result"] + '\n'
    return answer
if __name__ == "__main__":
    docs1 = ['设计预算总表的总价值的除税价是452900.05元。方案建议书的总金额为:452900.05元(除税价)。;设计预算表的总金额为:452900.05 元(除税价)']
    docs2 = ['含税价(美元)1mIIVVVIVIIVIIIIXxXIXIIXIII1工程费452900. 05452900. 0553931.16506831. 212TXL-4甲A服服务商购销材料费6969. 676969. 67897. 417867. 082TXL-5甲工程建设其他费43498.0143498. 013193. 5246691. 5331、2项之和合计452900. 0543498. 01496398. 0657124. 68553522.744总计452900. 0543498. 01496398. 0657124.68553522.74设计负责人']
    answer1 = call_qwen('设计预算总表的总价值是多少?', docs1[0])
    answer2 = call_qwen('含税价是多少?', docs2[0])
As you can see, for answer1 the matching fragments only include docs1[0]; but for answer2 the matching fragments include both docs1[0] and docs2[0].
The next iteration should not have any of the previous documents, but in my code the next iteration always contains the previous documents. Thanks.
Hi, thanks for reporting this.
This may be a bug depending on the intended behavior for the in-memory version. Here is a minimal reproducible example / test:
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_core.embeddings import FakeEmbeddings
embeddings = FakeEmbeddings(size=5)
docs = [Document("document")]
vectorstore = Chroma.from_documents(docs, embeddings)
assert(vectorstore._collection.count() == 1)
vectorstore = Chroma.from_documents(docs, embeddings)
assert(vectorstore._collection.count() == 1)
(vectorstore._collection.count() is 2 after the second from_documents call, so the second assertion fails.)
When you instantiate Chroma, the collection remains in memory and is accessible until the kernel is restarted. Because the collection is created with a default name, adding documents in the same Python session keeps appending to that same collection, so it grows across iterations.
A simple workaround for now is to change the collection name (e.g., with random identifiers):
import uuid
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_core.embeddings import FakeEmbeddings
embeddings = FakeEmbeddings(size=5)
docs = [Document("document")]
vectorstore = Chroma.from_documents(
docs, embeddings, collection_name=str(uuid.uuid4())
)
assert(vectorstore._collection.count() == 1)
vectorstore = Chroma.from_documents(
docs, embeddings, collection_name=str(uuid.uuid4())
)
assert(vectorstore._collection.count() == 1)
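If you want to keep the default collection name, another option (not from this thread, just a sketch assuming the in-memory client is shared across calls, as described above) is to drop the collection explicitly before rebuilding it:
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_core.embeddings import FakeEmbeddings
embeddings = FakeEmbeddings(size=5)
docs = [Document("document")]
vectorstore = Chroma.from_documents(docs, embeddings)
assert(vectorstore._collection.count() == 1)
# Drop the in-memory collection so the next from_documents call starts empty
vectorstore.delete_collection()
vectorstore = Chroma.from_documents(docs, embeddings)
assert(vectorstore._collection.count() == 1)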
Thanks very much, I will try it again.
Hi, @whysirier. I'm Dosu, and I'm helping the LangChain team manage their backlog. I'm marking this issue as stale.
Thank you for your understanding and contribution!
Example Code
from langchain.vectorstores import Chroma
for doc in docs:
    vectordb = Chroma.from_documents(documents=doc, embedding=bge_embeddings)
Each round I initialize the vectordb, so why does the next round still contain documents from the previous round? For example:
1) In the first round, I feed a document to Chroma and the output is 'Document(page_content='工程预算总表(表一)建设项目名称....)'.
2) In the second round, I feed another document to Chroma and the output is '[Document(page_content='设计预算总表的总价值的除税价为452900.05元。......'), Document(page_content='工程预算总表(表一)名称....]'.
In the second round I re-initialize the vectordb, so why does the content of the first document still appear?
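A sketch of how the collection-name workaround from earlier in the thread could be applied to this loop (assuming docs and bge_embeddings are defined as in the code above):
import uuid
from langchain.vectorstores import Chroma
for doc in docs:
    # A fresh collection per round, so earlier documents are not retrieved again
    vectordb = Chroma.from_documents(documents=doc, embedding=bge_embeddings, collection_name=str(uuid.uuid4()))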
Error Message and Stack Trace (if applicable)
No response
Description
langchain 0.0.354
System Info
Ubuntu 22, PyTorch 2.3, Python 3.8