langchain-ai / langchain

🦜🔗 Build context-aware reasoning applications
https://python.langchain.com
MIT License
92.05k stars 14.65k forks source link

RecursiveCharacterTextSplitter.from_language(language=Language.C) ValueError: Language Language.C is not supported! :bug: #22430

Closed W-Wuxian closed 3 months ago

W-Wuxian commented 3 months ago

Checked other resources

Example Code

The following code:

from pathlib import Path
import getopt, sys, os, shutil

from langchain_community.document_loaders import (
    DirectoryLoader, TextLoader
)

from langchain_text_splitters import (
    Language,
    RecursiveCharacterTextSplitter
)

def routerloader(obj, buf, keys):
    """Load C-family sources found at *obj* into ``buf["c"]``.

    A single file is loaded with ``TextLoader`` when its name ends with
    ``.c``, ``.h`` or ``.cu``. For a directory, a ``DirectoryLoader`` is run
    with the glob ``**/*.c`` and then ``**/*.h``, each only when at least
    one direct entry of the directory carries that suffix. Every successful
    load registers ``"c"`` in *keys* through ``keychecker``.

    Both *buf* and *keys* are mutated in place and returned, so the caller
    can accumulate over several paths.
    """
    if os.path.isfile(obj):
        file_name = os.path.basename(obj)
        if file_name.endswith((".c", ".h", ".cu")):
            file_loader = TextLoader(obj, autodetect_encoding=True)
            buf["c"].extend(file_loader.load())
            keychecker("c", keys)
    elif os.path.isdir(obj):
        # C sources and headers are both read as plain text and end up
        # under the same "c" key.
        for suffix, pattern in ((".c", "**/*.c"), (".h", "**/*.h")):
            if not any(entry.endswith(suffix) for entry in os.listdir(obj)):
                continue
            dir_loader = DirectoryLoader(
                obj,
                glob=pattern,
                loader_cls=TextLoader,
                loader_kwargs={'autodetect_encoding': True},
                show_progress=True,
                use_multithreading=True,
            )
            buf["c"].extend(dir_loader.load())
            keychecker("c", keys)
    return buf, keys  # accumulator

def specificsplitter(keys, **kwargs):
    """Build the splitter associated with each entry of *keys*.

    Args:
        keys: iterable of hashable file-type keys (e.g. ``"c"``).
        **kwargs: accepted so callers can keep passing ``embedding=...``;
            no keyword argument is used by this function.

    Returns:
        A dict with one entry per key. Keys ``"c"``, ``"h"``, ``"cuh"`` and
        ``"cu"`` map to a ``RecursiveCharacterTextSplitter`` created for
        ``Language.C`` (chunk size 200, no overlap); every other key keeps
        the placeholder value ``[]``.
    """
    c_family = ("c", "h", "cuh", "cu")
    splitter_fun = {key: [] for key in keys}
    for key in keys:
        if key in c_family:
            # NOTE: with langchain-text-splitters 0.2.0 this call raises
            # "ValueError: Language Language.C is not supported!" even
            # though Language.C is a member of the enum.
            splitter_fun[key] = RecursiveCharacterTextSplitter.from_language(
                language=Language.C, chunk_size=200, chunk_overlap=0
            )
    return splitter_fun

def keychecker(key, keys):
    """Append *key* to the list *keys* in place unless it is already there."""
    if key in keys:
        return
    keys.append(key)

def loaddata(data_path, **kwargs):
    """Load every path in *data_path* and split the loaded documents per key.

    Args:
        data_path: iterable of file or directory paths; each one is handed
            to ``routerloader``.
        **kwargs: ``embedding`` (default ``None``) is forwarded to
            ``specificsplitter``.

    Returns:
        ``(buf, keys)`` where ``buf`` maps each default key to its list of
        documents (replaced by the split result for every key in ``keys``)
        and ``keys`` lists the keys registered while loading.

    Progress and intermediate results are printed to stdout.
    """
    default_keys = ["txt", "pdf", "f90", "c", "cpp", "py", "png", "xlsx", "odt", "csv", "pptx", "md", "org"]
    buf = {key: [] for key in default_keys}
    keys = []
    embedding = kwargs.get("embedding", None)

    # Phase 1: accumulate documents and the keys that were actually seen.
    for data in data_path:
        print(data)
        buf, keys = routerloader(data, buf, keys)
    print(keys)
    print(buf)

    # Phase 2: one splitter per seen key, applied to that key's documents.
    splitter_fun = specificsplitter(keys, embedding=embedding)
    print(splitter_fun)
    for key in keys:
        print("*" * 20)
        print(key)
        buf[key] = splitter_fun[key].split_documents(buf[key])
        print(buf[key])
        print(len(buf[key]))
    return buf, keys

# Collect the input paths given on the command line (-i / --inputdocs_path).
IDOC_PATH = []
argumentlist = sys.argv[1:]
options = "hi:"
long_options = ["help", "inputdocs_path="]
arguments, values = getopt.getopt(argumentlist, options, long_options)
for currentArgument, currentValue in arguments:
    if currentArgument in ("-h", "--help"):
        print("python main.py -i path/docs")
    elif currentArgument in ("-i", "--inputdocs_path"):
        # One option value may hold several space-separated paths; keep only
        # existing files and non-empty directories.
        for i in currentValue.split(" "):
            if not i:
                continue
            if os.path.isfile(i) or (os.path.isdir(i) and os.listdir(i)):
                IDOC_PATH.append(Path(i))

splitted_data, keys = loaddata(IDOC_PATH)

Error Message and Stack Trace (if applicable)

python ISSUE_TXT_SPLITTER.py -i "/home/vlederer/Bureau/ISSUE_TXT/DOCS/hello_world.c"
/home/vlederer/Bureau/ISSUE_TXT/DOCS/hello_world.c
['c']
{'txt': [], 'pdf': [], 'f90': [], 'c': [Document(page_content='#include <stdio.h>\n\nint main() {\n    puts("Hello, World!");\n    return 0;\n}', metadata={'source': '/home/vlederer/Bureau/ISSUE_TXT/DOCS/hello_world.c'})], 'cpp': [], 'py': [], 'png': [], 'xlsx': [], 'odt': [], 'csv': [], 'pptx': [], 'md': [], 'org': []}
Traceback (most recent call last):
  File "/home/vlederer/Bureau/ISSUE_TXT/ISSUE_TXT_SPLITTER.py", line 92, in <module>
    splitted_data, keys = loaddata(IDOC_PATH)
                          ^^^^^^^^^^^^^^^^^^^
  File "/home/vlederer/Bureau/ISSUE_TXT/ISSUE_TXT_SPLITTER.py", line 67, in loaddata
    splitter_fun = specificsplitter(keys, embedding=embedding)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/vlederer/Bureau/ISSUE_TXT/ISSUE_TXT_SPLITTER.py", line 47, in specificsplitter
    splitter_fun[key] = RecursiveCharacterTextSplitter.from_language(
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/Anaconda3/envs/langchain_rag_pytorchcuda121gpu_env/lib/python3.11/site-packages/langchain_text_splitters/character.py", line 116, in from_language
    separators = cls.get_separators_for_language(language)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/Anaconda3/envs/langchain_rag_pytorchcuda121gpu_env/lib/python3.11/site-packages/langchain_text_splitters/character.py", line 631, in get_separators_for_language
    raise ValueError(
ValueError: Language Language.C is not supported! Please choose from [<Language.CPP: 'cpp'>, <Language.GO: 'go'>, <Language.JAVA: 'java'>, <Language.KOTLIN: 'kotlin'>, <Language.JS: 'js'>, <Language.TS: 'ts'>, <Language.PHP: 'php'>, <Language.PROTO: 'proto'>, <Language.PYTHON: 'python'>, <Language.RST: 'rst'>, <Language.RUBY: 'ruby'>, <Language.RUST: 'rust'>, <Language.SCALA: 'scala'>, <Language.SWIFT: 'swift'>, <Language.MARKDOWN: 'markdown'>, <Language.LATEX: 'latex'>, <Language.HTML: 'html'>, <Language.SOL: 'sol'>, <Language.CSHARP: 'csharp'>, <Language.COBOL: 'cobol'>, <Language.C: 'c'>, <Language.LUA: 'lua'>, <Language.PERL: 'perl'>, <Language.HASKELL: 'haskell'>]

Description

I'm trying to split C code using langchain-text-splitters and RecursiveCharacterTextSplitter.from_language with language=Language.C or language='c'. I expect no error, since the C language is listed by the Language enum:

[print(e.value) for e in Language]

System Info

langchain==0.2.1
langchain-community==0.2.1
langchain-core==0.2.3
langchain-experimental==0.0.59
langchain-text-splitters==0.2.0
No LSB modules are available.
Distributor ID: Ubuntu
Description:    Linux Mint 21.3
Release:    22.04
Codename:   virginia
Python 3.11.9
System Information
------------------
> OS:  Linux
> OS Version:  #35~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Tue May  7 09:00:52 UTC 2
> Python Version:  3.11.9 (main, Apr 19 2024, 16:48:06) [GCC 11.2.0]

Package Information
-------------------
> langchain_core: 0.2.3
> langchain: 0.2.1
> langchain_community: 0.2.1
> langsmith: 0.1.65
> langchain_experimental: 0.0.59
> langchain_text_splitters: 0.2.0

Packages not installed (Not Necessarily a Problem)
--------------------------------------------------
The following packages were not found:

> langgraph
> langserve
hwchase17 commented 3 months ago

C is not supported in the RecursiveCharacterTextSplitter yet. This is a confusing error message. I opened a PR to make the error message clearer, but work is still needed to add support for C.