microsoft / graphrag

A modular graph-based Retrieval-Augmented Generation (RAG) system
https://microsoft.github.io/graphrag/
MIT License
19.05k stars 1.88k forks source link

WARNING Warning! Error loading file book.txt. Skipping... #644

Closed maple0325 closed 3 months ago

maple0325 commented 3 months ago

Is your feature request related to a problem? Please describe.

in my log

12:35:02,358 graphrag.config.read_dotenv INFO Loading pipeline .env file
12:35:02,363 graphrag.index.cli INFO using default configuration: {
    "llm": {
        "api_key": "REDACTED, length 6",
        "type": "openai_chat",
        "model": "gemma",
        "max_tokens": 4000,
        "temperature": 0.0,
        "top_p": 1.0,
        "n": 1,
        "request_timeout": 180.0,
        "api_base": "http://127.0.0.1:11434/v1",
        "api_version": null,
        "proxy": null,
        "cognitive_services_endpoint": null,
        "deployment_name": null,
        "model_supports_json": true,
        "tokens_per_minute": 0,
        "requests_per_minute": 0,
        "max_retries": 10,
        "max_retry_wait": 10.0,
        "sleep_on_rate_limit_recommendation": true,
        "concurrent_requests": 25
    },
    "parallelization": {
        "stagger": 0.3,
        "num_threads": 50
    },
    "async_mode": "threaded",
    "root_dir": "./ragtest",
    "reporting": {
        "type": "file",
        "base_dir": "output/${timestamp}/reports",
        "storage_account_blob_url": null
    },
    "storage": {
        "type": "file",
        "base_dir": "output/${timestamp}/artifacts",
        "storage_account_blob_url": null
    },
    "cache": {
        "type": "file",
        "base_dir": "cache",
        "storage_account_blob_url": null
    },
    "input": {
        "type": "file",
        "file_type": "text",
        "base_dir": "input",
        "storage_account_blob_url": null,
        "encoding": "utf-8",
        "file_pattern": ".*\\.txt$",
        "file_filter": null,
        "source_column": null,
        "timestamp_column": null,
        "timestamp_format": null,
        "text_column": "text",
        "title_column": null,
        "document_attribute_columns": []
    },
    "embed_graph": {
        "enabled": false,
        "num_walks": 10,
        "walk_length": 40,
        "window_size": 2,
        "iterations": 3,
        "random_seed": 597832,
        "strategy": null
    },
    "embeddings": {
        "llm": {
            "api_key": "REDACTED, length 9",
            "type": "openai_embedding",
            "model": "/nomic-ai/nomic-embed-text-v1.5-GGUF/nomic-embed-text-v1.5.Q4_K_M.gguf",
            "max_tokens": 4000,
            "temperature": 0,
            "top_p": 1,
            "n": 1,
            "request_timeout": 180.0,
            "api_base": "http://localhost:1234/v1",
            "api_version": null,
            "proxy": null,
            "cognitive_services_endpoint": null,
            "deployment_name": null,
            "model_supports_json": null,
            "tokens_per_minute": 0,
            "requests_per_minute": 0,
            "max_retries": 10,
            "max_retry_wait": 10.0,
            "sleep_on_rate_limit_recommendation": true,
            "concurrent_requests": 25
        },
        "parallelization": {
            "stagger": 0.3,
            "num_threads": 50
        },
        "async_mode": "threaded",
        "batch_size": 16,
        "batch_max_tokens": 8191,
        "target": "required",
        "skip": [],
        "vector_store": null,
        "strategy": null
    },
    "chunks": {
        "size": 1200,
        "overlap": 100,
        "group_by_columns": [
            "id"
        ],
        "strategy": null
    },
    "snapshots": {
        "graphml": false,
        "raw_entities": false,
        "top_level_nodes": false
    },
    "entity_extraction": {
        "llm": {
            "api_key": "REDACTED, length 6",
            "type": "openai_chat",
            "model": "gemma",
            "max_tokens": 4000,
            "temperature": 0.0,
            "top_p": 1.0,
            "n": 1,
            "request_timeout": 180.0,
            "api_base": "http://127.0.0.1:11434/v1",
            "api_version": null,
            "proxy": null,
            "cognitive_services_endpoint": null,
            "deployment_name": null,
            "model_supports_json": true,
            "tokens_per_minute": 0,
            "requests_per_minute": 0,
            "max_retries": 10,
            "max_retry_wait": 10.0,
            "sleep_on_rate_limit_recommendation": true,
            "concurrent_requests": 25
        },
        "parallelization": {
            "stagger": 0.3,
            "num_threads": 50
        },
        "async_mode": "threaded",
        "prompt": "prompts/entity_extraction.txt",
        "entity_types": [
            "organization",
            "person",
            "geo",
            "event"
        ],
        "max_gleanings": 1,
        "strategy": null
    },
    "summarize_descriptions": {
        "llm": {
            "api_key": "REDACTED, length 6",
            "type": "openai_chat",
            "model": "gemma",
            "max_tokens": 4000,
            "temperature": 0.0,
            "top_p": 1.0,
            "n": 1,
            "request_timeout": 180.0,
            "api_base": "http://127.0.0.1:11434/v1",
            "api_version": null,
            "proxy": null,
            "cognitive_services_endpoint": null,
            "deployment_name": null,
            "model_supports_json": true,
            "tokens_per_minute": 0,
            "requests_per_minute": 0,
            "max_retries": 10,
            "max_retry_wait": 10.0,
            "sleep_on_rate_limit_recommendation": true,
            "concurrent_requests": 25
        },
        "parallelization": {
            "stagger": 0.3,
            "num_threads": 50
        },
        "async_mode": "threaded",
        "prompt": "prompts/summarize_descriptions.txt",
        "max_length": 500,
        "strategy": null
    },
    "community_reports": {
        "llm": {
            "api_key": "REDACTED, length 6",
            "type": "openai_chat",
            "model": "gemma",
            "max_tokens": 4000,
            "temperature": 0.0,
            "top_p": 1.0,
            "n": 1,
            "request_timeout": 180.0,
            "api_base": "http://127.0.0.1:11434/v1",
            "api_version": null,
            "proxy": null,
            "cognitive_services_endpoint": null,
            "deployment_name": null,
            "model_supports_json": true,
            "tokens_per_minute": 0,
            "requests_per_minute": 0,
            "max_retries": 10,
            "max_retry_wait": 10.0,
            "sleep_on_rate_limit_recommendation": true,
            "concurrent_requests": 25
        },
        "parallelization": {
            "stagger": 0.3,
            "num_threads": 50
        },
        "async_mode": "threaded",
        "prompt": "prompts/community_report.txt",
        "max_length": 2000,
        "max_input_length": 8000,
        "strategy": null
    },
    "claim_extraction": {
        "llm": {
            "api_key": "REDACTED, length 6",
            "type": "openai_chat",
            "model": "gemma",
            "max_tokens": 4000,
            "temperature": 0.0,
            "top_p": 1.0,
            "n": 1,
            "request_timeout": 180.0,
            "api_base": "http://127.0.0.1:11434/v1",
            "api_version": null,
            "proxy": null,
            "cognitive_services_endpoint": null,
            "deployment_name": null,
            "model_supports_json": true,
            "tokens_per_minute": 0,
            "requests_per_minute": 0,
            "max_retries": 10,
            "max_retry_wait": 10.0,
            "sleep_on_rate_limit_recommendation": true,
            "concurrent_requests": 25
        },
        "parallelization": {
            "stagger": 0.3,
            "num_threads": 50
        },
        "async_mode": "threaded",
        "enabled": false,
        "prompt": "prompts/claim_extraction.txt",
        "description": "Any claims or facts that could be relevant to information discovery.",
        "max_gleanings": 1,
        "strategy": null
    },
    "cluster_graph": {
        "max_cluster_size": 10,
        "strategy": null
    },
    "umap": {
        "enabled": false
    },
    "local_search": {
        "text_unit_prop": 0.5,
        "community_prop": 0.1,
        "conversation_history_max_turns": 5,
        "top_k_entities": 10,
        "top_k_relationships": 10,
        "temperature": 0.0,
        "top_p": 1.0,
        "n": 1,
        "max_tokens": 12000,
        "llm_max_tokens": 2000
    },
    "global_search": {
        "temperature": 0.0,
        "top_p": 1.0,
        "n": 1,
        "max_tokens": 12000,
        "data_max_tokens": 12000,
        "map_max_tokens": 1000,
        "reduce_max_tokens": 2000,
        "concurrency": 32
    },
    "encoding_model": "cl100k_base",
    "skip_workflows": []
}
12:35:02,363 graphrag.index.create_pipeline_config INFO skipping workflows 
12:35:02,373 graphrag.index.run INFO Running pipeline
12:35:02,373 graphrag.index.storage.file_pipeline_storage INFO Creating file storage at ragtest\output\20240722-123502\artifacts
12:35:02,373 graphrag.index.input.load_input INFO loading input from root_dir=input
12:35:02,373 graphrag.index.input.load_input INFO using file storage for input
12:35:02,373 graphrag.index.storage.file_pipeline_storage INFO search ragtest\input for files matching .*\.txt$
12:35:02,373 graphrag.index.input.text INFO found text files from input, found [('book.txt', {})]
12:35:02,373 graphrag.index.input.text WARNING Warning! Error loading file book.txt. Skipping...
12:35:02,373 graphrag.index.input.text INFO Found 1 files, loading 0
12:35:02,388 graphrag.index.workflows.load INFO Workflow Run Order: ['create_base_text_units', 'create_base_extracted_entities', 'create_summarized_entities', 'create_base_entity_graph', 'create_final_entities', 'create_final_nodes', 'create_final_communities', 'join_text_units_to_entity_ids', 'create_final_relationships', 'join_text_units_to_relationship_ids', 'create_final_community_reports', 'create_final_text_units', 'create_base_documents', 'create_final_documents']
12:35:02,388 graphrag.index.run INFO Final # of rows loaded: 0
12:35:02,574 graphrag.index.run INFO Running workflow: create_base_text_units...
12:35:02,574 graphrag.index.run INFO dependencies for create_base_text_units: []
12:35:02,584 datashaper.workflow.workflow INFO executing verb orderby
12:35:02,584 datashaper.workflow.workflow ERROR Error executing verb "orderby" in create_base_text_units: 'id'
Traceback (most recent call last):
  File "C:\ProgramData\anaconda3\Lib\site-packages\datashaper\workflow\workflow.py", line 410, in _execute_verb
    result = node.verb.func(**verb_args)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\site-packages\datashaper\engine\verbs\orderby.py", line 32, in orderby
    output = input_table.sort_values(by=columns, ascending=ascending)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\frame.py", line 7189, in sort_values
    k = self._get_label_or_level_values(by[0], axis=axis)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\generic.py", line 1911, in _get_label_or_level_values
    raise KeyError(key)
KeyError: 'id'
12:35:02,594 graphrag.index.reporting.file_workflow_callbacks INFO Error executing verb "orderby" in create_base_text_units: 'id' details=None
12:35:02,594 graphrag.index.run ERROR error running workflow create_base_text_units
Traceback (most recent call last):
  File "C:\Users\DELL\Desktop\graphrag-main\graphrag\index\run.py", line 323, in run_pipeline
    result = await workflow.run(context, callbacks)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\site-packages\datashaper\workflow\workflow.py", line 369, in run
    timing = await self._execute_verb(node, context, callbacks)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\site-packages\datashaper\workflow\workflow.py", line 410, in _execute_verb
    result = node.verb.func(**verb_args)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\site-packages\datashaper\engine\verbs\orderby.py", line 32, in orderby
    output = input_table.sort_values(by=columns, ascending=ascending)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\frame.py", line 7189, in sort_values
    k = self._get_label_or_level_values(by[0], axis=axis)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\generic.py", line 1911, in _get_label_or_level_values
    raise KeyError(key)
KeyError: 'id'
12:35:02,599 graphrag.index.reporting.file_workflow_callbacks INFO Error running pipeline! details=None

there is a warning

WARNING Warning! Error loading file book.txt. Skipping...

I think this is a reason why

PS C:\Users\DELL\Desktop\graphrag-main> python -m graphrag.index --root ./ragtest
🚀 Reading settings from ragtest\settings.yaml
❌ create_base_text_units
None
⠸ GraphRAG Indexer
├── Loading Input (InputFileType.text) - 1 files loaded (0 filtered) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:00 0:00:00
└── create_base_text_units
❌ Errors occurred during the pipeline run, see logs for more details.
PS C:\Users\DELL\Desktop\graphrag-main>

Describe the solution you'd like

how to solve this warning

WARNING Warning! Error loading file book.txt. Skipping...

Additional context

No response

ChihchengHsieh commented 3 months ago

Any solution for this one, I'm having the same issue.

rushizirpe commented 3 months ago

It might be because of the encoding format of the book.txt. In that case, Changing the file Encoding property to utf-8 should resolve the issue

natoverse commented 3 months ago

PR https://github.com/microsoft/graphrag/pull/639 has been merged and included in GraphRAG 0.2.0, which enforces UTF-8 encoding. Please let us know if upgrading resolves your issue.

github-actions[bot] commented 3 months ago

This issue has been marked stale due to inactivity after repo maintainer or community member responses that request more information or suggest a solution. It will be closed after five additional days.

luoyai commented 3 months ago

Please, is there a solution to this problem?

natoverse commented 3 months ago

We have resolved several issues related to text encoding and JSON parsing that are rolled up into version 0.2.2. Please try again with that version and re-open if this is still an issue.

neo-lu15 commented 3 months ago

I'm using the 0.2.2 version of graphrag and i still got the same error

🚀 Reading settings from ragtest\settings.yaml ❌ create_base_text_units None ⠹ GraphRAG Indexer ├── Loading Input (InputFileType.text) - 1 files loaded (0 filtered) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:00 0:00:00 └── create_base_text_units ❌ Errors occurred during the pipeline run, see logs for more details.

14:50:16,78 graphrag.index.input.text INFO found text files from input, found [('book.txt', {})] 14:50:16,81 graphrag.index.input.text WARNING Warning! Error loading file book.txt. Skipping...

kkkkken33 commented 2 months ago

It might be because of the encoding format of the book.txt. In that case, Changing the file Encoding property to utf-8 should resolve the issue

Thanks! This is exactly my problem. Thanks a lot!