severian42 / GraphRAG-Local-UI

GraphRAG using Local LLMs - Features robust API and multiple apps for Indexing/Prompt Tuning/Query/Chat/Visualizing/Etc. This is meant to be the ultimate GraphRAG/KG local LLM app.
MIT License
1.51k stars 173 forks source link

indexing Error: create_base_entity_graph❌ Errors occurred during the pipeline run, see logs for more details. #25

Closed Ikaros-521 closed 2 months ago

Ikaros-521 commented 2 months ago
🚀 Reading settings from ragtest\settings.yaml

f:\GraphRAG-Ollama-UI\Miniconda3\lib\site-packages\numpy\core\fromnumeric.py:59: FutureWarning: 'DataFrame.swapaxes' is deprecated and will be removed in a future version. Please use 'DataFrame.transpose' instead.

  return bound(*args, **kwds)

🚀 create_base_text_units

                                 id  ... n_tokens

0  4ecc5524ce433cb392fcd1107123db7d  ...        6

[1 rows x 5 columns]

🚀 create_base_extracted_entities

                                        entity_graph

0  <graphml xmlns="http://graphml.graphdrawing.or...

🚀 create_summarized_entities

                                        entity_graph

0  <graphml xmlns="http://graphml.graphdrawing.or...

❌ create_base_entity_graph

None

⠋ GraphRAG Indexer 

├── Loading Input (text) - 1 files loaded (0 filtered) ----- 100% 0:00:… 0:00:…

├── create_base_text_units

├── create_base_extracted_entities

├── create_summarized_entities

└── create_base_entity_graph❌ Errors occurred during the pipeline run, see logs for more details.
Ikaros-521 commented 2 months ago
16:56:10,98 graphrag.config.read_dotenv INFO Loading pipeline .env file
16:56:10,103 graphrag.index.cli INFO using default configuration: {
    "llm": {
        "api_key": "REDACTED, length 3",
        "type": "openai_chat",
        "model": "qwen2:latest",
        "max_tokens": 4000,
        "temperature": 0.0,
        "top_p": 1.0,
        "request_timeout": 180.0,
        "api_base": "http://127.0.0.1:11434/v1",
        "api_version": null,
        "proxy": null,
        "cognitive_services_endpoint": null,
        "deployment_name": null,
        "model_supports_json": true,
        "tokens_per_minute": 0,
        "requests_per_minute": 0,
        "max_retries": 10,
        "max_retry_wait": 10.0,
        "sleep_on_rate_limit_recommendation": true,
        "concurrent_requests": 10
    },
    "parallelization": {
        "stagger": 0.3,
        "num_threads": 50
    },
    "async_mode": "threaded",
    "root_dir": "./ragtest",
    "reporting": {
        "type": "file",
        "base_dir": "output/${timestamp}/reports",
        "storage_account_blob_url": null
    },
    "storage": {
        "type": "file",
        "base_dir": "output/${timestamp}/artifacts",
        "storage_account_blob_url": null
    },
    "cache": {
        "type": "file",
        "base_dir": "cache",
        "storage_account_blob_url": null
    },
    "input": {
        "type": "file",
        "file_type": "text",
        "base_dir": "input",
        "storage_account_blob_url": null,
        "encoding": "utf-8",
        "file_pattern": ".*\\.txt$",
        "file_filter": null,
        "source_column": null,
        "timestamp_column": null,
        "timestamp_format": null,
        "text_column": "text",
        "title_column": null,
        "document_attribute_columns": []
    },
    "embed_graph": {
        "enabled": false,
        "num_walks": 10,
        "walk_length": 40,
        "window_size": 2,
        "iterations": 3,
        "random_seed": 597832,
        "strategy": null
    },
    "embeddings": {
        "llm": {
            "api_key": "REDACTED, length 3",
            "type": "openai_embedding",
            "model": "nomic-embed-text",
            "max_tokens": 4000,
            "temperature": 0,
            "top_p": 1,
            "request_timeout": 180.0,
            "api_base": "http://127.0.0.1:11434/v1",
            "api_version": null,
            "proxy": null,
            "cognitive_services_endpoint": null,
            "deployment_name": null,
            "model_supports_json": null,
            "tokens_per_minute": 0,
            "requests_per_minute": 0,
            "max_retries": 10,
            "max_retry_wait": 10.0,
            "sleep_on_rate_limit_recommendation": true,
            "concurrent_requests": 10
        },
        "parallelization": {
            "stagger": 0.3,
            "num_threads": 50
        },
        "async_mode": "threaded",
        "batch_size": 16,
        "batch_max_tokens": 8191,
        "target": "required",
        "skip": [],
        "vector_store": null,
        "strategy": null
    },
    "chunks": {
        "size": 512,
        "overlap": 64,
        "group_by_columns": [
            "id"
        ],
        "strategy": null
    },
    "snapshots": {
        "graphml": true,
        "raw_entities": true,
        "top_level_nodes": true
    },
    "entity_extraction": {
        "llm": {
            "api_key": "REDACTED, length 3",
            "type": "openai_chat",
            "model": "qwen2:latest",
            "max_tokens": 4000,
            "temperature": 0.0,
            "top_p": 1.0,
            "request_timeout": 180.0,
            "api_base": "http://127.0.0.1:11434/v1",
            "api_version": null,
            "proxy": null,
            "cognitive_services_endpoint": null,
            "deployment_name": null,
            "model_supports_json": true,
            "tokens_per_minute": 0,
            "requests_per_minute": 0,
            "max_retries": 10,
            "max_retry_wait": 10.0,
            "sleep_on_rate_limit_recommendation": true,
            "concurrent_requests": 10
        },
        "parallelization": {
            "stagger": 0.3,
            "num_threads": 50
        },
        "async_mode": "threaded",
        "prompt": "prompts/entity_extraction.txt",
        "entity_types": [
            "organization",
            "person",
            "geo",
            "event"
        ],
        "max_gleanings": 0,
        "strategy": null
    },
    "summarize_descriptions": {
        "llm": {
            "api_key": "REDACTED, length 3",
            "type": "openai_chat",
            "model": "qwen2:latest",
            "max_tokens": 4000,
            "temperature": 0.0,
            "top_p": 1.0,
            "request_timeout": 180.0,
            "api_base": "http://127.0.0.1:11434/v1",
            "api_version": null,
            "proxy": null,
            "cognitive_services_endpoint": null,
            "deployment_name": null,
            "model_supports_json": true,
            "tokens_per_minute": 0,
            "requests_per_minute": 0,
            "max_retries": 10,
            "max_retry_wait": 10.0,
            "sleep_on_rate_limit_recommendation": true,
            "concurrent_requests": 10
        },
        "parallelization": {
            "stagger": 0.3,
            "num_threads": 50
        },
        "async_mode": "threaded",
        "prompt": "prompts/summarize_descriptions.txt",
        "max_length": 500,
        "strategy": null
    },
    "community_reports": {
        "llm": {
            "api_key": "REDACTED, length 3",
            "type": "openai_chat",
            "model": "qwen2:latest",
            "max_tokens": 4000,
            "temperature": 0.0,
            "top_p": 1.0,
            "request_timeout": 180.0,
            "api_base": "http://127.0.0.1:11434/v1",
            "api_version": null,
            "proxy": null,
            "cognitive_services_endpoint": null,
            "deployment_name": null,
            "model_supports_json": true,
            "tokens_per_minute": 0,
            "requests_per_minute": 0,
            "max_retries": 10,
            "max_retry_wait": 10.0,
            "sleep_on_rate_limit_recommendation": true,
            "concurrent_requests": 10
        },
        "parallelization": {
            "stagger": 0.3,
            "num_threads": 50
        },
        "async_mode": "threaded",
        "prompt": null,
        "max_length": 2000,
        "max_input_length": 8000,
        "strategy": null
    },
    "claim_extraction": {
        "llm": {
            "api_key": "REDACTED, length 3",
            "type": "openai_chat",
            "model": "qwen2:latest",
            "max_tokens": 4000,
            "temperature": 0.0,
            "top_p": 1.0,
            "request_timeout": 180.0,
            "api_base": "http://127.0.0.1:11434/v1",
            "api_version": null,
            "proxy": null,
            "cognitive_services_endpoint": null,
            "deployment_name": null,
            "model_supports_json": true,
            "tokens_per_minute": 0,
            "requests_per_minute": 0,
            "max_retries": 10,
            "max_retry_wait": 10.0,
            "sleep_on_rate_limit_recommendation": true,
            "concurrent_requests": 10
        },
        "parallelization": {
            "stagger": 0.3,
            "num_threads": 50
        },
        "async_mode": "threaded",
        "enabled": false,
        "prompt": "prompts/claim_extraction.txt",
        "description": "Any claims or facts that could be relevant to information discovery.",
        "max_gleanings": 0,
        "strategy": null
    },
    "cluster_graph": {
        "max_cluster_size": 10,
        "strategy": null
    },
    "umap": {
        "enabled": false
    },
    "local_search": {
        "text_unit_prop": 0.5,
        "community_prop": 0.1,
        "conversation_history_max_turns": 5,
        "top_k_entities": 10,
        "top_k_relationships": 10,
        "max_tokens": 12000,
        "llm_max_tokens": 2000
    },
    "global_search": {
        "temperature": 0.0,
        "top_p": 1.0,
        "max_tokens": 12000,
        "data_max_tokens": 12000,
        "map_max_tokens": 1000,
        "reduce_max_tokens": 2000,
        "concurrency": 32
    },
    "encoding_model": "cl100k_base",
    "skip_workflows": []
}
16:56:10,110 graphrag.index.create_pipeline_config INFO skipping workflows 
16:56:10,111 graphrag.index.run INFO Running pipeline
16:56:10,111 graphrag.index.storage.file_pipeline_storage INFO Creating file storage at ragtest\output\20240718-165610\artifacts
16:56:10,112 graphrag.index.input.load_input INFO loading input from root_dir=input
16:56:10,112 graphrag.index.input.load_input INFO using file storage for input
16:56:10,113 graphrag.index.storage.file_pipeline_storage INFO search ragtest\input for files matching .*\.txt$
16:56:10,114 graphrag.index.input.text INFO found text files from input, found [('ikaros.txt', {})]
16:56:10,116 graphrag.index.workflows.v1.create_base_entity_graph INFO Created 2 steps for create_base_entity_graph
16:56:10,117 graphrag.index.workflows.load INFO Workflow Run Order: ['create_base_text_units', 'create_base_extracted_entities', 'create_summarized_entities', 'create_base_entity_graph', 'create_final_entities', 'create_final_nodes', 'create_final_communities', 'join_text_units_to_entity_ids', 'create_final_relationships', 'join_text_units_to_relationship_ids', 'create_final_community_reports', 'create_final_text_units', 'create_base_documents', 'create_final_documents']
16:56:10,117 graphrag.index.run INFO Final # of rows loaded: 1
16:56:10,212 graphrag.index.run INFO Running workflow: create_base_text_units...
16:56:10,212 graphrag.index.run INFO dependencies for create_base_text_units: []
16:56:10,212 datashaper.workflow.workflow INFO executing verb orderby
16:56:10,213 datashaper.workflow.workflow INFO executing verb zip
16:56:10,213 datashaper.workflow.workflow INFO executing verb aggregate_override
16:56:10,216 datashaper.workflow.workflow INFO executing verb chunk
16:56:10,379 datashaper.workflow.workflow INFO executing verb select
16:56:10,380 datashaper.workflow.workflow INFO executing verb unroll
16:56:10,381 datashaper.workflow.workflow INFO executing verb rename
16:56:10,382 datashaper.workflow.workflow INFO executing verb genid
16:56:10,382 datashaper.workflow.workflow INFO executing verb unzip
16:56:10,383 datashaper.workflow.workflow INFO executing verb copy
16:56:10,383 datashaper.workflow.workflow INFO executing verb filter
16:56:10,389 graphrag.index.emit.parquet_table_emitter INFO emitting parquet table create_base_text_units.parquet
16:56:10,487 graphrag.index.run INFO Running workflow: create_base_extracted_entities...
16:56:10,487 graphrag.index.run INFO dependencies for create_base_extracted_entities: ['create_base_text_units']
16:56:10,487 graphrag.index.run INFO read table from storage: create_base_text_units.parquet
16:56:10,494 datashaper.workflow.workflow INFO executing verb entity_extract
16:56:10,497 graphrag.llm.openai.create_openai_client INFO Creating OpenAI client base_url=http://127.0.0.1:11434/v1
16:56:10,518 graphrag.index.llm.load_llm INFO create TPM/RPM limiter for qwen2:latest: TPM=0, RPM=0
16:56:10,518 graphrag.index.llm.load_llm INFO create concurrency limiter for qwen2:latest: 10
16:56:10,521 datashaper.workflow.workflow INFO executing verb snapshot
16:56:10,523 datashaper.workflow.workflow INFO executing verb merge_graphs
16:56:10,525 datashaper.workflow.workflow INFO executing verb snapshot_rows
16:56:10,528 graphrag.index.emit.parquet_table_emitter INFO emitting parquet table create_base_extracted_entities.parquet
16:56:10,617 graphrag.index.run INFO Running workflow: create_summarized_entities...
16:56:10,618 graphrag.index.run INFO dependencies for create_summarized_entities: ['create_base_extracted_entities']
16:56:10,618 graphrag.index.run INFO read table from storage: create_base_extracted_entities.parquet
16:56:10,621 datashaper.workflow.workflow INFO executing verb summarize_descriptions
16:56:10,624 datashaper.workflow.workflow INFO executing verb snapshot_rows
16:56:10,626 graphrag.index.emit.parquet_table_emitter INFO emitting parquet table create_summarized_entities.parquet
16:56:10,727 graphrag.index.run INFO Running workflow: create_base_entity_graph...
16:56:10,728 graphrag.index.run INFO dependencies for create_base_entity_graph: ['create_summarized_entities']
16:56:10,728 graphrag.index.run INFO read table from storage: create_summarized_entities.parquet
16:56:10,732 datashaper.workflow.workflow INFO executing verb cluster_graph
16:56:10,734 datashaper.workflow.workflow ERROR Error executing verb "cluster_graph" in create_base_entity_graph: EmptyNetworkError
Traceback (most recent call last):
  File "f:\GraphRAG-Ollama-UI\Miniconda3\lib\site-packages\datashaper\workflow\workflow.py", line 410, in _execute_verb
    result = node.verb.func(**verb_args)
  File "F:\GraphRAG-Ollama-UI\graphrag\index\verbs\graph\clustering\cluster_graph.py", line 61, in cluster_graph
    results = output_df[column].apply(lambda graph: run_layout(strategy, graph))
  File "f:\GraphRAG-Ollama-UI\Miniconda3\lib\site-packages\pandas\core\series.py", line 4924, in apply
    ).apply()
  File "f:\GraphRAG-Ollama-UI\Miniconda3\lib\site-packages\pandas\core\apply.py", line 1427, in apply
    return self.apply_standard()
  File "f:\GraphRAG-Ollama-UI\Miniconda3\lib\site-packages\pandas\core\apply.py", line 1507, in apply_standard
    mapped = obj._map_values(
  File "f:\GraphRAG-Ollama-UI\Miniconda3\lib\site-packages\pandas\core\base.py", line 921, in _map_values
    return algorithms.map_array(arr, mapper, na_action=na_action, convert=convert)
  File "f:\GraphRAG-Ollama-UI\Miniconda3\lib\site-packages\pandas\core\algorithms.py", line 1743, in map_array
    return lib.map_infer(values, mapper, convert=convert)
  File "lib.pyx", line 2972, in pandas._libs.lib.map_infer
  File "F:\GraphRAG-Ollama-UI\graphrag\index\verbs\graph\clustering\cluster_graph.py", line 61, in <lambda>
    results = output_df[column].apply(lambda graph: run_layout(strategy, graph))
  File "F:\GraphRAG-Ollama-UI\graphrag\index\verbs\graph\clustering\cluster_graph.py", line 167, in run_layout
    clusters = run_leiden(graph, strategy)
  File "F:\GraphRAG-Ollama-UI\graphrag\index\verbs\graph\clustering\strategies\leiden.py", line 26, in run
    node_id_to_community_map = _compute_leiden_communities(
  File "F:\GraphRAG-Ollama-UI\graphrag\index\verbs\graph\clustering\strategies\leiden.py", line 61, in _compute_leiden_communities
    community_mapping = hierarchical_leiden(
  File "<@beartype(graspologic.partition.leiden.hierarchical_leiden) at 0x16cf58224d0>", line 304, in hierarchical_leiden
  File "f:\GraphRAG-Ollama-UI\Miniconda3\lib\site-packages\graspologic\partition\leiden.py", line 588, in hierarchical_leiden
    hierarchical_clusters_native = gn.hierarchical_leiden(
leiden.EmptyNetworkError: EmptyNetworkError
16:56:10,737 graphrag.index.reporting.file_workflow_callbacks INFO Error executing verb "cluster_graph" in create_base_entity_graph: EmptyNetworkError details=None
16:56:10,737 graphrag.index.run ERROR error running workflow create_base_entity_graph
Traceback (most recent call last):
  File "F:\GraphRAG-Ollama-UI\graphrag\index\run.py", line 323, in run_pipeline
    result = await workflow.run(context, callbacks)
  File "f:\GraphRAG-Ollama-UI\Miniconda3\lib\site-packages\datashaper\workflow\workflow.py", line 369, in run
    timing = await self._execute_verb(node, context, callbacks)
  File "f:\GraphRAG-Ollama-UI\Miniconda3\lib\site-packages\datashaper\workflow\workflow.py", line 410, in _execute_verb
    result = node.verb.func(**verb_args)
  File "F:\GraphRAG-Ollama-UI\graphrag\index\verbs\graph\clustering\cluster_graph.py", line 61, in cluster_graph
    results = output_df[column].apply(lambda graph: run_layout(strategy, graph))
  File "f:\GraphRAG-Ollama-UI\Miniconda3\lib\site-packages\pandas\core\series.py", line 4924, in apply
    ).apply()
  File "f:\GraphRAG-Ollama-UI\Miniconda3\lib\site-packages\pandas\core\apply.py", line 1427, in apply
    return self.apply_standard()
  File "f:\GraphRAG-Ollama-UI\Miniconda3\lib\site-packages\pandas\core\apply.py", line 1507, in apply_standard
    mapped = obj._map_values(
  File "f:\GraphRAG-Ollama-UI\Miniconda3\lib\site-packages\pandas\core\base.py", line 921, in _map_values
    return algorithms.map_array(arr, mapper, na_action=na_action, convert=convert)
  File "f:\GraphRAG-Ollama-UI\Miniconda3\lib\site-packages\pandas\core\algorithms.py", line 1743, in map_array
    return lib.map_infer(values, mapper, convert=convert)
  File "lib.pyx", line 2972, in pandas._libs.lib.map_infer
  File "F:\GraphRAG-Ollama-UI\graphrag\index\verbs\graph\clustering\cluster_graph.py", line 61, in <lambda>
    results = output_df[column].apply(lambda graph: run_layout(strategy, graph))
  File "F:\GraphRAG-Ollama-UI\graphrag\index\verbs\graph\clustering\cluster_graph.py", line 167, in run_layout
    clusters = run_leiden(graph, strategy)
  File "F:\GraphRAG-Ollama-UI\graphrag\index\verbs\graph\clustering\strategies\leiden.py", line 26, in run
    node_id_to_community_map = _compute_leiden_communities(
  File "F:\GraphRAG-Ollama-UI\graphrag\index\verbs\graph\clustering\strategies\leiden.py", line 61, in _compute_leiden_communities
    community_mapping = hierarchical_leiden(
  File "<@beartype(graspologic.partition.leiden.hierarchical_leiden) at 0x16cf58224d0>", line 304, in hierarchical_leiden
  File "f:\GraphRAG-Ollama-UI\Miniconda3\lib\site-packages\graspologic\partition\leiden.py", line 588, in hierarchical_leiden
    hierarchical_clusters_native = gn.hierarchical_leiden(
leiden.EmptyNetworkError: EmptyNetworkError
16:56:10,739 graphrag.index.reporting.file_workflow_callbacks INFO Error running pipeline! details=None
Ikaros-521 commented 2 months ago

I had configured it incorrectly; the issue has been resolved.

embeddings:
  async_mode: threaded
  llm:
    api_base: http://127.0.0.1:11434/api
    api_key: ${GRAPHRAG_API_KEY}
    concurrent_requests: 10
    model: nomic-embed-text:latest
    type: openai_embedding
encoding_model: cl100k_base
entity_extraction:
  entity_types:
  - organization
  - person
  - geo
  - event
  max_gleanings: 0
  prompt: prompts/entity_extraction.txt
global_search:
  concurrency: 32
input:
  base_dir: input
  file_encoding: utf-8
  file_pattern: .*\.txt$
  file_type: text
  type: file
llm:
  api_base: http://127.0.0.1:11434/v1
  api_key: ${GRAPHRAG_API_KEY}
  concurrent_requests: 10
  model: qwen2:latest
  model_supports_json: true
  type: openai_chat
hbh112233abc commented 1 month ago

I configured it incorrectly, it has been resolved

embeddings:
  async_mode: threaded
  llm:
    api_base: http://127.0.0.1:11434/api
    api_key: ${GRAPHRAG_API_KEY}
    concurrent_requests: 10
    model: nomic-embed-text:latest
    type: openai_embedding
encoding_model: cl100k_base
entity_extraction:
  entity_types:
  - organization
  - person
  - geo
  - event
  max_gleanings: 0
  prompt: prompts/entity_extraction.txt
global_search:
  concurrency: 32
input:
  base_dir: input
  file_encoding: utf-8
  file_pattern: .*\.txt$
  file_type: text
  type: file
llm:
  api_base: http://127.0.0.1:11434/v1
  api_key: ${GRAPHRAG_API_KEY}
  concurrent_requests: 10
  model: qwen2:latest
  model_supports_json: true
  type: openai_chat

Thanks, it works. I found that the faulty line was `llm.type`: the old value was `type: openai`, and it must be changed to `type: openai_chat`.

ForestDake commented 1 month ago

Still not working... I have just adapted the settings file (the LLM and Embedding blocks) to use my Ollama instance, and I also adapted the Python files openai_embeddings_llm.py and embedding.py, but I still get the error you mentioned. My changes to the settings file:

Under LLM: `model: mistral`, `max_tokens: 5000`, `request_timeout: 500`, `api_base: https://localhost:11434/v1`

Under Embedding: `model: nomic-embed-text`, `api_base: https://localhost:11434/api`

Is there anything else I missed adapting?

Thank you so much!!!

Ikaros-521 commented 1 month ago

Still wrong... I have just adapted the setting file, with LLM and Embedding blocks, to use my ollam. (还是不对……我刚刚调整了设置文件中的 LLM 和 Embedding 块,以使用我的 Ollama。) And adapt the python file openai_embeddings_llm.py & embedding.py (并调整了 Python 文件 openai_embeddings_llm.py 和 embedding.py。) But still error as you have mentioned. (但仍然出现你提到的错误。) My setting file changed: (我的设置文件已更改:)

Under LLM (LLM 下): model: mistral, max_tokens: 5000, request_timeout: 500 (请求超时:500), api_base: https://localhost:11434/v1

Under Embedding (Embedding 下): model: nomic-embed-text, api_base: https://localhost:11434/api

Any missing adapting? 有没有什么不适应的?

Thank you so much!!! 非常感谢!

You can try another LLM model; Ollama compatibility is not stable.