Closed JosephWuMTK closed 1 month ago
Can you post your indexing-engine.log?
16:45:49,52 graphrag.config.read_dotenv INFO No .env file found at /home/appuser/app 16:45:49,58 graphrag.index.cli INFO using default configuration: { "llm": { "api_key": "REDACTED, length 260", "type": "azure_openai_chat", "model": "gpt-4-turbo-preview", "max_tokens": 4000, "temperature": 0.0, "top_p": 1.0, "n": 1, "request_timeout": 180.0, "api_base": "https://private url", "api_version": "2024-05-01-preview", "proxy": null, "cognitive_services_endpoint": null, "deployment_name": "aide-gpt-4o", "model_supports_json": true, "tokens_per_minute": 0, "requests_per_minute": 0, "max_retries": 10, "max_retry_wait": 10.0, "sleep_on_rate_limit_recommendation": true, "concurrent_requests": 25 }, "parallelization": { "stagger": 0.3, "num_threads": 50 }, "async_mode": "threaded", "root_dir": "/home/appuser/app", "reporting": { "type": "file", "base_dir": "hr/fin_doc/QUALCOMM//Company_financials_and_slides//Quarterly_financial_results//FY2024Q2//source/reports", "storage_account_blob_url": null }, "storage": { "type": "file", "base_dir": "hr/fin_doc/QUALCOMM//Company_financials_and_slides//Quarterly_financial_results//FY2024Q2//source/artifacts", "storage_account_blob_url": null }, "cache": { "type": "file", "base_dir": "hr/fin_doc/cache", "storage_account_blob_url": null }, "input": { "type": "file", "file_type": "text", "base_dir": "hr/fin_doc/QUALCOMM//Company_financials_and_slides//Quarterly_financial_results//FY2024Q2//source", "storage_account_blob_url": null, "encoding": "utf-8", "file_pattern": ".\.txt$", "file_filter": null, "source_column": null, "timestamp_column": null, "timestamp_format": null, "text_column": "text", "title_column": null, "document_attribute_columns": [] }, "embed_graph": { "enabled": false, "num_walks": 10, "walk_length": 40, "window_size": 2, "iterations": 3, "random_seed": 597832, "strategy": null }, "embeddings": { "llm": { "api_key": "REDACTED, length 260", "type": "azure_openai_embedding", "model": "text-embedding-ada-002", "max_tokens": 
4000, "temperature": 0, "top_p": 1, "n": 1, "request_timeout": 180.0, "api_base": "https://private url", "api_version": "2023-12-01-preview", "proxy": null, "cognitive_services_endpoint": null, "deployment_name": "aide-text-embedding-ada-002-v2", "model_supports_json": null, "tokens_per_minute": 0, "requests_per_minute": 0, "max_retries": 10, "max_retry_wait": 10.0, "sleep_on_rate_limit_recommendation": true, "concurrent_requests": 25 }, "parallelization": { "stagger": 0.3, "num_threads": 50 }, "async_mode": "threaded", "batch_size": 16, "batch_max_tokens": 8191, "target": "required", "skip": [], "vector_store": null, "strategy": null }, "chunks": { "size": 300, "overlap": 100, "group_by_columns": [ "id" ], "strategy": null }, "snapshots": { "graphml": true, "raw_entities": false, "top_level_nodes": false }, "entity_extraction": { "llm": { "api_key": "REDACTED, length 260", "type": "azure_openai_chat", "model": "gpt-4-turbo-preview", "max_tokens": 4000, "temperature": 0.0, "top_p": 1.0, "n": 1, "request_timeout": 180.0, "api_base": "https://private url", "api_version": "2024-05-01-preview", "proxy": null, "cognitive_services_endpoint": null, "deployment_name": "aide-gpt-4o", "model_supports_json": true, "tokens_per_minute": 0, "requests_per_minute": 0, "max_retries": 10, "max_retry_wait": 10.0, "sleep_on_rate_limit_recommendation": true, "concurrent_requests": 25 }, "parallelization": { "stagger": 0.3, "num_threads": 50 }, "async_mode": "threaded", "prompt": "prompts/entity_extraction.txt", "entity_types": [ "organization", "person", "geo", "event" ], "max_gleanings": 1, "strategy": null }, "summarize_descriptions": { "llm": { "api_key": "REDACTED, length 260", "type": "azure_openai_chat", "model": "gpt-4-turbo-preview", "max_tokens": 4000, "temperature": 0.0, "top_p": 1.0, "n": 1, "request_timeout": 180.0, "api_base": "https://private url", "api_version": "2024-05-01-preview", "proxy": null, "cognitive_services_endpoint": null, "deployment_name": "aide-gpt-4o", 
"model_supports_json": true, "tokens_per_minute": 0, "requests_per_minute": 0, "max_retries": 10, "max_retry_wait": 10.0, "sleep_on_rate_limit_recommendation": true, "concurrent_requests": 25 }, "parallelization": { "stagger": 0.3, "num_threads": 50 }, "async_mode": "threaded", "prompt": "prompts/summarize_descriptions.txt", "max_length": 500, "strategy": null }, "community_reports": { "llm": { "api_key": "REDACTED, length 260", "type": "azure_openai_chat", "model": "gpt-4-turbo-preview", "max_tokens": 4000, "temperature": 0.0, "top_p": 1.0, "n": 1, "request_timeout": 180.0, "api_base": "https://mlop-azure-gateway.mediatek.inc", "api_version": "2024-05-01-preview", "proxy": null, "cognitive_services_endpoint": null, "deployment_name": "aide-gpt-4o", "model_supports_json": true, "tokens_per_minute": 0, "requests_per_minute": 0, "max_retries": 10, "max_retry_wait": 10.0, "sleep_on_rate_limit_recommendation": true, "concurrent_requests": 25 }, "parallelization": { "stagger": 0.3, "num_threads": 50 }, "async_mode": "threaded", "prompt": null, "max_length": 2000, "max_input_length": 8000, "strategy": null }, "claim_extraction": { "llm": { "api_key": "REDACTED, length 260", "type": "azure_openai_chat", "model": "gpt-4-turbo-preview", "max_tokens": 4000, "temperature": 0.0, "top_p": 1.0, "n": 1, "request_timeout": 180.0, "api_base": "https://mlop-azure-gateway.mediatek.inc", "api_version": "2024-05-01-preview", "proxy": null, "cognitive_services_endpoint": null, "deployment_name": "aide-gpt-4o", "model_supports_json": true, "tokens_per_minute": 0, "requests_per_minute": 0, "max_retries": 10, "max_retry_wait": 10.0, "sleep_on_rate_limit_recommendation": true, "concurrent_requests": 25 }, "parallelization": { "stagger": 0.3, "num_threads": 50 }, "async_mode": "threaded", "enabled": false, "prompt": "prompts/claim_extraction.txt", "description": "Any claims or facts that could be relevant to information discovery.", "max_gleanings": 1, "strategy": null }, "cluster_graph": { 
"max_cluster_size": 10, "strategy": null }, "umap": { "enabled": false }, "local_search": { "text_unit_prop": 0.5, "community_prop": 0.1, "conversation_history_max_turns": 5, "top_k_entities": 10, "top_k_relationships": 10, "temperature": 0.0, "top_p": 1.0, "n": 1, "max_tokens": 12000, "llm_max_tokens": 2000 }, "global_search": { "temperature": 0.0, "top_p": 1.0, "n": 1, "max_tokens": 12000, "data_max_tokens": 12000, "map_max_tokens": 1000, "reduce_max_tokens": 2000, "concurrency": 32 }, "encoding_model": "cl100k_base", "skip_workflows": [] } 16:45:49,59 graphrag.index.create_pipeline_config INFO skipping workflows 16:45:49,64 graphrag.index.run INFO Running pipeline 16:45:49,64 graphrag.index.storage.file_pipeline_storage INFO Creating file storage at /hr/fin_doc/QUALCOMM/Company_financials_and_slides/Quarterly_financial_results/FY2024Q2/source/artifacts 16:45:49,67 graphrag.index.input.load_input INFO loading input from root_dir=hr/fin_doc/QUALCOMM//Company_financials_and_slides//Quarterly_financial_results//FY2024Q2//source 16:45:49,67 graphrag.index.input.load_input INFO using file storage for input 16:45:49,68 graphrag.index.storage.file_pipeline_storage INFO search /hr/fin_doc/QUALCOMM/Company_financials_and_slides/Quarterly_financial_results/FY2024Q2/source for files matching ..txt$ 16:45:49,74 graphrag.index.input.text INFO found text files from /hr/fin_doc/QUALCOMM//Company_financials_and_slides//Quarterly_financial_results//FY2024Q2//source, found [('QUALCOMM_FY2024Q2_transcript.txt', {})] 16:45:49,77 graphrag.index.input.text INFO Found 1 files, loading 1 16:45:49,80 graphrag.index.workflows.load INFO Workflow Run Order: ['create_base_text_units', 'create_base_extracted_entities', 'create_summarized_entities', 'create_base_entity_graph', 'create_final_entities', 'create_final_nodes', 'create_final_communities', 'join_text_units_to_entity_ids', 'create_final_relationships', 'join_text_units_to_relationship_ids', 'create_final_community_reports', 
'create_final_text_units', 'create_base_documents', 'create_final_documents'] 16:45:49,81 graphrag.index.run INFO Final # of rows loaded: 1 16:45:49,452 graphrag.index.run INFO Running workflow: create_base_text_units... 16:45:49,452 graphrag.index.run INFO dependencies for create_base_text_units: [] 16:45:49,453 datashaper.workflow.workflow INFO executing verb orderby 16:45:49,454 datashaper.workflow.workflow INFO executing verb zip 16:45:49,455 datashaper.workflow.workflow INFO executing verb aggregate_override 16:45:49,459 datashaper.workflow.workflow INFO executing verb chunk 16:45:50,135 datashaper.workflow.workflow INFO executing verb select 16:45:50,136 datashaper.workflow.workflow INFO executing verb unroll 16:45:50,141 datashaper.workflow.workflow INFO executing verb rename 16:45:50,142 datashaper.workflow.workflow INFO executing verb genid 16:45:50,144 datashaper.workflow.workflow INFO executing verb unzip 16:45:50,145 datashaper.workflow.workflow INFO executing verb copy 16:45:50,145 datashaper.workflow.workflow INFO executing verb filter 16:45:50,154 graphrag.index.emit.parquet_table_emitter INFO emitting parquet table create_base_text_units.parquet 16:45:50,574 graphrag.index.run INFO Running workflow: create_base_extracted_entities... 
16:45:50,574 graphrag.index.run INFO dependencies for create_base_extracted_entities: ['create_base_text_units'] 16:45:50,622 graphrag.index.run INFO read table from storage: create_base_text_units.parquet 16:45:50,630 datashaper.workflow.workflow INFO executing verb entity_extract 16:45:50,640 graphrag.llm.openai.create_openai_client INFO Creating Azure OpenAI client api_base=https://private url, deployment_name=aide-gpt-4o 16:45:50,728 graphrag.index.llm.load_llm INFO create TPM/RPM limiter for gpt-4-turbo-preview: TPM=0, RPM=0 16:45:50,728 graphrag.index.llm.load_llm INFO create concurrency limiter for gpt-4-turbo-preview: 25 16:45:51,358 datashaper.workflow.workflow INFO executing verb merge_graphs 16:45:51,661 datashaper.workflow.workflow INFO executing verb snapshot_rows 16:45:51,723 graphrag.index.emit.parquet_table_emitter INFO emitting parquet table create_base_extracted_entities.parquet 16:45:52,134 graphrag.index.run INFO Running workflow: create_summarized_entities... 16:45:52,134 graphrag.index.run INFO dependencies for create_summarized_entities: ['create_base_extracted_entities'] 16:45:52,138 graphrag.index.run INFO read table from storage: create_base_extracted_entities.parquet
Are there lines later in your log that indicate exceptions being thrown? So far I'm not seeing any errors that indicate why files would be missing.
Here is the complete content of the log.
Error message:
"status": "failure", "stderr": "/home/appuser/.local/lib/python3.11/site-packages/numpy/core/fromnumeric.py:59: FutureWarning: 'DataFrame.swapaxes' is deprecated and will be removed in a future version. Please use 'DataFrame.transpose' instead.\n return bound(*args, kwds)\n/home/appuser/.local/lib/python3.11/site-packages/numpy/core/fromnumeric.py:59: FutureWarning: 'DataFrame.swapaxes' is deprecated and will be removed in a future version. Please use 'DataFrame.transpose' instead.\n return bound(*args, *kwds)\n/home/appuser/.local/lib/python3.11/site-packages/numpy/core/fromnumeric.py:59: FutureWarning: 'DataFrame.swapaxes' is deprecated and will be removed in a future version. Please use 'DataFrame.transpose' instead.\n return bound(args, kwds)\n", "returncode": -9
I identified the cause: the process was killed (returncode -9, OOM) due to container resource limits. Removing the following resource limits from the deploy.yaml file resolved it:
resources: limits: cpu: 500m memory: 512Mi requests: cpu: 250m memory: 256Mi
Is there an existing issue for this?
Describe the issue
Why do I only get the following files (no create_final_*.parquet) in artifacts after indexing:
clustered_graph.0.graphml
embedded_graph.0.graphml clustered_graph.1.graphml
embedded_graph.1.graphml clustered_graph.2.graphml
merged_graph.graphml create_base_extracted_entities.parquet stats.json create_base_text_units.parquet
summarized_graph.graphml create_summarized_entities.parquet
Steps to reproduce
No response
GraphRAG Config Used
Logs and screenshots
No response
Additional Information