datakind / permanent-colandr-back

DataCorps - Colandr Tool resources
https://www.colandrapp.com/signin
14 stars 1 forks source link

Out of memory errors / socket hangups on colandrapp (production) #39

Open AndrewIOM opened 2 years ago

AndrewIOM commented 2 years ago

We are conducting a screening of ~60,000 records in colandrapp, but are finding that the 'unscreened' list often does not load. In addition, sometimes the include / exclude requests start failing (timeout / socket hangups), and we have to wait 10 - 30 minutes for the system to recover the ability to submit decisions.

A typical error when loading the unscreened page is below:

{
  "message": "500 - {\"errors\":\"(psycopg2.errors.OutOfMemory) could not resize shared memory segment \\\"/PostgreSQL.39801251\\\" to 8388608 bytes: Cannot allocate memory\\n [SQL: \\\"SELECT studies.id AS studies_id, studies.created_at AS studies_created_at, studies.last_updated AS studies_last_updated, studies.user_id AS studies_user_id, studies.review_id AS studies_review_id, studies.tags AS studies_tags, studies.data_source_id AS studies_data_source_id, studies.dedupe_status AS studies_dedupe_status, studies.citation_status AS studies_citation_status, studies.fulltext_status AS studies_fulltext_status, studies.data_extraction_status AS studies_data_extraction_status, dedupes_1.id AS dedupes_1_id, dedupes_1.created_at AS dedupes_1_created_at, dedupes_1.review_id AS dedupes_1_review_id, dedupes_1.duplicate_of AS dedupes_1_duplicate_of, dedupes_1.duplicate_score AS dedupes_1_duplicate_score, citations_1.id AS citations_1_id, citations_1.created_at AS citations_1_created_at, citations_1.last_updated AS citations_1_last_updated, citations_1.review_id AS citations_1_review_id, citations_1.type_of_work AS citations_1_type_of_work, citations_1.title AS citations_1_title, citations_1.secondary_title AS citations_1_secondary_title, citations_1.abstract AS citations_1_abstract, citations_1.pub_year AS citations_1_pub_year, citations_1.pub_month AS citations_1_pub_month, citations_1.authors AS citations_1_authors, citations_1.keywords AS citations_1_keywords, citations_1.type_of_reference AS citations_1_type_of_reference, citations_1.journal_name AS citations_1_journal_name, citations_1.volume AS citations_1_volume, citations_1.issue_number AS citations_1_issue_number, citations_1.doi AS citations_1_doi, citations_1.issn AS citations_1_issn, citations_1.publisher AS citations_1_publisher, citations_1.language AS citations_1_language, citations_1.other_fields AS citations_1_other_fields, citations_1.text_content_vector_rep AS citations_1_text_content_vector_rep, fulltexts_1.id AS fulltexts_1_id, fulltexts_1.created_at AS fulltexts_1_created_at, fulltexts_1.last_updated AS fulltexts_1_last_updated, fulltexts_1.review_id AS fulltexts_1_review_id, fulltexts_1.filename AS fulltexts_1_filename, fulltexts_1.original_filename AS fulltexts_1_original_filename, fulltexts_1.text_content AS fulltexts_1_text_content, fulltexts_1.text_content_vector_rep AS fulltexts_1_text_content_vector_rep, data_extractions_1.id AS data_extractions_1_id, data_extractions_1.created_at AS data_extractions_1_created_at, data_extractions_1.last_updated AS data_extractions_1_last_updated, data_extractions_1.review_id AS data_extractions_1_review_id, data_extractions_1.extracted_items AS data_extractions_1_extracted_items \\\\nFROM studies JOIN citations ON citations.id = studies.id LEFT OUTER JOIN dedupes AS dedupes_1 ON studies.id = dedupes_1.id LEFT OUTER JOIN citations AS citations_1 ON studies.id = citations_1.id LEFT OUTER JOIN fulltexts AS fulltexts_1 ON studies.id = fulltexts_1.id LEFT OUTER JOIN data_extractions AS data_extractions_1 ON studies.id = data_extractions_1.id \\\\nWHERE %(param_1)s = studies.review_id AND studies.id IN (\\\\n                    SELECT t.id\\\\n                    FROM (SELECT\\\\n                              studies.id,\\\\n                              studies.dedupe_status,\\\\n                              studies.citation_status,\\\\n                              screenings.user_ids\\\\n                          FROM studies\\\\n                          LEFT JOIN (SELECT citation_id, ARRAY_AGG(user_id) AS user_ids\\\\n                                     FROM citation_screenings\\\\n                                     GROUP BY citation_id\\\\n                                     ) AS screenings\\\\n                          ON studies.id = screenings.citation_id\\\\n                          ) AS t\\\\n                    WHERE\\\\n                        t.dedupe_status = 'not_duplicate' -- this is necessary!\\\\n                        AND t.citation_status NOT IN ('excluded', 'included', 'conflict')\\\\n                        AND (t.citation_status = 'not_screened' OR NOT 6672 = ANY(t.user_ids))\\\\n                    ) ORDER BY random() \\\\n LIMIT %(param_2)s\\\"] [parameters: {'param_1': [removed for github], 'param_2': 1000}] (Background on this error at: http://sqlalche.me/e/e3q8)\"}",
  "error": {
    "name": "StatusCodeError",
    "statusCode": 500,
    "message": "500 - {\"errors\":\"(psycopg2.errors.OutOfMemory) could not resize shared memory segment \\\"/PostgreSQL.39801251\\\" to 8388608 bytes: Cannot allocate memory\\n [SQL: \\\"SELECT studies.id AS studies_id, studies.created_at AS studies_created_at, studies.last_updated AS studies_last_updated, studies.user_id AS studies_user_id, studies.review_id AS studies_review_id, studies.tags AS studies_tags, studies.data_source_id AS studies_data_source_id, studies.dedupe_status AS studies_dedupe_status, studies.citation_status AS studies_citation_status, studies.fulltext_status AS studies_fulltext_status, studies.data_extraction_status AS studies_data_extraction_status, dedupes_1.id AS dedupes_1_id, dedupes_1.created_at AS dedupes_1_created_at, dedupes_1.review_id AS dedupes_1_review_id, dedupes_1.duplicate_of AS dedupes_1_duplicate_of, dedupes_1.duplicate_score AS dedupes_1_duplicate_score, citations_1.id AS citations_1_id, citations_1.created_at AS citations_1_created_at, citations_1.last_updated AS citations_1_last_updated, citations_1.review_id AS citations_1_review_id, citations_1.type_of_work AS citations_1_type_of_work, citations_1.title AS citations_1_title, citations_1.secondary_title AS citations_1_secondary_title, citations_1.abstract AS citations_1_abstract, citations_1.pub_year AS citations_1_pub_year, citations_1.pub_month AS citations_1_pub_month, citations_1.authors AS citations_1_authors, citations_1.keywords AS citations_1_keywords, citations_1.type_of_reference AS citations_1_type_of_reference, citations_1.journal_name AS citations_1_journal_name, citations_1.volume AS citations_1_volume, citations_1.issue_number AS citations_1_issue_number, citations_1.doi AS citations_1_doi, citations_1.issn AS citations_1_issn, citations_1.publisher AS citations_1_publisher, citations_1.language AS citations_1_language, citations_1.other_fields AS citations_1_other_fields, citations_1.text_content_vector_rep AS citations_1_text_content_vector_rep, fulltexts_1.id AS fulltexts_1_id, fulltexts_1.created_at AS fulltexts_1_created_at, fulltexts_1.last_updated AS fulltexts_1_last_updated, fulltexts_1.review_id AS fulltexts_1_review_id, fulltexts_1.filename AS fulltexts_1_filename, fulltexts_1.original_filename AS fulltexts_1_original_filename, fulltexts_1.text_content AS fulltexts_1_text_content, fulltexts_1.text_content_vector_rep AS fulltexts_1_text_content_vector_rep, data_extractions_1.id AS data_extractions_1_id, data_extractions_1.created_at AS data_extractions_1_created_at, data_extractions_1.last_updated AS data_extractions_1_last_updated, data_extractions_1.review_id AS data_extractions_1_review_id, data_extractions_1.extracted_items AS data_extractions_1_extracted_items \\\\nFROM studies JOIN citations ON citations.id = studies.id LEFT OUTER JOIN dedupes AS dedupes_1 ON studies.id = dedupes_1.id LEFT OUTER JOIN citations AS citations_1 ON studies.id = citations_1.id LEFT OUTER JOIN fulltexts AS fulltexts_1 ON studies.id = fulltexts_1.id LEFT OUTER JOIN data_extractions AS data_extractions_1 ON studies.id = data_extractions_1.id \\\\nWHERE %(param_1)s = studies.review_id AND studies.id IN (\\\\n                    SELECT t.id\\\\n                    FROM (SELECT\\\\n                              studies.id,\\\\n                              studies.dedupe_status,\\\\n                              studies.citation_status,\\\\n                              screenings.user_ids\\\\n                          FROM studies\\\\n                          LEFT JOIN (SELECT citation_id, ARRAY_AGG(user_id) AS user_ids\\\\n                                     FROM citation_screenings\\\\n                                     GROUP BY citation_id\\\\n                                     ) AS screenings\\\\n                          ON studies.id = screenings.citation_id\\\\n                          ) AS t\\\\n                    WHERE\\\\n                        t.dedupe_status = 'not_duplicate' -- this is necessary!\\\\n                        AND t.citation_status NOT IN ('excluded', 'included', 'conflict')\\\\n                        AND (t.citation_status = 'not_screened' OR NOT 6672 = ANY(t.user_ids))\\\\n                    ) ORDER BY random() \\\\n LIMIT %(param_2)s\\\"] [parameters: {'param_1': [removed for github], 'param_2': 1000}] (Background on this error at: http://sqlalche.me/e/e3q8)\"}",
    "error": {
      "errors": "(psycopg2.errors.OutOfMemory) could not resize shared memory segment \"/PostgreSQL.39801251\" to 8388608 bytes: Cannot allocate memory\n [SQL: \"SELECT studies.id AS studies_id, studies.created_at AS studies_created_at, studies.last_updated AS studies_last_updated, studies.user_id AS studies_user_id, studies.review_id AS studies_review_id, studies.tags AS studies_tags, studies.data_source_id AS studies_data_source_id, studies.dedupe_status AS studies_dedupe_status, studies.citation_status AS studies_citation_status, studies.fulltext_status AS studies_fulltext_status, studies.data_extraction_status AS studies_data_extraction_status, dedupes_1.id AS dedupes_1_id, dedupes_1.created_at AS dedupes_1_created_at, dedupes_1.review_id AS dedupes_1_review_id, dedupes_1.duplicate_of AS dedupes_1_duplicate_of, dedupes_1.duplicate_score AS dedupes_1_duplicate_score, citations_1.id AS citations_1_id, citations_1.created_at AS citations_1_created_at, citations_1.last_updated AS citations_1_last_updated, citations_1.review_id AS citations_1_review_id, citations_1.type_of_work AS citations_1_type_of_work, citations_1.title AS citations_1_title, citations_1.secondary_title AS citations_1_secondary_title, citations_1.abstract AS citations_1_abstract, citations_1.pub_year AS citations_1_pub_year, citations_1.pub_month AS citations_1_pub_month, citations_1.authors AS citations_1_authors, citations_1.keywords AS citations_1_keywords, citations_1.type_of_reference AS citations_1_type_of_reference, citations_1.journal_name AS citations_1_journal_name, citations_1.volume AS citations_1_volume, citations_1.issue_number AS citations_1_issue_number, citations_1.doi AS citations_1_doi, citations_1.issn AS citations_1_issn, citations_1.publisher AS citations_1_publisher, citations_1.language AS citations_1_language, citations_1.other_fields AS citations_1_other_fields, citations_1.text_content_vector_rep AS citations_1_text_content_vector_rep, fulltexts_1.id AS fulltexts_1_id, fulltexts_1.created_at AS fulltexts_1_created_at, fulltexts_1.last_updated AS fulltexts_1_last_updated, fulltexts_1.review_id AS fulltexts_1_review_id, fulltexts_1.filename AS fulltexts_1_filename, fulltexts_1.original_filename AS fulltexts_1_original_filename, fulltexts_1.text_content AS fulltexts_1_text_content, fulltexts_1.text_content_vector_rep AS fulltexts_1_text_content_vector_rep, data_extractions_1.id AS data_extractions_1_id, data_extractions_1.created_at AS data_extractions_1_created_at, data_extractions_1.last_updated AS data_extractions_1_last_updated, data_extractions_1.review_id AS data_extractions_1_review_id, data_extractions_1.extracted_items AS data_extractions_1_extracted_items \\nFROM studies JOIN citations ON citations.id = studies.id LEFT OUTER JOIN dedupes AS dedupes_1 ON studies.id = dedupes_1.id LEFT OUTER JOIN citations AS citations_1 ON studies.id = citations_1.id LEFT OUTER JOIN fulltexts AS fulltexts_1 ON studies.id = fulltexts_1.id LEFT OUTER JOIN data_extractions AS data_extractions_1 ON studies.id = data_extractions_1.id \\nWHERE %(param_1)s = studies.review_id AND studies.id IN (\\n                    SELECT t.id\\n                    FROM (SELECT\\n                              studies.id,\\n                              studies.dedupe_status,\\n                              studies.citation_status,\\n                              screenings.user_ids\\n                          FROM studies\\n                          LEFT JOIN (SELECT citation_id, ARRAY_AGG(user_id) AS user_ids\\n                                     FROM citation_screenings\\n                                     GROUP BY citation_id\\n                                     ) AS screenings\\n                          ON studies.id = screenings.citation_id\\n                          ) AS t\\n                    WHERE\\n                        t.dedupe_status = 'not_duplicate' -- this is necessary!\\n                        AND t.citation_status NOT IN ('excluded', 'included', 'conflict')\\n                        AND (t.citation_status = 'not_screened' OR NOT 6672 = ANY(t.user_ids))\\n                    ) ORDER BY random() \\n LIMIT %(param_2)s\"] [parameters: {'param_1': [removed for github], 'param_2': 1000}] (Background on this error at: http://sqlalche.me/e/e3q8)"
    },
    "options": {
      "uri": "http://localhost:5000/api/studies",
      "json": true,
      "auth": {
        "user": "[removed for github]"
      },
      "qs": {
        "review_id": "[removed for github]",
        "fields": "id,citation.title,citation.authors,citation.journal_name,citation.pub_year,citation.abstract,citation.keywords,citation.screenings,citation_status,tags",
        "citation_status": "pending",
        "order_by": "relevance",
        "page": 0,
        "per_page": 100
      },
      "simple": true,
      "resolveWithFullResponse": false,
      "transform2xxOnly": false
    },
    "response": {
      "statusCode": 500,
      "body": {
        "errors": "(psycopg2.errors.OutOfMemory) could not resize shared memory segment \"/PostgreSQL.39801251\" to 8388608 bytes: Cannot allocate memory\n [SQL: \"SELECT studies.id AS studies_id, studies.created_at AS studies_created_at, studies.last_updated AS studies_last_updated, studies.user_id AS studies_user_id, studies.review_id AS studies_review_id, studies.tags AS studies_tags, studies.data_source_id AS studies_data_source_id, studies.dedupe_status AS studies_dedupe_status, studies.citation_status AS studies_citation_status, studies.fulltext_status AS studies_fulltext_status, studies.data_extraction_status AS studies_data_extraction_status, dedupes_1.id AS dedupes_1_id, dedupes_1.created_at AS dedupes_1_created_at, dedupes_1.review_id AS dedupes_1_review_id, dedupes_1.duplicate_of AS dedupes_1_duplicate_of, dedupes_1.duplicate_score AS dedupes_1_duplicate_score, citations_1.id AS citations_1_id, citations_1.created_at AS citations_1_created_at, citations_1.last_updated AS citations_1_last_updated, citations_1.review_id AS citations_1_review_id, citations_1.type_of_work AS citations_1_type_of_work, citations_1.title AS citations_1_title, citations_1.secondary_title AS citations_1_secondary_title, citations_1.abstract AS citations_1_abstract, citations_1.pub_year AS citations_1_pub_year, citations_1.pub_month AS citations_1_pub_month, citations_1.authors AS citations_1_authors, citations_1.keywords AS citations_1_keywords, citations_1.type_of_reference AS citations_1_type_of_reference, citations_1.journal_name AS citations_1_journal_name, citations_1.volume AS citations_1_volume, citations_1.issue_number AS citations_1_issue_number, citations_1.doi AS citations_1_doi, citations_1.issn AS citations_1_issn, citations_1.publisher AS citations_1_publisher, citations_1.language AS citations_1_language, citations_1.other_fields AS citations_1_other_fields, citations_1.text_content_vector_rep AS citations_1_text_content_vector_rep, fulltexts_1.id AS fulltexts_1_id, fulltexts_1.created_at AS fulltexts_1_created_at, fulltexts_1.last_updated AS fulltexts_1_last_updated, fulltexts_1.review_id AS fulltexts_1_review_id, fulltexts_1.filename AS fulltexts_1_filename, fulltexts_1.original_filename AS fulltexts_1_original_filename, fulltexts_1.text_content AS fulltexts_1_text_content, fulltexts_1.text_content_vector_rep AS fulltexts_1_text_content_vector_rep, data_extractions_1.id AS data_extractions_1_id, data_extractions_1.created_at AS data_extractions_1_created_at, data_extractions_1.last_updated AS data_extractions_1_last_updated, data_extractions_1.review_id AS data_extractions_1_review_id, data_extractions_1.extracted_items AS data_extractions_1_extracted_items \\nFROM studies JOIN citations ON citations.id = studies.id LEFT OUTER JOIN dedupes AS dedupes_1 ON studies.id = dedupes_1.id LEFT OUTER JOIN citations AS citations_1 ON studies.id = citations_1.id LEFT OUTER JOIN fulltexts AS fulltexts_1 ON studies.id = fulltexts_1.id LEFT OUTER JOIN data_extractions AS data_extractions_1 ON studies.id = data_extractions_1.id \\nWHERE %(param_1)s = studies.review_id AND studies.id IN (\\n                    SELECT t.id\\n                    FROM (SELECT\\n                              studies.id,\\n                              studies.dedupe_status,\\n                              studies.citation_status,\\n                              screenings.user_ids\\n                          FROM studies\\n                          LEFT JOIN (SELECT citation_id, ARRAY_AGG(user_id) AS user_ids\\n                                     FROM citation_screenings\\n                                     GROUP BY citation_id\\n                                     ) AS screenings\\n                          ON studies.id = screenings.citation_id\\n                          ) AS t\\n                    WHERE\\n                        t.dedupe_status = 'not_duplicate' -- this is necessary!\\n                        AND t.citation_status NOT IN ('excluded', 'included', 'conflict')\\n                        AND (t.citation_status = 'not_screened' OR NOT 6672 = ANY(t.user_ids))\\n                    ) ORDER BY random() \\n LIMIT %(param_2)s\"] [parameters: {'param_1': [removed for github], 'param_2': 1000}] (Background on this error at: http://sqlalche.me/e/e3q8)"
      },
      "headers": {
        "server": "gunicorn/19.6.0",
        "date": "Tue, 08 Mar 2022 11:08:35 GMT",
        "connection": "close",
        "content-type": "application/json",
        "content-length": "4292"
      },
      "request": {
        "uri": {
          "protocol": "http:",
          "slashes": true,
          "auth": null,
          "host": "localhost:5000",
          "port": "5000",
          "hostname": "localhost",
          "hash": null,
          "search": "?review_id=[removed for github]&fields=id%2Ccitation.title%2Ccitation.authors%2Ccitation.journal_name%2Ccitation.pub_year%2Ccitation.abstract%2Ccitation.keywords%2Ccitation.screenings%2Ccitation_status%2Ctags&citation_status=pending&order_by=relevance&page=0&per_page=100",
          "query": "review_id=[removed for github]&fields=id%2Ccitation.title%2Ccitation.authors%2Ccitation.journal_name%2Ccitation.pub_year%2Ccitation.abstract%2Ccitation.keywords%2Ccitation.screenings%2Ccitation_status%2Ctags&citation_status=pending&order_by=relevance&page=0&per_page=100",
          "pathname": "/api/studies",
          "path": "/api/studies?review_id=[removed for github]&fields=id%2Ccitation.title%2Ccitation.authors%2Ccitation.journal_name%2Ccitation.pub_year%2Ccitation.abstract%2Ccitation.keywords%2Ccitation.screenings%2Ccitation_status%2Ctags&citation_status=pending&order_by=relevance&page=0&per_page=100",
          "href": "http://localhost:5000/api/studies?review_id=[removed for github]&fields=id%2Ccitation.title%2Ccitation.authors%2Ccitation.journal_name%2Ccitation.pub_year%2Ccitation.abstract%2Ccitation.keywords%2Ccitation.screenings%2Ccitation_status%2Ctags&citation_status=pending&order_by=relevance&page=0&per_page=100"
        },
        "method": "GET",
        "headers": {
          "authorization": "{I've removed this for security reasons}",
          "accept": "application/json"
        }
      }
    }
  }
}

Thanks for any help with these errors.