MPMG-DCC-UFMG / F01-gerenciamento

2 stars 1 forks source link

Resolver problema de "KeyError: 'id' " na geração de coletores #4

Closed albertoueda closed 2 years ago

albertoueda commented 2 years ago

O problema ocorre para algumas configurações. Reportado por @RitaRez.

Teste com erro

unknown (1)

Teste de sucesso com .json de Teixeiras, direto no servidor:

Config .json:

{
  "source_name": "[Realiza\u00e7\u00e3o-F01] Coleta de Terceiro Setor - Conv\u00eanios de Teixeiras",
  "base_url": "https://pm-teixeiras.publicacao.siplanweb.com.br/",
  "obey_robots": false,
  "data_path": "/datalake/ufmg/webcrawlerc01/realizacaof01/teixeiras/terceiro_setor/convenios",
  "request_type": "GET",
  "form_request_type": "POST",
  "antiblock_download_delay": 2,
  "antiblock_autothrottle_enabled": false,
  "antiblock_autothrottle_start_delay": 2,
  "antiblock_autothrottle_max_delay": 10,
  "antiblock_ip_rotation_enabled": false,
  "antiblock_ip_rotation_type": "tor",
  "antiblock_max_reqs_per_ip": 10,
  "antiblock_max_reuse_rounds": 10,
  "antiblock_proxy_list": "",
  "antiblock_user_agent_rotation_enabled": false,
  "antiblock_reqs_per_user_agent": 100,
  "antiblock_user_agents_list": "",
  "antiblock_insert_cookies_enabled": false,
  "antiblock_cookies_list": "",
  "captcha": "none",
  "has_webdriver": false,
  "webdriver_path": null,
  "img_xpath": null,
  "sound_xpath": null,
  "dynamic_processing": true,
  "explore_links": false,
  "link_extractor_max_depth": null,
  "link_extractor_allow_url": null,
  "link_extractor_allow_domains": null,
  "link_extractor_tags": null,
  "link_extractor_attrs": null,
  "link_extractor_check_type": false,
  "link_extractor_process_value": "",
  "download_files": false,
  "download_files_allow_url": null,
  "download_files_allow_extensions": null,
  "download_files_allow_domains": null,
  "download_files_tags": null,
  "download_files_attrs": null,
  "download_files_process_value": "",
  "download_files_check_large_content": true,
  "download_imgs": false,
  "steps": "{\"step\":\"root\",\"depth\":0,\"children\":[{\"step\":\"espere\",\"depth\":1,\"arguments\":{\"segundos\":\"4\"}},{\"step\":\"screenshot\",\"depth\":1,\"arguments\":{}},{\"step\":\"salva_pagina\",\"depth\":1,\"arguments\":{}},{\"step\":\"atribuicao\",\"depth\":1,\"target\":\"aba_presente\",\"source\":{\"call\":{\"step\":\"elemento_existe_na_pagina\",\"arguments\":{\"xpath\":\"\\\"//a[h2='Conv\u00eanios']\\\"\"}}}},{\"step\":\"se\",\"depth\":1,\"children\":[{\"step\":\"clique\",\"depth\":2,\"arguments\":{\"elemento\":\"\\\"//a[h2='Conv\u00eanios']\\\"\"}},{\"step\":\"espere\",\"depth\":2,\"arguments\":{\"segundos\":\"6\"}},{\"step\":\"para_cada\",\"depth\":2,\"iterator\":\"ano\",\"children\":[{\"step\":\"selecione\",\"depth\":3,\"arguments\":{\"xpath\":\"\\\"//select[@name='exercicio']\\\"\",\"opcao\":\"ano\"}},{\"step\":\"espere\",\"depth\":3,\"arguments\":{\"segundos\":\"1\"}},{\"step\":\"clique\",\"depth\":3,\"arguments\":{\"elemento\":\"\\\"//button[text()='Filtrar']\\\"\"}},{\"step\":\"espere\",\"depth\":3,\"arguments\":{\"segundos\":\"4\"}},{\"step\":\"selecione\",\"depth\":3,\"arguments\":{\"xpath\":\"\\\"//select[@name='gridConvenios_length']\\\"\",\"opcao\":\"\\\"1000\\\"\"}},{\"step\":\"espere\",\"depth\":3,\"arguments\":{\"segundos\":\"4\"}},{\"step\":\"screenshot\",\"depth\":3,\"arguments\":{}},{\"step\":\"salva_pagina\",\"depth\":3,\"arguments\":{}},{\"step\":\"imprime\",\"depth\":3,\"arguments\":{\"texto\":\"f\\\"Coletando dados de {ano}:\\\"\"}},{\"step\":\"clique\",\"depth\":3,\"arguments\":{\"elemento\":\"\\\"//button[@title='Gerar PDF']\\\"\"}},{\"step\":\"imprime\",\"depth\":3,\"arguments\":{\"texto\":\"\\\"- Baixando arquivo PDF\\\"\"}},{\"step\":\"espere\",\"depth\":3,\"arguments\":{\"segundos\":\"4\"}},{\"step\":\"clique\",\"depth\":3,\"arguments\":{\"elemento\":\"\\\"//button[@title='Gerar Excel']\\\"\"}},{\"step\":\"imprime\",\"depth\":3,\"arguments\":{\"texto\":\"\\\"- Baixando arquivo 
Excel\\\"\"}},{\"step\":\"espere\",\"depth\":3,\"arguments\":{\"segundos\":\"4\"}},{\"step\":\"clique\",\"depth\":3,\"arguments\":{\"elemento\":\"\\\"//button[@title='Gerar CSV']\\\"\"}},{\"step\":\"imprime\",\"depth\":3,\"arguments\":{\"texto\":\"\\\"- Baixando arquivo CSV\\\"\"}},{\"step\":\"espere\",\"depth\":3,\"arguments\":{\"segundos\":\"4\"}},{\"step\":\"para_cada\",\"depth\":3,\"iterator\":\"convenio\",\"children\":[{\"step\":\"abrir_em_nova_aba\",\"depth\":4,\"link_xpath\":\"convenio\",\"children\":[]},{\"step\":\"espere\",\"depth\":4,\"arguments\":{\"segundos\":\"2\"}},{\"step\":\"salva_pagina\",\"depth\":4,\"arguments\":{}},{\"step\":\"atribuicao\",\"depth\":4,\"target\":\"numero\",\"source\":{\"call\":{\"step\":\"extrai_texto\",\"arguments\":{\"xpath\":\"\\\"/html/body/div/div/div[2]/div[2]/div/div[1]\\\"\"}}}},{\"step\":\"imprime\",\"depth\":4,\"arguments\":{\"texto\":\"f\\\"-- Salvando dados do Conv\u00eanio n\u00ba {numero.split()[1]}\\\"\"}},{\"step\":\"espere\",\"depth\":4,\"arguments\":{\"segundos\":\"2\"}},{\"step\":\"fechar_aba\",\"depth\":4,\"arguments\":{}},{\"step\":\"espere\",\"depth\":4,\"arguments\":{\"segundos\":\"2\"}},{\"step\":\"clique\",\"depth\":4,\"arguments\":{\"elemento\":\"convenio[:-2] + \\\"[1]\\\"\"}},{\"step\":\"espere\",\"depth\":4,\"arguments\":{\"segundos\":\"1\"}}],\"iterable\":{\"call\":{\"step\":\"localiza_elementos\",\"arguments\":{\"xpath\":\"\\\"//tr[*]/td/a\\\"\"}}}}],\"iterable\":{\"call\":{\"step\":\"opcoes\",\"arguments\":{\"xpath\":\"\\\"//select[@name='exercicio']\\\"\"}}}}],\"condition\":{\"call\":{\"step\":\"objeto\",\"arguments\":{\"objeto\":\"aba_presente\"}}}},{\"step\":\"se\",\"depth\":1,\"children\":[{\"step\":\"imprime\",\"depth\":2,\"arguments\":{\"texto\":\"\\\"Se\u00e7\u00e3o 'Conv\u00eanios' n\u00e3o dispon\u00edvel na p\u00e1gina\\\"\"}}],\"condition\":{\"call\":{\"step\":\"objeto\",\"arguments\":{\"objeto\":\"not(aba_presente)\"}}}}]}",
  "encoding_detection_method": 1,
  "templated_url_parameter_handlers": [],
  "static_form_parameter_handlers": [],
  "templated_url_response_handlers": [],
  "static_form_response_handlers": [],
  "crawler_id": 2198,
  "instance_id": "165068165023522"
}

Teste

import glob, json, requests

# Manual test: create a crawler through the API from a generated config file
# and print the id assigned to it.

# Locate the generated config; fail with a clear message instead of a bare
# IndexError when the file is missing.
matches = glob.glob("configs/temp_320366967183144134764414472177225492345/config_Teixeiras.json")
if not matches:
    raise FileNotFoundError("Arquivo de configuração não encontrado: config_Teixeiras.json")
file = matches[0]

with open(file, 'r', encoding='utf-8') as f:
    config = json.load(f)

# The config is sent as form data (data=), exactly as in the original test.
r_creator = requests.post('http://10.21.0.130:8000/api/crawlers/', data=config)

# Surface the HTTP status and raw body when the server does not answer with JSON.
try:
    configuracao_do_coletor_criado = r_creator.json()
except ValueError as exc:
    raise RuntimeError(
        f"Resposta não é JSON (HTTP {r_creator.status_code}): {r_creator.text[:500]}"
    ) from exc
print("keys existentes:", configuracao_do_coletor_criado.keys())

# When the API does not create the crawler, the answer has no 'id' key.
# Keep raising KeyError, but include what the server actually returned so the
# cause can be investigated.
if 'id' not in configuracao_do_coletor_criado:
    raise KeyError(
        f"'id' ausente na resposta (HTTP {r_creator.status_code}): {configuracao_do_coletor_criado}"
    )

id_crawler = configuracao_do_coletor_criado['id']
print("id_crawer:", id_crawler)

Saída:

unknown (2)
RitaRez commented 2 years ago

Exemplo de configuração e arquivo com parâmetros que causaram o erro:

Configuração:

{
  "source_name": "[Realiza\u00e7\u00e3o-F01] Coleta de <TAG> - <SUBTAG> de <NOME_MUNICIPIO>",
  "base_url": "<URL_BASE><PARAM_URL>",
  "obey_robots": false,
  "data_path": "/datalake/ufmg/webcrawlerc01/realizacaof01/<MUNICIPIO_PATH>/<TAG>/<SUBTAG>",
  "request_type": "GET",
  "form_request_type": "POST",
  "antiblock_download_delay": 2,
  "antiblock_autothrottle_enabled": false,
  "antiblock_autothrottle_start_delay": 2,
  "antiblock_autothrottle_max_delay": 10,
  "antiblock_ip_rotation_enabled": false,
  "antiblock_ip_rotation_type": "tor",
  "antiblock_max_reqs_per_ip": 10,
  "antiblock_max_reuse_rounds": 10,
  "antiblock_proxy_list": "",
  "antiblock_user_agent_rotation_enabled": false,
  "antiblock_reqs_per_user_agent": 100,
  "antiblock_user_agents_list": "",
  "antiblock_insert_cookies_enabled": false,
  "antiblock_cookies_list": "",
  "captcha": "none",
  "has_webdriver": false,
  "webdriver_path": "",
  "img_xpath": "",
  "sound_xpath": "",
  "dynamic_processing": true,
  "explore_links": false,
  "link_extractor_max_depth": null,
  "link_extractor_allow_url": "",
  "link_extractor_allow_domains": "",
  "link_extractor_tags": "",
  "link_extractor_attrs": "",
  "link_extractor_check_type": false,
  "link_extractor_process_value": "",
  "download_files": true,
  "download_files_allow_url": "",
  "download_files_allow_extensions": "pdf,csv,xml",
  "download_files_allow_domains": "",
  "download_files_tags": "",
  "download_files_attrs": "",
  "download_files_process_value": "",
  "download_files_check_large_content": true,
  "download_imgs": false,
  "steps": "{\"step\":\"root\",\"depth\":0,\"children\":[{\"step\":\"espere\",\"depth\":1,\"arguments\":{\"segundos\":\"10\"}},{\"step\":\"selecione\",\"depth\":1,\"arguments\":{\"xpath\":\"\\\"/html/body/div[1]/div[2]/div/form/table[2]/tbody/tr[1]/td/table/tbody/tr[1]/td/table/tbody/tr/td[2]/select\\\"\",\"opcao\":\"\\\"2022\\\"\"}},{\"step\":\"clique\",\"depth\":1,\"arguments\":{\"elemento\":\"\\\"/html/body/div[1]/div[2]/div/form/table[2]/tbody/tr[1]/td/table/tbody/tr[4]/td/table/tbody/tr/td/button\\\"\"}},{\"step\":\"espere\",\"depth\":1,\"arguments\":{\"segundos\":\"5\"}},{\"step\":\"para_cada\",\"depth\":1,\"iterator\":\"csv\",\"children\":[{\"step\":\"clique\",\"depth\":2,\"arguments\":{\"elemento\":\"csv\"}}],\"iterable\":{\"call\":{\"step\":\"localiza_elementos\",\"arguments\":{\"xpath\":\"\\\"/html/body/div[1]/div[2]/div/form/table[2]/tbody/tr[2]/td/table/tbody/tr/td/table/tbody/tr[*]/td/table/tbody/tr/td[7]/a\\\"\"}}}},{\"step\":\"screenshot\",\"depth\":1,\"arguments\":{}},{\"step\":\"salva_pagina\",\"depth\":1,\"arguments\":{}},{\"step\":\"selecione\",\"depth\":1,\"arguments\":{\"xpath\":\"\\\"/html/body/div[1]/div[2]/div/form/table[2]/tbody/tr[1]/td/table/tbody/tr[1]/td/table/tbody/tr/td[2]/select\\\"\",\"opcao\":\"\\\"2021\\\"\"}},{\"step\":\"clique\",\"depth\":1,\"arguments\":{\"elemento\":\"\\\"/html/body/div[1]/div[2]/div/form/table[2]/tbody/tr[1]/td/table/tbody/tr[4]/td/table/tbody/tr/td/button\\\"\"}},{\"step\":\"espere\",\"depth\":1,\"arguments\":{\"segundos\":\"5\"}},{\"step\":\"para_cada\",\"depth\":1,\"iterator\":\"csv\",\"children\":[{\"step\":\"clique\",\"depth\":2,\"arguments\":{\"elemento\":\"csv\"}}],\"iterable\":{\"call\":{\"step\":\"localiza_elementos\",\"arguments\":{\"xpath\":\"\\\"/html/body/div[1]/div[2]/div/form/table[2]/tbody/tr[2]/td/table/tbody/tr/td/table/tbody/tr[*]/td/table/tbody/tr/td[7]/a\\\"\"}}}},{\"step\":\"screenshot\",\"depth\":1,\"arguments\":{}},{\"step\":\"salva_pagina\",\"depth\":1,\"arguments\":{}},{\"step\":\"sel
ecione\",\"depth\":1,\"arguments\":{\"xpath\":\"\\\"/html/body/div[1]/div[2]/div/form/table[2]/tbody/tr[1]/td/table/tbody/tr[1]/td/table/tbody/tr/td[2]/select\\\"\",\"opcao\":\"\\\"2020\\\"\"}},{\"step\":\"clique\",\"depth\":1,\"arguments\":{\"elemento\":\"\\\"/html/body/div[1]/div[2]/div/form/table[2]/tbody/tr[1]/td/table/tbody/tr[4]/td/table/tbody/tr/td/button\\\"\"}},{\"step\":\"espere\",\"depth\":1,\"arguments\":{\"segundos\":\"5\"}},{\"step\":\"para_cada\",\"depth\":1,\"iterator\":\"csv\",\"children\":[{\"step\":\"clique\",\"depth\":2,\"arguments\":{\"elemento\":\"csv\"}}],\"iterable\":{\"call\":{\"step\":\"localiza_elementos\",\"arguments\":{\"xpath\":\"\\\"/html/body/div[1]/div[2]/div/form/table[2]/tbody/tr[2]/td/table/tbody/tr/td/table/tbody/tr[*]/td/table/tbody/tr/td[7]/a\\\"\"}}}},{\"step\":\"screenshot\",\"depth\":1,\"arguments\":{}},{\"step\":\"salva_pagina\",\"depth\":1,\"arguments\":{}},{\"step\":\"selecione\",\"depth\":1,\"arguments\":{\"xpath\":\"\\\"/html/body/div[1]/div[2]/div/form/table[2]/tbody/tr[1]/td/table/tbody/tr[1]/td/table/tbody/tr/td[2]/select\\\"\",\"opcao\":\"\\\"2019\\\"\"}},{\"step\":\"clique\",\"depth\":1,\"arguments\":{\"elemento\":\"\\\"/html/body/div[1]/div[2]/div/form/table[2]/tbody/tr[1]/td/table/tbody/tr[4]/td/table/tbody/tr/td/button\\\"\"}},{\"step\":\"espere\",\"depth\":1,\"arguments\":{\"segundos\":\"5\"}},{\"step\":\"para_cada\",\"depth\":1,\"iterator\":\"csv\",\"children\":[{\"step\":\"clique\",\"depth\":2,\"arguments\":{\"elemento\":\"csv\"}}],\"iterable\":{\"call\":{\"step\":\"localiza_elementos\",\"arguments\":{\"xpath\":\"\\\"/html/body/div[1]/div[2]/div/form/table[2]/tbody/tr[2]/td/table/tbody/tr/td/table/tbody/tr[*]/td/table/tbody/tr/td[7]/a\\\"\"}}}},{\"step\":\"screenshot\",\"depth\":1,\"arguments\":{}},{\"step\":\"salva_pagina\",\"depth\":1,\"arguments\":{}},{\"step\":\"selecione\",\"depth\":1,\"arguments\":{\"xpath\":\"\\\"/html/body/div[1]/div[2]/div/form/table[2]/tbody/tr[1]/td/table/tbody/tr[1]/td/table/tbody
/tr/td[2]/select\\\"\",\"opcao\":\"\\\"2018\\\"\"}},{\"step\":\"clique\",\"depth\":1,\"arguments\":{\"elemento\":\"\\\"/html/body/div[1]/div[2]/div/form/table[2]/tbody/tr[1]/td/table/tbody/tr[4]/td/table/tbody/tr/td/button\\\"\"}},{\"step\":\"espere\",\"depth\":1,\"arguments\":{\"segundos\":\"5\"}},{\"step\":\"para_cada\",\"depth\":1,\"iterator\":\"csv\",\"children\":[{\"step\":\"clique\",\"depth\":2,\"arguments\":{\"elemento\":\"csv\"}}],\"iterable\":{\"call\":{\"step\":\"localiza_elementos\",\"arguments\":{\"xpath\":\"\\\"/html/body/div[1]/div[2]/div/form/table[2]/tbody/tr[2]/td/table/tbody/tr/td/table/tbody/tr[*]/td/table/tbody/tr/td[7]/a\\\"\"}}}},{\"step\":\"screenshot\",\"depth\":1,\"arguments\":{}},{\"step\":\"salva_pagina\",\"depth\":1,\"arguments\":{}},{\"step\":\"selecione\",\"depth\":1,\"arguments\":{\"xpath\":\"\\\"/html/body/div[1]/div[2]/div/form/table[2]/tbody/tr[1]/td/table/tbody/tr[1]/td/table/tbody/tr/td[2]/select\\\"\",\"opcao\":\"\\\"2017\\\"\"}},{\"step\":\"clique\",\"depth\":1,\"arguments\":{\"elemento\":\"\\\"/html/body/div[1]/div[2]/div/form/table[2]/tbody/tr[1]/td/table/tbody/tr[4]/td/table/tbody/tr/td/button\\\"\"}},{\"step\":\"espere\",\"depth\":1,\"arguments\":{\"segundos\":\"5\"}},{\"step\":\"para_cada\",\"depth\":1,\"iterator\":\"csv\",\"children\":[{\"step\":\"clique\",\"depth\":2,\"arguments\":{\"elemento\":\"csv\"}}],\"iterable\":{\"call\":{\"step\":\"localiza_elementos\",\"arguments\":{\"xpath\":\"\\\"/html/body/div[1]/div[2]/div/form/table[2]/tbody/tr[2]/td/table/tbody/tr/td/table/tbody/tr[*]/td/table/tbody/tr/td[7]/a\\\"\"}}}},{\"step\":\"screenshot\",\"depth\":1,\"arguments\":{}},{\"step\":\"salva_pagina\",\"depth\":1,\"arguments\":{}},{\"step\":\"selecione\",\"depth\":1,\"arguments\":{\"xpath\":\"\\\"/html/body/div[1]/div[2]/div/form/table[2]/tbody/tr[1]/td/table/tbody/tr[1]/td/table/tbody/tr/td[2]/select\\\"\",\"opcao\":\"\\\"2016\\\"\"}},{\"step\":\"clique\",\"depth\":1,\"arguments\":{\"elemento\":\"\\\"/html/body/div[1]/div[2
]/div/form/table[2]/tbody/tr[1]/td/table/tbody/tr[4]/td/table/tbody/tr/td/button\\\"\"}},{\"step\":\"espere\",\"depth\":1,\"arguments\":{\"segundos\":\"5\"}},{\"step\":\"para_cada\",\"depth\":1,\"iterator\":\"csv\",\"children\":[{\"step\":\"clique\",\"depth\":2,\"arguments\":{\"elemento\":\"csv\"}}],\"iterable\":{\"call\":{\"step\":\"localiza_elementos\",\"arguments\":{\"xpath\":\"\\\"/html/body/div[1]/div[2]/div/form/table[2]/tbody/tr[2]/td/table/tbody/tr/td/table/tbody/tr[*]/td/table/tbody/tr/td[7]/a\\\"\"}}}},{\"step\":\"screenshot\",\"depth\":1,\"arguments\":{}},{\"step\":\"salva_pagina\",\"depth\":1,\"arguments\":{}},{\"step\":\"selecione\",\"depth\":1,\"arguments\":{\"xpath\":\"\\\"/html/body/div[1]/div[2]/div/form/table[2]/tbody/tr[1]/td/table/tbody/tr[1]/td/table/tbody/tr/td[2]/select\\\"\",\"opcao\":\"\\\"2015\\\"\"}},{\"step\":\"clique\",\"depth\":1,\"arguments\":{\"elemento\":\"\\\"/html/body/div[1]/div[2]/div/form/table[2]/tbody/tr[1]/td/table/tbody/tr[4]/td/table/tbody/tr/td/button\\\"\"}},{\"step\":\"espere\",\"depth\":1,\"arguments\":{\"segundos\":\"5\"}},{\"step\":\"para_cada\",\"depth\":1,\"iterator\":\"csv\",\"children\":[{\"step\":\"clique\",\"depth\":2,\"arguments\":{\"elemento\":\"csv\"}}],\"iterable\":{\"call\":{\"step\":\"localiza_elementos\",\"arguments\":{\"xpath\":\"\\\"/html/body/div[1]/div[2]/div/form/table[2]/tbody/tr[2]/td/table/tbody/tr/td/table/tbody/tr[*]/td/table/tbody/tr/td[7]/a\\\"\"}}}},{\"step\":\"screenshot\",\"depth\":1,\"arguments\":{}},{\"step\":\"salva_pagina\",\"depth\":1,\"arguments\":{}},{\"step\":\"selecione\",\"depth\":1,\"arguments\":{\"xpath\":\"\\\"/html/body/div[1]/div[2]/div/form/table[2]/tbody/tr[1]/td/table/tbody/tr[1]/td/table/tbody/tr/td[2]/select\\\"\",\"opcao\":\"\\\"2014\\\"\"}},{\"step\":\"clique\",\"depth\":1,\"arguments\":{\"elemento\":\"\\\"/html/body/div[1]/div[2]/div/form/table[2]/tbody/tr[1]/td/table/tbody/tr[4]/td/table/tbody/tr/td/button\\\"\"}},{\"step\":\"espere\",\"depth\":1,\"arguments\":{\"seg
undos\":\"5\"}},{\"step\":\"para_cada\",\"depth\":1,\"iterator\":\"csv\",\"children\":[{\"step\":\"clique\",\"depth\":2,\"arguments\":{\"elemento\":\"csv\"}}],\"iterable\":{\"call\":{\"step\":\"localiza_elementos\",\"arguments\":{\"xpath\":\"\\\"/html/body/div[1]/div[2]/div/form/table[2]/tbody/tr[2]/td/table/tbody/tr/td/table/tbody/tr[*]/td/table/tbody/tr/td[7]/a\\\"\"}}}},{\"step\":\"screenshot\",\"depth\":1,\"arguments\":{}},{\"step\":\"salva_pagina\",\"depth\":1,\"arguments\":{}},{\"step\":\"selecione\",\"depth\":1,\"arguments\":{\"xpath\":\"\\\"/html/body/div[1]/div[2]/div/form/table[2]/tbody/tr[1]/td/table/tbody/tr[1]/td/table/tbody/tr/td[2]/select\\\"\",\"opcao\":\"\\\"2013\\\"\"}},{\"step\":\"clique\",\"depth\":1,\"arguments\":{\"elemento\":\"\\\"/html/body/div[1]/div[2]/div/form/table[2]/tbody/tr[1]/td/table/tbody/tr[4]/td/table/tbody/tr/td/button\\\"\"}},{\"step\":\"espere\",\"depth\":1,\"arguments\":{\"segundos\":\"5\"}},{\"step\":\"para_cada\",\"depth\":1,\"iterator\":\"csv\",\"children\":[{\"step\":\"clique\",\"depth\":2,\"arguments\":{\"elemento\":\"csv\"}}],\"iterable\":{\"call\":{\"step\":\"localiza_elementos\",\"arguments\":{\"xpath\":\"\\\"/html/body/div[1]/div[2]/div/form/table[2]/tbody/tr[2]/td/table/tbody/tr/td/table/tbody/tr[*]/td/table/tbody/tr/td[7]/a\\\"\"}}}},{\"step\":\"screenshot\",\"depth\":1,\"arguments\":{}},{\"step\":\"salva_pagina\",\"depth\":1,\"arguments\":{}},{\"step\":\"selecione\",\"depth\":1,\"arguments\":{\"xpath\":\"\\\"/html/body/div[1]/div[2]/div/form/table[2]/tbody/tr[1]/td/table/tbody/tr[1]/td/table/tbody/tr/td[2]/select\\\"\",\"opcao\":\"\\\"2012\\\"\"}},{\"step\":\"clique\",\"depth\":1,\"arguments\":{\"elemento\":\"\\\"/html/body/div[1]/div[2]/div/form/table[2]/tbody/tr[1]/td/table/tbody/tr[4]/td/table/tbody/tr/td/button\\\"\"}},{\"step\":\"espere\",\"depth\":1,\"arguments\":{\"segundos\":\"5\"}},{\"step\":\"para_cada\",\"depth\":1,\"iterator\":\"csv\",\"children\":[{\"step\":\"clique\",\"depth\":2,\"arguments\":{\"eleme
nto\":\"csv\"}}],\"iterable\":{\"call\":{\"step\":\"localiza_elementos\",\"arguments\":{\"xpath\":\"\\\"/html/body/div[1]/div[2]/div/form/table[2]/tbody/tr[2]/td/table/tbody/tr/td/table/tbody/tr[*]/td/table/tbody/tr/td[7]/a\\\"\"}}}},{\"step\":\"screenshot\",\"depth\":1,\"arguments\":{}},{\"step\":\"salva_pagina\",\"depth\":1,\"arguments\":{}},{\"step\":\"selecione\",\"depth\":1,\"arguments\":{\"xpath\":\"\\\"/html/body/div[1]/div[2]/div/form/table[2]/tbody/tr[1]/td/table/tbody/tr[1]/td/table/tbody/tr/td[2]/select\\\"\",\"opcao\":\"\\\"2011\\\"\"}},{\"step\":\"clique\",\"depth\":1,\"arguments\":{\"elemento\":\"\\\"/html/body/div[1]/div[2]/div/form/table[2]/tbody/tr[1]/td/table/tbody/tr[4]/td/table/tbody/tr/td/button\\\"\"}},{\"step\":\"espere\",\"depth\":1,\"arguments\":{\"segundos\":\"5\"}},{\"step\":\"para_cada\",\"depth\":1,\"iterator\":\"csv\",\"children\":[{\"step\":\"clique\",\"depth\":2,\"arguments\":{\"elemento\":\"csv\"}}],\"iterable\":{\"call\":{\"step\":\"localiza_elementos\",\"arguments\":{\"xpath\":\"\\\"/html/body/div[1]/div[2]/div/form/table[2]/tbody/tr[2]/td/table/tbody/tr/td/table/tbody/tr[*]/td/table/tbody/tr/td[7]/a\\\"\"}}}},{\"step\":\"screenshot\",\"depth\":1,\"arguments\":{}},{\"step\":\"salva_pagina\",\"depth\":1,\"arguments\":{}}]}",
  "encoding_detection_method": 1,
  "templated_url_parameter_handlers": [],
  "static_form_parameter_handlers": [],
  "templated_url_response_handlers": [],
  "static_form_response_handlers": [],
  "crawler_id": 195,
  "instance_id": "165227565414343"
}

Parâmetros:

{
    "Ibiaí": {"param_url": "PmIbiai", "data_path":"ibiai"},
    "Presidente Kubitschek": {"param_url": "PresKubitschek", "data_path":"presidente_kubitschek"},
    "Engenheiro Navarro": { "param_url": "EngNavrro", "data_path":"engenheiro_navarro"},
    "Turmalina": { "param_url": "Turmalina", "data_path":"turmalina"},
    "Chapada Gaúcha": { "param_url":"ChapadaGaucha", "data_path":"chapada_gaucha"},
    "Francisco Badaró": { "param_url":"FrancBandr", "data_path":"francisco_badaro"},
    "Frei Lagonegro": { "param_url":"PMFreiLagoNegro", "data_path":"frei_lagonegro"},
    "Gameleiras": { "param_url": "Gameleiras", "data_path":"gameleiras"},
    "Alvorada de Minas": { "param_url":"alvoradademinas", "data_path":"alvorada_de_minas"},
    "Claro dos Poções": { "param_url":"ClarDosPces", "data_path":"claro_dos_pocoes"},
    "Água Boa": { "param_url":"AguaBoa", "data_path":"agua_boa"},
    "Novorizonte": { "param_url":"Novorizonte", "data_path":"novorizonte"},
    "Cachoeira de Pajeú": { "param_url":"CachoeiraPajeu", "data_path":"cachoeira_de_pajeu"},
    "Berilo": { "param_url":"Berilo", "data_path":"berilo"},
    "Patis": { "param_url":"Patis", "data_path":"patis"},
    "Espinosa": { "param_url":"Espnsa", "data_path":"espinosa"},
    "Bonito de Minas": { "param_url":"BonitoDeMinas", "data_path":"bonito_de_minas"},
    "São João da Lagoa": { "param_url":"SJDLgoa", "data_path":"sao_joao_da_lagoa"},
    "Materlândia": { "param_url":"Materlandia", "data_path":"materlandia"},
    "Gouveia": { "param_url":"Gouveia", "data_path":"gouveia"},
    "Ponto Chique": { "param_url":"PontoChique", "data_path":"ponto_chique"}, 
    "Catuti": { "param_url":"Catuti", "data_path":"catuti"},
    "Fruta de Leite": { "param_url":"FrutadeLeite", "data_path":"fruta_de_leite"},
    "Mirabela": { "param_url":"mirabela", "data_path":"mirabela"},
    "Itinga": { "param_url":"Itinga", "data_path":"itinga"},
    "Felício dos Santos": { "param_url":"FelicioSantos", "data_path":"felicio_dos_santos"},
    "Senador Modestino Gonçalves": { "param_url":"SenModestino", "data_path":"senador_modestino_goncalves"},
    "Itamarandiba": { "param_url":"Itamarandiba", "data_path":"itamarandiba"},
    "Coração de Jesus": { "param_url":"CoracaoDeJes", "data_path":"coracao_de_jesus"},
    "Francisco Sá": { "param_url":"FranciscoSa", "data_path":"francisco_sa"},
    "Josenópolis": { "param_url":"Josenopolis", "data_path":"josenopolis"},
    "São Romão": { "param_url":"SaoRomao", "data_path":"sao_romao"},
    "Indaiabira": { "param_url":"indaiabira", "data_path":"indaiabira"},
    "Pintópolis": { "param_url":"PMPintopolis", "data_path":"pintopolis"},
    "Mamonas": { "param_url":"mamonas", "data_path":"mamonas"},
    "Datas": { "param_url":"Datas", "data_path":"datas"},
    "Jaíba": { "param_url":"JaibaPM", "data_path":"jaiba"},
    "Augusto de Lima": { "param_url":"PMAugustoDeLima", "data_path":"augusto_de_lima"},
    "Juramento": { "param_url":"Juramento", "data_path":"juramento"},
    "Capitão Enéas": { "param_url":"CapitaoEneas", "data_path":"capitao_eneas"},
    "Janaúba": { "param_url":"PMJanauba", "data_path":"janauba"},
    "Cristália": { "param_url":"Cristalia", "data_path":"cristalia"},
    "Guaraciama": { "param_url":"PMGuaraciama", "data_path":"guaraciama"},
    "Pai Pedro": { "param_url":"PaiPedro", "data_path":"pai_pedro"},
    "Águas Vermelhas": { "param_url":"PMAguasVermelhas", "data_path":"aguas_vermelhas"},
    "Bocaiúva": { "param_url":"PMBocaiuva", "data_path":"bocaiuva"},
    "Várzea da Palma": { "param_url":"PMVarzeaDaPalma", "data_path":"varzea_da_palma"},
    "Divisópolis": { "param_url":"Divisopolis", "data_path":"divisopolis"},
    "Virgem da Lapa": { "param_url":"VirgemLapa", "data_path":"virgem_da_lapa"},
    "Novo Cruzeiro": { "param_url":"pmnovocruzeiro", "data_path":"novo_cruzeiro"},
    "Divisa Alegre": { "param_url":"DivisaAlegre", "data_path":"divisa_alegre"},
    "Glaucilândia": { "param_url":"Glaucilandia", "data_path":"glaucilandia"},
    "São Gonçalo do Rio Preto": { "param_url":"SaoGoncaloDoRioPreto",  "data_path":"sao_goncalo_do_rio_preto"},
    "Capelinha": { "param_url":"PMCapelinha", "data_path":"capelinha"},
    "Lontra": { "param_url":"PmLontra", "data_path":"lontra"},
    "Curral de Dentro": { "param_url":"CurralDeDentro", "data_path":"curral_de_dentro"},
    "São João das Missões": { "param_url":"SaoJoaoDasMiss", "data_path":"sao_joao_das_missoes"},
    "Jequitaí": { "param_url":"Jequitai", "data_path":"joquitai"},
    "Rio Pardo de Minas": { "param_url":"PrefRioPMinas", "data_path":"rio_pardo_de_minas"},
    "Nova Porteirinha": { "param_url":"NovaPorteirinha", "data_path":"nova_porteirinha"},
    "Aricanduva": { "param_url":"Aricanduva", "data_path":"aricanduva"},
    "Ladainha": { "param_url":"Ladainha", "data_path":"ladainha"},
    "Serra Azul de Minas": { "param_url":"SerraAzulDeMinas", "data_path":"serra_azul_de_minas"},
    "Coluna": { "param_url":"Coluna", "data_path":"coluna"},
    "Mário Campos": { "param_url":"MarioCampos", "data_path":"mario_campos"},
    "Vargem Grande do Rio Pardo": { "param_url":"VgmGrnd", "data_path":"vargem_grande_do_rio_pardo"},
    "Joaquim Felício": { "param_url":"JquimFelicio", "data_path":"joaquim_felicio"},
    "Monte Azul": { "param_url":"MonteAzul", "data_path":"monte_azul"},
    "Miravânia": { "param_url":"miravania", "data_path":"miravania"},
    "Couto de Magalhães de Minas": { "param_url":"CoutoMagalhaes", "data_path":"couto_de_magalhaes_de_minas"},
    "Mato Verde": { "param_url":"MatoVerde", "data_path":"mato_verde"},
    "Pedra Azul": { "param_url":"PedraAzul", "data_path":"pedra_azul"},
    "Santa Cruz de Salinas": { "param_url":"SantaCruzDeSalinas", "data_path":"santa_cruz_de_salinas"},
    "Brasília de Minas": { "param_url":"BrasiliaDeMinas", "data_path":"brasilia_de_minas"},
    "Mata Verde": { "param_url":"MataVerde", "data_path":"mata_verde"},
    "Itacambira": { "param_url":"Itacambira", "data_path":"itacambira"},
    "Santo Hipólito": { "param_url":"SantoHipolito", "data_path":"santo_hipolito"},
    "Padre Carvalho": { "param_url":"PMPadreCarvalho", "data_path":"padre_carvalho"},
    "Santana do Riacho": { "param_url":"PmSantanaDoRiacho", "data_path":"santana_do_riacho"},
    "Jenipapo de Minas": { "param_url":"JenipapoMinas", "data_path":"jenipapo_de_minas"},
    "Veredinha": { "param_url":"Veredinha", "data_path":"veredinha"},
    "Santo Antônio do Retiro": { "param_url":"SantoAntoniodoRetiro", "data_path":"santo_antonio_do_retiro"},
    "São João do Pacuí": { "param_url":"SaoJoaoPacui", "data_path":"sao_joao_do_pacui"},
    "Manga": { "param_url":"Manga", "data_path":"manga"},
    "Montezuma": { "param_url":"Montezuma", "data_path":"montezuma"},
    "Itacarambi": { "param_url":"PMItacarambi", "data_path":"itacarambi"}
}
RitaRez commented 2 years ago

Foram realizadas tentativas com mais de um usuário, com um arquivo de parâmetros com menos municípios e com outros coletores do template Síntese e Tecnologia. Todas levaram ao mesmo erro.

albertoueda commented 2 years ago

Resolvido: descobrimos que o problema eram os espaços em branco na tag e subtag, que geravam URLs inválidas para o coletor. Ao mesmo tempo, o try-catch não logava corretamente o erro para o usuário (json obtido), o que dificultava sua investigação. O log foi corrigido e o formulário atualizado.

Ex. de configuração gerada que causava o erro (config_Ibiaí.json, data_path com espaços):

    "source_name": "[Realiza\u00e7\u00e3o-F01] Coleta de Servidores Publicos - Folha de Pagamento de Ibia\u00ed",
    "base_url": "http://cidadesmg.com.br/portaltransparencia/faces/user/folha/FFolhaPagamento.xhtml?Param=PmIbiai",
    "obey_robots": false,
    "data_path": "/datalake/ufmg/webcrawlerc01/realizacaof01/ibiai/servidores publicos/folha de pagamento",
    "request_type": "GET",
    "form_request_type": "POST",
    "antiblock_download_delay": 2,
    "..."

Obs.: tais requisições funcionavam no passado, mesmo com os espaços em branco, i.e., as mesmas configurações que antes funcionavam passaram a dar erro. A mudança de comportamento será notificada a C01.