I am trying to use pytest's parallelization capabilities to run the following code. However, the tests stop before the script finishes executing. As you can see in the script there are multiple instances of sleep and a large number of looping elements. Is there a way to keep the tests running until they complete? Thanks!
I am using the following line in the command line to run the tests.
import pytest
import polars as pl
from itertools import chain
import os
from pathlib import Path
import re
# pytest parametrization
# pytest parametrization: one test invocation per county database.
@pytest.mark.parametrize("data", ["canadian", "cherokee"])
def test_multi_threaded(sb, data):
    """Scrape OSCN traffic-case pages for every year/county combination.

    ``sb`` is the seleniumbase browser fixture; ``data`` is the county
    database name, used both as the output sub-directory and as the ``db``
    query parameter of the OSCN case-information URL.
    """
    # NOTE(review): chdir mutates process-global state. pytest-xdist workers
    # are separate processes so they don't race on it, but in-process
    # parallelism would — confirm the intended execution model.
    os.chdir("C:/warrant-data-oklahoma")

    # The seven per-case CSVs a completed scrape is expected to have produced.
    file_suffixes = (
        "overview", "parties", "attorneys", "events",
        "counts", "dockets", "defendant_info",
    )

    # The no-data threshold depends only on the county, so compute it once
    # instead of on every one of the ~20000 inner iterations.
    ndc_threshold = 500 if data == "cotton" else 250

    for year_val in range(2016, 2018):
        # Counter intended to track consecutive cases with no data.
        # NOTE(review): nothing below ever increments this, so the threshold
        # branch is currently unreachable — confirm whether an increment was
        # lost (e.g. on "no data found" pages).
        no_data_counter = 0

        # Ensure the output directory for this year/county exists.
        directory = Path.cwd().joinpath(str(year_val), data)
        if directory.exists():
            print("Directory to save files exists. Starting Scraping.")
        else:
            directory.mkdir(parents=True, exist_ok=True)

        for case_num in range(1, 20000):
            try:
                # Guard clause: once the threshold is exceeded, skip the
                # remaining case numbers for this year (message per case,
                # matching the original control flow).
                if no_data_counter > ndc_threshold:
                    print("It appears I have reached the maximum number of cases for year "
                          + str(year_val) + " in county " + str(data))
                    continue

                # Skip cases whose seven output files all exist already.
                stem = f"tr_{year_val}_{case_num}"
                if all((directory / f"{stem}_{suffix}.csv").is_file()
                       for suffix in file_suffixes):
                    print("All Files Exists. Skipping Scraping for File "
                          + str(case_num) + " in county " + str(data))
                    continue

                print("Starting Scraping for File " + str(case_num)
                      + " in county " + str(data))
                sb.uc_open_with_reconnect(
                    "https://www.oscn.net/dockets/GetCaseInformation.aspx?db="
                    + data + "&number=TR-" + str(year_val) + "-" + str(case_num),
                    reconnect_time=12
                )
                sb.sleep(11)

                # Best-effort clicks on the two possible challenge/submit
                # controls; absence of either element is expected, not an
                # error, so a missing button is logged and scraping proceeds.
                try:
                    # clicks the checkbox inside the challenge frame
                    sb.uc_click('/html/body/div/div/div[1]/div/label/input',
                                by="xpath", reconnect_time=12)
                except Exception:
                    print("Found no Submit Button. Starting Scraping.")
                try:
                    # clicks the form's submit button
                    sb.uc_click("/html/body/div/form/input[1]",
                                by="xpath", reconnect_time=12)
                except Exception:
                    print("Found no Submit Button. Starting Scraping.")
            except Exception:
                # Was a bare ``except:``, which also swallows SystemExit,
                # KeyboardInterrupt and pytest's own control exceptions and
                # can make a run die unrecoverably instead of continuing.
                # Catching Exception keeps the per-case loop resilient
                # without hiding interpreter/test-runner shutdown.
                print('Scraping Failed. A new approach is needed.')
Without a stack trace, I can't really see where your script is failing. If unreliable code is properly wrapped in try/except blocks, then loops should continue going.
Hey!
I am trying to use pytest's parallelization capabilities to run the following code. However, the tests stop before the script finishes executing. As you can see in the script there are multiple instances of sleep and a large number of looping elements. Is there a way to keep the tests running until they complete? Thanks!
I am using the following line in the command line to run the tests.