Closed dbae1145 closed 1 year ago
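I found a resolution to this issue: update the 01_construct_features.py
file to the code below, @dbae1145. The key change is that the whole script body now sits under an `if __name__ == '__main__':` guard, so the worker processes that ProcessPoolExecutor spawns on Windows do not re-run the script when they import the main module.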
import pandas as pd
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import config
import nltk
import pickle
from functions import timeit, scrape, parse_request
if __name__ == '__main__':
    # Everything runs under the main guard. ProcessPoolExecutor workers may
    # re-import this module (the "spawn" start method, the default on
    # Windows); without the guard each worker would re-run the script and
    # multiprocessing raises "An attempt has been made to start a new process
    # before the current process has finished its bootstrapping phase."

    # --- Load and filter the dataset -------------------------------------
    df = pd.read_csv(config.MAIN_DATASET_PATH)
    df = df.rename(
        columns={'main_category:confidence': 'main_category_confidence'})
    df = df[['url', 'main_category', 'main_category_confidence']]
    # Drop rows labelled 'Not_working' and rows with confidence below 0.5.
    df = df[(df['main_category'] != 'Not_working') &
            (df['main_category_confidence'] >= 0.5)]
    df['url'] = df['url'].apply(lambda x: 'http://' + x)
    # The text after the last '.' in the URL is used as the top-level domain.
    df['tld'] = df.url.apply(lambda x: x.split('.')[-1])
    # Reset the index so positions from enumerate() below match df labels.
    df = df[df.tld.isin(config.TOP_LEVEL_DOMAIN_WHITELIST)
            ].reset_index(drop=True)
    df['tokens'] = ''

    # --- Part 1: scrape every URL with a thread pool ---------------------
    print("Scraping begins. Start: ", datetime.now())
    with ThreadPoolExecutor(config.THREADING_WORKERS) as executor:
        start = datetime.now()
        # Each task receives an (index, url) tuple.
        results = executor.map(scrape, list(enumerate(df['url'])))
    # Leaving the `with` block waits for all submitted tasks, so the timing
    # below covers the whole scraping phase.
    exec_1 = timeit(start)
    print('Scraping finished. Execution time: ', exec_1)

    # --- Part 2: parse the responses with a process pool -----------------
    print("Analyzing responses. Start: ", datetime.now())
    with ProcessPoolExecutor(config.MULTIPROCESSING_WORKERS) as ex:
        start = datetime.now()
        # Each task receives an (index, scrape_result) tuple.
        res = ex.map(parse_request, list(enumerate(results)))

    # parse_request results are indexed as (row index, tokens).
    for props in res:
        row_index = props[0]
        tokens = props[1]
        df.at[row_index, 'tokens'] = tokens
    exec_2 = timeit(start)
    print('Analyzing responses. Execution time: ', exec_2)

    df.to_csv(config.TOKENS_PATH, index=False)

    # --- Part 3: most frequent words per category ------------------------
    print('Generating words frequency for each category: ', datetime.now())
    start = datetime.now()
    words_frequency = {}
    for category in df.main_category.unique():
        print(category)
        all_words = []
        df_temp = df[df.main_category == category]
        for row_tokens in df_temp.tokens:
            all_words.extend(row_tokens)
        # Keep only the words, dropping the counts, of the top-N entries.
        most_common = [
            word for word, _count in nltk.FreqDist(
                all_words).most_common(config.FREQUENCY_TOP_WORDS)]
        words_frequency[category] = most_common

    # Save words_frequency model. The context manager closes the file even
    # if pickle.dump raises.
    with open(config.WORDS_FREQUENCY_PATH, "wb") as pickle_out:
        pickle.dump(words_frequency, pickle_out)
    exec_3 = timeit(start)
    print('Generating words frequency for each category Finished. Execution time: ', exec_3)

    print('Script finished.\nTimes log:\nPart 1: ', exec_1,
          '\nPart 2: ', exec_2, '\nPart 3: ', exec_3)
Hello,
I'm currently experiencing a RuntimeError that says "An attempt has been made to start a new process before the current process has finished its bootstrapping phase."
File "C:\Users\sbae\PycharmProjects\URL-categorization-using-machine-learning\01_construct_features.py", line 29, in <module>
res = ex.map(parse_request, [(i, elem) for i, elem in enumerate(results)])
File "C:\Program Files\Python310\lib\concurrent\futures\process.py", line 761, in map
results = super().map(partial(_process_chunk, fn),
File "C:\Program Files\Python310\lib\concurrent\futures\_base.py", line 610, in map
fs = [self.submit(fn, *args) for args in zip(*iterables)]
File "C:\Program Files\Python310\lib\concurrent\futures\_base.py", line 610, in <listcomp>
fs = [self.submit(fn, *args) for args in zip(*iterables)]
File "C:\Program Files\Python310\lib\concurrent\futures\process.py", line 732, in submit
self._adjust_process_count()
File "C:\Program Files\Python310\lib\concurrent\futures\process.py", line 692, in _adjust_process_count
self._spawn_process()
File "C:\Program Files\Python310\lib\concurrent\futures\process.py", line 709, in _spawn_process
p.start()
File "C:\Program Files\Python310\lib\multiprocessing\process.py", line 121, in start
self._popen = self._Popen(self)
File "C:\Program Files\Python310\lib\multiprocessing\context.py", line 336, in _Popen
return Popen(process_obj)
File "C:\Program Files\Python310\lib\multiprocessing\popen_spawn_win32.py", line 45, in __init__
prep_data = spawn.get_preparation_data(process_obj._name)
File "C:\Program Files\Python310\lib\multiprocessing\spawn.py", line 154, in get_preparation_data
_check_not_importing_main()
File "C:\Program Files\Python310\lib\multiprocessing\spawn.py", line 134, in _check_not_importing_main
raise RuntimeError('''
RuntimeError:
An attempt has been made to start a new process before the
current process has finished its bootstrapping phase.