domantasm96 / URL-categorization-using-machine-learning


RuntimeError #10

Closed · dbae1145 closed this issue 1 year ago

dbae1145 commented 1 year ago

Hello,

I'm currently experiencing a RuntimeError that says "An attempt has been made to start a new process before the current process has finished its bootstrapping phase."

File "C:\Users\sbae\PycharmProjects\URL-categorization-using-machine-learning\01_construct_features.py", line 29, in res = ex.map(parse_request, [(i, elem) for i, elem in enumerate(results)]) File "C:\Program Files\Python310\lib\concurrent\futures\process.py", line 761, in map results = super().map(partial(_process_chunk, fn), File "C:\Program Files\Python310\lib\concurrent\futures_base.py", line 610, in map fs = [self.submit(fn, args) for args in zip(iterables)] File "C:\Program Files\Python310\lib\concurrent\futures_base.py", line 610, in fs = [self.submit(fn, args) for args in zip(iterables)] File "C:\Program Files\Python310\lib\concurrent\futures\process.py", line 732, in submit self._adjust_process_count() File "C:\Program Files\Python310\lib\concurrent\futures\process.py", line 692, in _adjust_process_count self._spawn_process() File "C:\Program Files\Python310\lib\concurrent\futures\process.py", line 709, in _spawn_process p.start() File "C:\Program Files\Python310\lib\multiprocessing\process.py", line 121, in start self._popen = self._Popen(self) File "C:\Program Files\Python310\lib\multiprocessing\context.py", line 336, in _Popen return Popen(process_obj) File "C:\Program Files\Python310\lib\multiprocessing\popen_spawn_win32.py", line 45, in init prep_data = spawn.get_preparation_data(process_obj._name) File "C:\Program Files\Python310\lib\multiprocessing\spawn.py", line 154, in get_preparation_data _check_not_importing_main() File "C:\Program Files\Python310\lib\multiprocessing\spawn.py", line 134, in _check_not_importing_main raise RuntimeError(''' RuntimeError: An attempt has been made to start a new process before the current process has finished its bootstrapping phase.

    This probably means that you are not using fork to start your
    child processes and you have forgotten to use the proper idiom
    in the main module:

        if __name__ == '__main__':
            freeze_support()
            ...

    The "freeze_support()" line can be omitted if the program
    is not going to be frozen to produce an executable.
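
For reference, the idiom the message asks for looks like this (a minimal, self-contained sketch with a toy worker function, not the project's actual code):

import os
from concurrent.futures import ProcessPoolExecutor

def work(x):
    # The worker must live at the module's top level so that spawned
    # child processes can import it; x * x stands in for real work.
    return x * x

if __name__ == '__main__':
    # Everything that starts processes must sit under this guard: on
    # Windows each child re-imports the main module, and the guard stops
    # that re-import from trying to spawn children of its own.
    with ProcessPoolExecutor(max_workers=os.cpu_count()) as ex:
        print(list(ex.map(work, range(10))))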


Zeerti commented 1 year ago

I found a resolution to this issue, @dbae1145. On Windows, multiprocessing uses the spawn start method, which re-imports your main module in every worker process, so any code that starts a process pool has to live under an if __name__ == '__main__': guard. Update the 01_construct_features.py file to the below:

import pandas as pd
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import config
import nltk
import pickle
from functions import timeit, scrape, parse_request

if __name__ == '__main__':
    # Load the dataset and normalize the confidence column name
    df = pd.read_csv(config.MAIN_DATASET_PATH)
    df = df.rename(
        columns={'main_category:confidence': 'main_category_confidence'})
    df = df[['url', 'main_category', 'main_category_confidence']]

    # Keep reachable URLs with a confident label and a whitelisted TLD
    df = df[(df['main_category'] != 'Not_working') &
            (df['main_category_confidence'] >= 0.5)]
    df['url'] = df['url'].apply(lambda x: 'http://' + x)
    df['tld'] = df.url.apply(lambda x: x.split('.')[-1])
    df = df[df.tld.isin(config.TOP_LEVEL_DOMAIN_WHITELIST)
            ].reset_index(drop=True)
    df['tokens'] = ''

    print("Scraping begins. Start: ", datetime.now())
    with ThreadPoolExecutor(config.THREADING_WORKERS) as executor:
        start = datetime.now()
        results = executor.map(scrape, [(i, elem)
                               for i, elem in enumerate(df['url'])])
    exec_1 = timeit(start)
    print('Scraping finished. Execution time: ', exec_1)

    print("Analyzing responses. Start: ", datetime.now())
    with ProcessPoolExecutor(config.MULTIPROCESSING_WORKERS) as ex:
        start = datetime.now()
        res = ex.map(parse_request, [(i, elem)
                     for i, elem in enumerate(results)])

    for props in res:
        i = props[0]
        tokens = props[1]
        df.at[i, 'tokens'] = tokens
    exec_2 = timeit(start)
    print('Analyzing responses. Execution time: ', exec_2)

    df.to_csv(config.TOKENS_PATH, index=False)

    # Part 3: collect the most frequent tokens for every category
    print('Generating words frequency for each category: ', datetime.now())
    start = datetime.now()
    words_frequency = {}
    for category in df.main_category.unique():
        print(category)
        all_words = []
        df_temp = df[df.main_category == category]
        for word in df_temp.tokens:
            all_words.extend(word)
        most_common = [word[0] for word in nltk.FreqDist(
            all_words).most_common(config.FREQUENCY_TOP_WORDS)]
        words_frequency[category] = most_common

    # Save the words_frequency model; a with-block closes the file reliably
    with open(config.WORDS_FREQUENCY_PATH, "wb") as pickle_out:
        pickle.dump(words_frequency, pickle_out)

    exec_3 = timeit(start)
    print('Generating words frequency for each category finished. Execution time: ', exec_3)

    print('Script finished.\nTimes log:\nPart 1: ', exec_1,
          '\nPart 2: ', exec_2, '\nPart 3: ', exec_3)
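
Note that the imports can stay at module level (the spawned workers re-import them anyway); it is only the executable statements, in particular the ProcessPoolExecutor block, that must sit under the guard. Consuming res after the with block also works, because map submits every task up front and leaving the block waits for them all to finish.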