I have the following unit test running on my brand new (March, 2021) top of the line Macbook Pro with every bell and whistle:
import timeit
import numpy as np
def test_speed_get_cleanco() -> None:
    """Micro-benchmark ``get_cleanco`` via ``timeit`` and report the
    average wall-clock time per one million calls.

    Uses ``get_cleanco.__wrapped__`` so the raw Python function is timed
    without the PySpark UDF machinery around it.
    """
    setup_code = """from DeepDiscovery.oc.utils import get_cleanco
company_names = ["Apple, Inc.", "Filo Oy.", "Jim Jones Company", "Albert Goods, LLC", "Smithson Painting", "Chovak Lubil sk"] * 10
"""
    # NOTE: the loop body must be indented for timeit to compile it.
    test_code = """
for company_name in company_names:
    c = get_cleanco.__wrapped__(company_name)
    a = c.type
    b = c.country
    d = c.basename
"""
    number = 100          # executions of test_code per repeat
    records_per_run = 60  # len(company_names) built in setup_code (6 names * 10)
    t = timeit.Timer(test_code, setup=setup_code)
    run_repeats = t.repeat(repeat=3, number=number)
    avg_time = np.mean(run_repeats)
    # BUG FIX: each execution of test_code calls get_cleanco once per entry of
    # company_names, so the per-record time must divide by the record count as
    # well. The original divided only by `number`, inflating the reported
    # per-call figure by a factor of 60.
    time_per_record = avg_time / (number * records_per_run)
    time_per_million = 1_000_000 * time_per_record
    print(
        f"Time per million DeepDiscovery.oc.utils.get_cleanco calls: {time_per_million:,.2f}"
    )
The code it tests is:
import string

import pyspark.sql.functions as F
import pyspark.sql.types as T
from cleanco import prepare_terms, basename, typesources, countrysources, matches
from pyspark.sql import Row
# Cleanco lookup data, built once at import time and shared by every call to
# the get_cleanco UDF below.
terms = prepare_terms()
type_sources = typesources()
country_sources = countrysources()

# Return schema for get_cleanco: three nullable string fields, in this order.
_CLEANCO_FIELDS = ("basename", "type", "country")
cleanco_schema = T.StructType(
    [T.StructField(field_name, T.StringType(), True) for field_name in _CLEANCO_FIELDS]
)
@F.udf(cleanco_schema)
def get_cleanco(name: str) -> Row:
    """Parse a company name into its basename, business type, and country.

    Parameters
    ----------
    name : str
        A raw company name, e.g. ``"Apple, Inc."``.

    Returns
    -------
    pyspark.sql.Row
        A Row with three fields:
        basename - the company name without its corporate ending
        country - the first country returned by ``cleanco.matches``, or None
        type - the first company type returned by ``cleanco.matches``, or None
    """
    # BUG FIX: `string` was used but never imported (NameError at call time);
    # it is now imported at module level. Also normalize the name ONCE instead
    # of recomputing string.capwords(name.lower()) three times per call — this
    # function runs once per record, so repeated work here adds up.
    normalized = string.capwords(name.lower())
    clean_name = basename(normalized, terms, prefix=False, middle=False, suffix=True)
    # matches() returns a (possibly empty) list of hits; keep only the first.
    countries = matches(normalized, country_sources)
    country = countries[0] if countries else None
    company_types = matches(normalized, type_sources)
    company_type = company_types[0] if company_types else None
    return Row(**{"basename": clean_name, "country": country, "type": company_type})
Its output indicates that each call to the function takes roughly 8 seconds:
Time per million DeepDiscovery.oc.utils.get_cleanco calls: 8,356,185.99
I have tested this without the Spark part and it is still slow. What can I do to speed this up? It is far too slow to run over large datasets.
I have the following unit test running on my brand new (March, 2021) top of the line Macbook Pro with every bell and whistle:
The code it tests is:
It has the following output indicating it takes 8 seconds per execution of the function:
I have tested this without the Spark part and it is still slow. What am I to do? It is extremely slow to run this on lots of data.