CAIDA / catalog-data

Repo which holds some panda solutions and papers
3 stars 6 forks source link

add panda alternative script to recipe #611

Closed bhuffaker closed 1 year ago

bhuffaker commented 1 year ago

add the following script as an additional alternative solution to: https://catalog.caida.org/recipe/getting_an_asns_name_country_organization

from datetime import datetime
import gzip
import io
import os
import sys

import pandas as pd
filename = "20230401.as-org2info.txt.gz"

# Header lines that open the two pipe-delimited tables inside the file.
ORG_HEADER = "# format:org_id|changed|org_name|country|source"
ASN_HEADER = "# format:aut|changed|aut_name|org_id|opaque_id|source"


def snapshot_date(path):
    """Return the datetime encoded in an as-org2info file name.

    Only the final path component is parsed, so *path* may include
    directories.

    Raises:
        ValueError: if the name is not ``YYYYMMDD.as-org2info.txt.gz``.
    """
    return datetime.strptime(os.path.basename(path), '%Y%m%d.as-org2info.txt.gz')


def _split_sections(lines):
    """Split the file's lines into (org_lines, asn_lines).

    Each returned list starts with its own ``# format:`` header line so it
    can be parsed as a standalone delimited table.

    Raises:
        ValueError: if either header is missing, or the ASN header appears
            before the organization header.
    """
    org_start = next((i for i, line in enumerate(lines) if ORG_HEADER in line), None)
    asn_start = next((i for i, line in enumerate(lines) if ASN_HEADER in line), None)
    if org_start is None or asn_start is None:
        raise ValueError("missing '# format:' header for the org or ASN table")
    if asn_start < org_start:
        raise ValueError("the ASN table must come after the organization table")
    # Slicing by header position keeps every org row; counting rows by hand
    # (nrows arithmetic) is what previously lost the last organization.
    return lines[org_start:asn_start], lines[asn_start:]


def _read_table(lines, renames, columns):
    """Parse one section's lines into a DataFrame limited to *columns*."""
    table = pd.read_csv(
        io.StringIO("".join(lines)),
        delimiter="|",
        # Keep literal fields such as "NA" (e.g. a country code) as text
        # instead of letting pandas turn them into missing values.
        keep_default_na=False,
    )
    return table.rename(columns=renames)[columns]


def load_as_org_info(path):
    """Return one row per ASN joined with its organization's details.

    The gzip file is read once; the organization and ASN tables are then
    parsed from memory and inner-joined on ``org_id``.

    Args:
        path: path to a gzip-compressed as-org2info text file.

    Returns:
        DataFrame with columns ``asn``, ``asn_name``, ``org_id``,
        ``org_name``, ``country`` and ``source``.

    Raises:
        ValueError: if the expected table headers are not found.
    """
    with gzip.open(path, "rt", encoding="utf-8") as fin:
        org_lines, asn_lines = _split_sections(fin.readlines())
    org_df = _read_table(
        org_lines,
        {"# format:org_id": "org_id"},
        ["org_id", "org_name", "country", "source"],
    )
    asn_df = _read_table(
        asn_lines,
        {"# format:aut": "asn", "aut_name": "asn_name"},
        ["asn", "asn_name", "org_id"],
    )
    return asn_df.merge(org_df, on="org_id")


if __name__ == "__main__":
    date = snapshot_date(filename)
    # Output name is the ISO snapshot date, e.g. 2023-04-01_org2info.csv.
    load_as_org_info(filename).to_csv(str(date.date()) + '_org2info.csv', index=False)