Twitter CSV as input - Githubissues

RIOZHU123 commented 2 years ago

It might be useful to have a function that converts a Twitter dataset (in the Twitter CSV format produced by twarc2) into the format accepted by preprocess.py. A possible implementation:

def csv2csv(df):
    """Convert a twarc2-style tweet DataFrame to the target CSV column layout.

    Args:
        df: DataFrame containing at least the columns ``id``, ``author.id``,
            ``author.username``, ``text``, ``created_at``, ``entities.urls``
            and ``referenced_tweets``. The last two are read as JSON-encoded
            lists of objects; anything that is not a string or a list
            (NaN, None, ...) is treated as missing.

    Returns:
        A new DataFrame with the columns ``message_id``, ``user_id``,
        ``username``, ``repost_id``, ``reply_id``, ``message``, ``timestamp``
        and ``urls``. ``repost_id`` / ``reply_id`` are NaN when the tweet has
        no such reference; ``urls`` is a comma-joined string of expanded URLs,
        or None when the tweet has no URL entities. The input is not modified.

    Raises:
        KeyError: If a required column is absent from ``df``.
        json.JSONDecodeError: If a non-empty string cell is not valid JSON.

    NOTE(review): the ``timestamp`` output column is the ``created_at`` value
    passed through unchanged, exactly as before. The earlier version also
    derived a numeric time from the tweet id but never selected it (and the
    formula lacked the Twitter snowflake epoch offset), so that dead
    computation was dropped. Confirm whether the consumer expects epoch
    seconds here instead.
    """
    import json

    import numpy as np

    def _load_entries(cell):
        """Return the list of dict entries stored in *cell*, or None if missing."""
        if isinstance(cell, str):
            if not cell.strip():
                return None
            cell = json.loads(cell)
        if isinstance(cell, (list, tuple)):
            return [entry for entry in cell if isinstance(entry, dict)]
        # NaN floats, None, pd.NA, ... -> no data. A type check is used rather
        # than ``is np.nan`` because NaN values are not guaranteed to be that
        # exact singleton object.
        return None

    def urlconverter(cell):
        """Join the expanded URLs of one ``entities.urls`` cell with commas."""
        entries = _load_entries(cell)
        if entries is None:
            return None
        expanded = (entry.get('expanded_url') for entry in entries)
        # Entries without an expanded URL are skipped so the join cannot fail.
        return ','.join(url for url in expanded if isinstance(url, str))

    def re_strl2vec(strlist):
        """Split ``referenced_tweets`` cells into retweet-id and reply-id lists."""
        retweet_list = []
        reply_list = []
        for cell in strlist:
            retweet_id = np.nan
            reply_id = np.nan
            for ref in _load_entries(cell) or ():
                ref_type = ref.get('type')
                if ref_type == 'retweeted':
                    retweet_id = ref.get('id')
                elif ref_type == 'replied_to':
                    # Previously written with ``==``, which discarded the id.
                    reply_id = ref.get('id')
            retweet_list.append(retweet_id)
            reply_list.append(reply_id)
        return retweet_list, reply_list

    mydf = df[['id', 'author.id', 'author.username', 'text', 'created_at', 'entities.urls']].copy()
    mydf['entities.urls'] = mydf['entities.urls'].apply(urlconverter)

    retweet_list, reply_list = re_strl2vec(df['referenced_tweets'].to_list())
    mydf['repost_id'] = retweet_list
    mydf['reply_id'] = reply_list

    select_columns = ['id', 'author.id', 'author.username', 'repost_id', 'reply_id', 'text', 'created_at', 'entities.urls']
    column_names = ['message_id', 'user_id', 'username', 'repost_id', 'reply_id', 'message', 'timestamp', 'urls']
    mydf = mydf[select_columns]
    mydf.columns = column_names
    return mydf

SamHames commented 2 years ago

Given we already support Twitter V2 JSON natively, what does dealing with a specific tool's CSV output add? There are other Twitter data collection tools that output different formats, and I think it makes sense to only support the canonical Twitter output format.

RIOZHU123 commented 2 years ago

Thanks Sam - totally agree!! I attached the Python code in this issue for converting Twarc CSV to CN Toolkit CSV - hope it helps in some circumstances. But again, I agree that the toolkit should target the more general case.

QUT-Digital-Observatory / coordination-network-toolkit

Twitter CSV as input #47