twintproject / twint

An advanced Twitter scraping & OSINT tool written in Python that doesn't use Twitter's API, allowing you to scrape a user's followers, following, Tweets and more while evading most API limitations.
MIT License
15.68k stars 2.71k forks source link

[ERROR] TypeError: sequence item 0: expected str instance, dict found #1031

Open ghost opened 3 years ago

ghost commented 3 years ago

Initial Check

Command Ran

import twint
c = twint.Config()
c.Database = path + '/tweets.db'
c.Username = 'MrAndyNgo'
twint.run.Search(c)

Description of Issue

At first I thought it was a simple type mistake in my own code, but I stripped it all the way down to the base code shown above and, after performing all tests, I get the same result no matter what.

[+] Inserting into Database: /user/home/tweets.db
Traceback (most recent call last):
  File "/home/developer/Docs/Development/Scrapers/twint-scraper2.py", line 10, in <module>
    twint.run.Search(c)
  File "/home/developer/.local/lib/python3.8/site-packages/twint/run.py", line 410, in Search
    run(config, callback)
  File "/home/developer/.local/lib/python3.8/site-packages/twint/run.py", line 329, in run
    get_event_loop().run_until_complete(Twint(config).main(callback))
  File "/usr/lib/python3.8/asyncio/base_events.py", line 616, in run_until_complete
    return future.result()
  File "/home/developer/.local/lib/python3.8/site-packages/twint/run.py", line 235, in main
    await task
  File "/home/developer/.local/lib/python3.8/site-packages/twint/run.py", line 286, in run
    await self.tweets()
  File "/home/developer/.local/lib/python3.8/site-packages/twint/run.py", line 226, in tweets
    await output.Tweets(tweet, self.config, self.conn)
  File "/home/developer/.local/lib/python3.8/site-packages/twint/output.py", line 166, in Tweets
    await checkData(tweets, config, conn)
  File "/home/developer/.local/lib/python3.8/site-packages/twint/output.py", line 140, in checkData
    db.tweets(conn, tweet, config)
  File "/home/developer/.local/lib/python3.8/site-packages/twint/storage/db.py", line 264, in tweets
    ",".join(Tweet.mentions),
**TypeError: sequence item 0: expected str instance, dict found**

Environment Details

Ubuntu 20.04, XFCE4 Terminal, Python3.8

OlvArthur commented 3 years ago

This is what you'd need for handling mentions on line 34 in format.py, because in the new implementation `mentions` is no longer a list of strings; it is a list of dicts, so each item has to be serialized before joining:

34 output = output.replace("{mentions}", ",".join([json.dumps(mention) for mention in t.mentions]))

Originally posted by @himanshudabas in https://github.com/twintproject/twint/issues/960#issuecomment-707859707

You have to implement this either by making a fork or by modifying the local twint directory. You also need to add `import json` at the top of the file.

rachmadaniHaryono commented 3 years ago

I tried the above change, but Tweet.mentions in db.py is still a dict instead of a string.

My solution is to apply the same method as mentioned above (serializing the mentions with `json.dumps`), but in db.py.

patch

```diff
From 392fbd97bda384eda4406b8e27463f29f6c2e4da Mon Sep 17 00:00:00 2001
From: rachmadaniHaryono
Date: Thu, 26 Nov 2020 13:57:17 +0800
Subject: [PATCH 1/5] new: dev: json.dumps tweet mentions

---
 twint/storage/db.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/twint/storage/db.py b/twint/storage/db.py
index 1b2d2bc..c3fa9da 100644
--- a/twint/storage/db.py
+++ b/twint/storage/db.py
@@ -243,6 +243,10 @@ def tweets(conn, Tweet, config):
     try:
         time_ms = round(time.time()*1000)
         cursor = conn.cursor()
+        try:
+            mentions = ",".join(Tweet.mentions)
+        except TypeError as err:
+            mentions = json.dumps(Tweet.mentions)
         entry = (Tweet.id,
                     Tweet.id_str,
                     Tweet.tweet,
@@ -261,7 +265,7 @@ def tweets(conn, Tweet, config):
                     Tweet.username,
                     Tweet.name,
                     Tweet.link,
-                    ",".join(Tweet.mentions),
+                    mentions,
                     ",".join(Tweet.hashtags),
                     ",".join(Tweet.cashtags),
                     ",".join(Tweet.urls),
--
2.27.0

From b30d111c1d6b5c2e4dbdb05c915491aac8910165 Mon Sep 17 00:00:00 2001
From: rachmadaniHaryono
Date: Thu, 26 Nov 2020 14:04:17 +0800
Subject: [PATCH 2/5] new: dev: log on error

---
 twint/storage/db.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/twint/storage/db.py b/twint/storage/db.py
index c3fa9da..977d810 100644
--- a/twint/storage/db.py
+++ b/twint/storage/db.py
@@ -1,10 +1,14 @@
+import hashlib
+import json
+import logging
 import sqlite3
 import sys
 import time
-import hashlib
-
 from datetime import datetime

+LOGGER = logging.getLogger(__name__)
+
+
 def Conn(database):
     if database:
         print("[+] Inserting into Database: " + str(database))
@@ -246,6 +250,7 @@ def tweets(conn, Tweet, config):
         try:
             mentions = ",".join(Tweet.mentions)
         except TypeError as err:
+            LOGGER.exception(err)
             mentions = json.dumps(Tweet.mentions)
         entry = (Tweet.id,
                     Tweet.id_str,
--
2.27.0

From 3b78fdb4be90795dca98b392d0d5a368a35bf028 Mon Sep 17 00:00:00 2001
From: rachmadaniHaryono
Date: Thu, 26 Nov 2020 14:14:25 +0800
Subject: [PATCH 3/5] fix: dev: datetime with WITA

---
 twint/storage/db.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/twint/storage/db.py b/twint/storage/db.py
index 977d810..cbcd208 100644
--- a/twint/storage/db.py
+++ b/twint/storage/db.py
@@ -293,7 +293,18 @@ def tweets(conn, Tweet, config):

         if Tweet.retweet:
             query = 'INSERT INTO retweets VALUES(?,?,?,?,?)'
-            _d = datetime.timestamp(datetime.strptime(Tweet.retweet_date, "%Y-%m-%d %H:%M:%S"))
+
+            def get_datetime(inp):
+                return datetime.timestamp(datetime.strptime(inp, "%Y-%m-%d %H:%M:%S"))
+
+            try:
+                _d = get_datetime(Tweet.retweet_date)
+            except ValueError as err:
+                if Tweet.retweet_date.endswith(' WITA'):
+                    LOGGER.exception(err)
+                    _d = get_datetime(Tweet.retweet_date.rsplit(' WITA', 1)[0])
+                else:
+                    raise err
             cursor.execute(query, (int(Tweet.user_rt_id), Tweet.user_rt, Tweet.id, int(Tweet.retweet_id), _d))

         if Tweet.reply_to:
--
2.27.0

From 0f1f0d06fe6c1d73edcd9f587ed317462a03ffd3 Mon Sep 17 00:00:00 2001
From: rachmadaniHaryono
Date: Thu, 26 Nov 2020 14:16:51 +0800
Subject: [PATCH 4/5] fix: dev: reply user_id

---
 twint/storage/db.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/twint/storage/db.py b/twint/storage/db.py
index cbcd208..f4ec794 100644
--- a/twint/storage/db.py
+++ b/twint/storage/db.py
@@ -310,7 +310,15 @@ def tweets(conn, Tweet, config):
         if Tweet.reply_to:
             for reply in Tweet.reply_to:
                 query = 'INSERT INTO replies VALUES(?,?,?)'
-                cursor.execute(query, (Tweet.id, int(reply['user_id']), reply['username']))
+                try:
+                    reply_user_id = int(reply['user_id'])
+                except KeyError as err:
+                    LOGGER.exception(err)
+                    if 'user_id' not in reply:
+                        reply_user_id = 0
+                    else:
+                        raise err
+                cursor.execute(query, (Tweet.id, reply_user_id, reply['username']))

         conn.commit()
     except sqlite3.IntegrityError:
--
2.27.0

From 49aa347b1f7256aad7aaaf4acb7a8543aa9b1db7 Mon Sep 17 00:00:00 2001
From: rachmadaniHaryono
Date: Thu, 26 Nov 2020 14:25:07 +0800
Subject: [PATCH 5/5] fix: dev: value from dict

---
 twint/storage/db.py | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/twint/storage/db.py b/twint/storage/db.py
index f4ec794..38cd3b6 100644
--- a/twint/storage/db.py
+++ b/twint/storage/db.py
@@ -4,6 +4,7 @@ import logging
 import sqlite3
 import sys
 import time
+import typing
 from datetime import datetime

 LOGGER = logging.getLogger(__name__)
@@ -310,15 +311,26 @@ def tweets(conn, Tweet, config):
         if Tweet.reply_to:
             for reply in Tweet.reply_to:
                 query = 'INSERT INTO replies VALUES(?,?,?)'
-                try:
-                    reply_user_id = int(reply['user_id'])
-                except KeyError as err:
-                    LOGGER.exception(err)
-                    if 'user_id' not in reply:
-                        reply_user_id = 0
-                    else:
-                        raise err
-                cursor.execute(query, (Tweet.id, reply_user_id, reply['username']))
+
+                def get_value(dict_inp: typing.Dict[str, typing.Any], key: str, default_value: typing.Any):
+                    """get value from key with default_value.
+
+                    .. note::
+
+                        it may be better to replace this with default :py:class:`dict.get`
+                    """
+                    try:
+                        return dict_inp[key]
+                    except KeyError as err:
+                        if key not in dict_inp:
+                            LOGGER.exception(err)
+                            return default_value
+                        else:
+                            raise err
+
+                reply_user_id = int(get_value(reply, 'user_id', 0))
+                reply_username = get_value(reply, 'username', '')
+                cursor.execute(query, (Tweet.id, reply_user_id, reply_username))

         conn.commit()
     except sqlite3.IntegrityError:
--
2.27.0
```

The above patch is based on commit a45a8ac and also includes fixes for some other errors I found when running the following command:

$ twint --username username --database tweet.db --debug

Edit: the above patch is outdated; the updated patch is on my branch here: https://github.com/rachmadaniHaryono/twint/tree/develop