markowanga / stweet

Advanced Python library to scrape Twitter (tweets, users) via the unofficial API
MIT License
589 stars · 67 forks

Get 404 error when trying TweetSearchRunner.run() in version 2.1.1 #102

Closed · shi1ro closed this issue 1 year ago

shi1ro commented 1 year ago

Running Python 3.9.6 with stweet 2.1.1, I followed the example code and made a few changes:

import stweet as st
from stweet.twitter_api.twitter_auth_web_client_interceptor import TwitterAuthWebClientInterceptor
import arrow
PROXY_URL = 'socks5://localhost:4781'
web_client = st.RequestsWebClient(
    proxy=st.RequestsWebClientProxyConfig(
        http_proxy=PROXY_URL,
        https_proxy=PROXY_URL
    ),
    interceptors=[TwitterAuthWebClientInterceptor()]
)
def try_search():
    since = '2023-01-01'
    until = '2023-01-02'
    arrow_since = arrow.get(since, "YYYY-MM-DD")
    arrow_until = arrow.get(until, "YYYY-MM-DD")
    search_tweets_task = st.SearchTweetsTask(all_words='covid',
                                             since=arrow_since,
                                             until=arrow_until)
    output_jl_tweets = st.JsonLineFileRawOutput('output_raw_search_tweets.jl')
    output_jl_users = st.JsonLineFileRawOutput('output_raw_search_users.jl')
    output_print = st.PrintRawOutput()

    st.TweetSearchRunner(search_tweets_task=search_tweets_task,
                         tweet_raw_data_outputs=[output_print, output_jl_tweets],
                         user_raw_data_outputs=[output_print, output_jl_users],
                         web_client=web_client).run()

def try_user_scrap():
    user_task = st.GetUsersTask(['iga_swiatek'])
    output_json = st.JsonLineFileRawOutput('output_raw_user.jl')
    output_print = st.PrintRawOutput()
    st.GetUsersRunner(get_user_task=user_task,
                      raw_data_outputs=[output_print, output_json],
                      web_client=web_client).run()

def try_tweet_by_id_scrap():
    id_task = st.TweetsByIdTask('1447348840164564994')
    output_json = st.JsonLineFileRawOutput('output_raw_id.jl')
    output_print = st.PrintRawOutput()
    st.TweetsByIdRunner(tweets_by_id_task=id_task,
                        raw_data_outputs=[output_print, output_json]).run()

if __name__ == '__main__':
    try_search()
    #try_user_scrap()
    #try_tweet_by_id_scrap()

try_user_scrap and try_tweet_by_id_scrap both work when called, but try_search fails.
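Since the same web_client succeeds in try_user_scrap, one quick isolation step is to rerun the search with no proxy configured. A minimal sketch, assuming TweetSearchRunner falls back to a default web client when web_client is omitted (as TweetsByIdRunner does above):

import stweet as st

# Same search task as in try_search, but without the proxied web client,
# to check whether the 404 appears only when going through the SOCKS proxy.
task = st.SearchTweetsTask(all_words='covid')
st.TweetSearchRunner(
    search_tweets_task=task,
    tweet_raw_data_outputs=[st.PrintRawOutput()],
    user_raw_data_outputs=[st.PrintRawOutput()],
).run()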

Here is the traceback from the failing try_search call:

Traceback (most recent call last):
  File "F:\upshi1ro\jprank\Twittertest\st.py", line 45, in <module>
    try_search()
  File "F:\upshi1ro\jprank\Twittertest\st.py", line 24, in try_search
    st.TweetSearchRunner(search_tweets_task=search_tweets_task,
  File "D:\study\python3\lib\site-packages\stweet\search_runner\search_runner.py", line 51, in run
    self._execute_next_tweets_request()
  File "D:\study\python3\lib\site-packages\stweet\search_runner\search_runner.py", line 72, in _execute_next_tweets_request
    raise ScrapBatchBadResponse(response)
stweet.exceptions.scrap_batch_bad_response.ScrapBatchBadResponse: RequestResponse(status_code=404, text='{"errors":[{"message":"Sorry, that page does not exist","code":34}]}')

I also tried printing request_params inside _execute_next_tweets_request:

RequestDetails(http_method=<HttpMethod.GET: 1>, url='https://twitter.com/i/api/2/search/adaptive.json', headers={}, params={'include_profile_interstitial_type': '1', 'include_blocking': '1', 'include_blocked_by': '1', 'include_followed_by': '1', 'include_want_retweets': '1', 'include_mute_edge': '1', 'include_can_dm': '1', 'include_can_media_tag': '1', 'skip_status': '1', 'cards_platform': 'Web-12', 'include_cards': '1', 'include_ext_alt_text': 'true', 'include_quote_count': 'true', 'include_reply_count': '1', 'tweet_mode': 'extended', 'include_entities': 'true', 'include_user_entities': 'true', 'include_ext_media_color': 'true', 'include_ext_media_availability': 'true', 'send_error_codes': 'true', 'simple_quoted_tweet': 'true', 'q': 'covid since:2023-01-01 until:2023-01-02', 'count': 20, 'query_source': 'typed_query', 'pc': '1', 'spelling_corrections': '1', 'ext': 'mediaStats,highlightedLabel,voiceInfo'}, timeout=60)
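To inspect the failing response without letting the script crash, the exception can be caught and printed. A minimal sketch; the import path is taken verbatim from the traceback above:

from stweet.exceptions.scrap_batch_bad_response import ScrapBatchBadResponse

try:
    try_search()
except ScrapBatchBadResponse as e:
    # The exception wraps the RequestResponse (status code and body)
    # returned by the search endpoint, as shown in the traceback.
    print(e)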
seitaro1227 commented 1 year ago

I also hit a 404 when going through an http_proxy. As a workaround, I downgraded to urllib3 1.25.11 and it worked fine.
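For anyone applying this workaround, a quick check that the interpreter actually picks up the downgraded package (installed e.g. with pip install urllib3==1.25.11):

import urllib3

# This thread reports the 404 disappears with urllib3 1.25.11,
# so verify the downgrade took effect in the active environment.
print(urllib3.__version__)
assert urllib3.__version__ == "1.25.11", "unexpected urllib3 version"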

shi1ro commented 1 year ago

> I also hit a 404 when going through an http_proxy. As a workaround, I downgraded to urllib3 1.25.11 and it worked fine.

Oh, thanks! It works in my environment too.