Rishikant181 / Rettiwt-API

A CLI tool and an API for fetching data from Twitter for free!
https://rishikant181.github.io/Rettiwt-API/
MIT License
479 stars 46 forks source link

Trying to save all tweets of a public account but it only goes back a month #573

Open StamosArhon opened 3 months ago

StamosArhon commented 3 months ago

I wrote a script that will get all tweets from a specific public account and save them in a json file in batches of 500, starting from the current day and going back in time until the first tweet. So when the first json file is filled, the script gets the id of the last saved tweet and creates the next json file to keep saving from the next tweet. The issue is that I started from 25th of July 2024 and I can only go back to early June but no further. From a certain point on the new json files opened are starting again from the current day.

This is the script:

const { Rettiwt, EResourceType, CursoredData, EErrorCodes } = require('rettiwt-api');
const fs = require('fs');

require('dotenv').config(); // Load API key from .env file
const apiKey = process.env.API_KEY;

// Numeric ID of the public account whose timeline is being archived.
const targetUserId = '579067205';
// How many tweets go into each output JSON file before rolling over.
const tweetsPerFile = 500;

// Custom error handler: converts rate-limit and 503 responses into a
// RateLimitError carrying the number of milliseconds to wait; everything
// else is rethrown unchanged.
class CustomErrorHandler {
  /**
   * Inspects an error raised by rettiwt-api and decides whether it is
   * retryable.
   * @param {Error} error - May carry an axios-style `response` with
   *   `status` and `headers`, and a rettiwt `code`.
   * @throws {RateLimitError} for rate-limit (with response) and 503 errors.
   * @throws {Error} any other error, unchanged.
   */
  handle(error) {
    const status = error.response && error.response.status;
    const isRateLimited =
      error.code === EErrorCodes.RATE_LIMIT_EXCEEDED && error.response;

    if (!isRateLimited && status !== 503) {
      // Not something we know how to retry — let the caller deal with it.
      throw error;
    }

    let retryAfterSeconds;
    if (isRateLimited) {
      // BUG FIX: per Twitter's API convention, `x-rate-limit-reset` is an
      // epoch timestamp in *seconds*, NOT a "seconds to wait" value. The
      // original code used the raw header as the wait time, which after
      // the *1000 conversion meant waiting decades. Compute the delta
      // from now instead, falling back to 15 s when the header is
      // missing/unparsable or already in the past.
      const resetAt = Number.parseInt(
        error.response.headers['x-rate-limit-reset'], 10);
      const delta = Number.isFinite(resetAt)
        ? resetAt - Math.floor(Date.now() / 1000)
        : NaN;
      retryAfterSeconds = Number.isFinite(delta) && delta > 0 ? delta : 15;
      console.warn(`Rate limit exceeded. Retrying after ${retryAfterSeconds} seconds...`);
    } else {
      // If 503 error, wait for 5 minutes
      retryAfterSeconds = 5 * 60; // 5 minutes in seconds
      console.warn(`Service Unavailable (503). Retrying after ${retryAfterSeconds} seconds...`);
    }

    throw new RateLimitError(retryAfterSeconds * 1000); // Convert to milliseconds
  }
}

// Error type signalling that the caller should pause before retrying.
// The wait duration (milliseconds) is exposed as the `ms` property.
class RateLimitError extends Error {
  /**
   * @param {number} ms - Milliseconds to wait before retrying.
   */
  constructor(ms) {
    const message = `Rate limit exceeded. Retry after ${ms} milliseconds.`;
    super(message);
    this.name = 'RateLimitError';
    this.ms = ms;
  }
}

// Initialize Rettiwt with the custom error handler
// (apiKey comes from .env above; CustomErrorHandler turns rate-limit /
// 503 responses into RateLimitError so the fetch loop can back off).
const rettiwt = new Rettiwt({
  apiKey,
  errorHandler: new CustomErrorHandler()
});

/**
 * Fetches a user's timeline page by page, de-duplicating tweets and
 * saving them into numbered JSON files of `tweetsPerFile` tweets each.
 * Retries on rate-limit / 429 / 503 errors after a back-off delay.
 *
 * @param {string} userId - Numeric id of the target user.
 * @param {string} [cursor] - Timeline cursor to resume from.
 * @param {Object[]} [allTweets] - Tweets already collected for the current file.
 * @param {number} [delay=5000] - Milliseconds to wait between pages.
 * @param {number} [fileIndex=0] - Index of the current output file.
 * @param {Set<string>} [seenTweetIds] - Ids already saved (dedup within a file).
 * @returns {Promise<Object[]>} tweets left unsaved when pagination ended
 *   (they are also flushed to the final file before returning).
 */
async function getAllTweets(userId, cursor = undefined, allTweets = [], delay = 5000, fileIndex = 0, seenTweetIds = new Set()) {
  console.log(`Starting tweet fetching for user: ${userId}, file index: ${fileIndex}, cursor: ${cursor}`);

  // Iterative pagination loop (the original recursed once per page,
  // building an unbounded promise chain over a long archive run).
  while (true) {
    try {
      const response = await rettiwt.user.request(EResourceType.USER_TIMELINE, {
        id: userId,
        count: 20,
        cursor: cursor
      });
      const timelineData = new CursoredData(response, 'Tweet');

      // Deduplicate based on tweet ID only
      const uniqueTweets = timelineData.list.filter(tweet => !seenTweetIds.has(tweet.id));
      uniqueTweets.forEach(tweet => seenTweetIds.add(tweet.id));
      allTweets = allTweets.concat(uniqueTweets);

      // Log BEFORE a potential file rollover, so the count is meaningful
      // (the original logged after resetting allTweets to []).
      console.log(`Fetched ${allTweets.length} tweets so far (current file)...`);

      // Roll over to a new file once the batch is full.
      if (allTweets.length >= tweetsPerFile) {
        sortTweetsByDate(allTweets);
        saveTweetsToFile(allTweets, fileIndex);
        allTweets = [];
        fileIndex++;
        seenTweetIds = new Set();
      }

      if (!(timelineData.next && timelineData.next.value)) {
        // No further pages: flush whatever is left and finish.
        if (allTweets.length > 0) {
          saveTweetsToFile(allTweets, fileIndex);
        }
        return allTweets;
      }

      cursor = timelineData.next.value;
      await new Promise(resolve => setTimeout(resolve, delay));

    } catch (error) {
      const status = error.response && error.response.status;

      if (error instanceof RateLimitError || status === 429 || status === 503) {
        let waitMs;
        if (error instanceof RateLimitError) {
          console.warn(error.message);
          waitMs = error.ms;
        } else {
          // 15 minutes by default, 5 minutes for 503 errors.
          waitMs = status === 503 ? 5 * 60 * 1000 : 15 * 60 * 1000;
          console.warn(`Rate limit or Service Unavailable. Retrying after ${waitMs / (1000 * 60)} minutes...`);
        }
        await new Promise(resolve => setTimeout(resolve, waitMs));

        // BUG FIX: the original reset `cursor` to the last collected
        // tweet's *id* here. A tweet id is not a timeline cursor, so the
        // next request silently restarted from the newest tweet — which
        // is exactly the reported symptom of new files starting again
        // from the current day. Retry with the SAME cursor that failed.
        saveTweetsToFile(allTweets, fileIndex); // checkpoint progress
        continue;
      }

      console.error('Unhandled Error fetching tweets:', error);
      return allTweets;
    }
  }
}

// Orders a tweet array in place, newest first, using each tweet's
// `createdAt` timestamp.
function sortTweetsByDate(tweets) {
  const timeOf = (tweet) => new Date(tweet.createdAt).getTime();
  tweets.sort((first, second) => timeOf(second) - timeOf(first));
}

// Writes one batch of tweets to `tweets_<userId>_<fileIndex>.json`
// (pretty-printed, synchronous write) and logs the destination.
function saveTweetsToFile(tweets, fileIndex) {
  const path = `tweets_${targetUserId}_${fileIndex}.json`;
  const payload = JSON.stringify(tweets, null, 2);
  fs.writeFileSync(path, payload);
  console.log('Tweets saved to file:', path);
}

/**
 * Loads previously-saved tweet files (tweets_<userId>_<n>.json) so an
 * interrupted run can resume where it left off.
 *
 * Scans files in index order. A file holding fewer than `tweetsPerFile`
 * tweets is treated as the partially-filled file of an interrupted run:
 * its tweets, its index, and the id of its last tweet (as the resume
 * cursor) are returned immediately.
 *
 * @returns {{tweets: Object[], fileIndex: number, cursor: string|undefined}}
 */
function loadTweetsFromFile() {
  let allTweets = [];
  let fileIndex = 0;

  while (true) {
    const filename = `tweets_${targetUserId}_${fileIndex}.json`;
    if (!fs.existsSync(filename)) {
      break;
    }

    const tweetsInFile = JSON.parse(fs.readFileSync(filename, 'utf-8'));

    // A short file marks the point where the previous run stopped.
    if (tweetsInFile.length < tweetsPerFile) {
      // BUG FIX: the message used "$(unknown)" instead of ${filename}.
      // BUG FIX: the original also did `tweetsInFile.forEach(t => seenIds.add(t.id))`
      // here, but `seenIds` is a `const` declared *after* this function is
      // first called — a temporal-dead-zone ReferenceError whenever an
      // incomplete file exists. The caller already repopulates seenIds
      // from the returned tweets, so that mutation is dropped.
      console.warn(`File ${filename} does not have the expected number of tweets. Resuming from this file.`);
      return {
        tweets: tweetsInFile,
        fileIndex: fileIndex,
        // Guard against an empty partial file (the original indexed
        // [length - 1] unconditionally and would throw on []).
        // NOTE(review): this is the last tweet's *id*, not a timeline
        // cursor — verify rettiwt pagination actually accepts it.
        cursor: tweetsInFile.length > 0
          ? tweetsInFile[tweetsInFile.length - 1].id
          : undefined
      };
    }

    console.log('Loading saved tweets from file:', filename);
    allTweets = allTweets.concat(tweetsInFile);
    fileIndex++;
  }

  // Every existing file was full: resume fetching into a fresh file
  // with no cursor (same as the original's undefined cursor here).
  return {
    tweets: allTweets,
    fileIndex: fileIndex,
    cursor: undefined
  };
}

// Start fetching tweets
// Resume state: previously saved JSON files determine the starting
// batch, file index, and cursor.
// NOTE(review): verify that loadTweetsFromFile does not touch `seenIds` —
// it is declared with `const` only below, so any reference to it inside
// that call would hit the temporal dead zone and throw.
const loadedData = loadTweetsFromFile();
const savedTweets = loadedData.tweets;
const startingFileIndex = loadedData.fileIndex;
const startingCursor = loadedData.cursor;
const seenIds = new Set(); 

// Populate seenTweetIds from loadedTweets
savedTweets.forEach(tweet => seenIds.add(tweet.id));

// Kick off the (resumable) fetch; any error that escapes getAllTweets
// is logged here rather than crashing the process with an unhandled
// rejection.
getAllTweets(targetUserId, startingCursor, savedTweets, 5000, startingFileIndex, seenIds) 
  .then(tweets => {
    console.log(`Fetched a total of ${tweets.length} tweets from user ${targetUserId}.`);
  })
  .catch(error => console.error('Error:', error));

I have been racking my brain with this for a week now, trying every workaround I could think of, so any insight would be more than welcome! :)

Rishikant181 commented 3 months ago

USER_TWEETS endpoint has a limit as to how far back you can go, since it scrapes from the user's page. For your use case, the recommended endpoint will be TWEET_SEARCH, which lets you search all tweets from a user (or using any other filter). It also lets you search by date ranges. The TWEET_SEARCH endpoint allows access to all tweets, even the oldest ones.

These are all the search filters TWEET_SEARCH supports

So when the first json file is filled, the script gets the id of the last saved tweet and creates the next json file to keep saving from the next tweet

In your case, you can use the fromUsers field combined with sinceId to get the necessary results.

StamosArhon commented 3 months ago

Thank you very much for the suggestion. I did try to use the TWEET_SEARCH endpoint and created a sample script to just check if it was fetching tweets from a given date range:

const { Rettiwt } = require('rettiwt-api');
require('dotenv').config(); // Load environment variables from .env

/**
 * Demo: fetches all tweets from one account within a date range using the
 * TWEET_SEARCH endpoint, following the pagination cursor until exhausted.
 *
 * NOTE(review): per the maintainer's reply below, `fromUsers` expects
 * *usernames*, not numeric user ids — passing an id (as here) matches
 * nothing, which is why every batch in the pasted log comes back empty.
 */
async function fetchTweetsFromDateRange() {
  try {
    console.log("Starting tweet fetch process...");

    const rettiwt = new Rettiwt({
      apiKey: process.env.API_KEY,
      logging: true // Enable rettiwt-api's internal logging
    });

    const startDate = new Date('2022-01-01T00:00:00.000Z');
    const endDate = new Date('2022-12-31T23:59:59.000Z');

    console.log("Start Date:", startDate);
    console.log("End Date:", endDate);

    const filter = {
      fromUsers: ['579067205'], // TODO: use the account's *username* here (see note above)
      startDate: startDate,    // Start date for filtering
      endDate: endDate,        // End date for filtering
    };

    let cursor;
    let allTweets = []; // Array to store all tweets
    let emptyBatches = 0; // consecutive pages with no results

    do {
      console.log("Fetching tweets with cursor:", cursor);

      const tweets = await rettiwt.tweet.search(filter, 20, cursor);

      console.log("Tweets object received:", tweets);

      if (tweets && tweets.list && tweets.list.length > 0) {
        emptyBatches = 0;
        allTweets = allTweets.concat(tweets.list);
        console.log(`Fetched ${tweets.list.length} tweets in this batch. Total: ${allTweets.length}`);
      } else {
        // BUG FIX: the API can keep returning an empty list together with
        // a fresh cursor forever (exactly what the pasted log shows), so
        // the original do…while(cursor) loop never terminated. Bail out
        // after a few consecutive empty pages.
        emptyBatches++;
        console.warn(`Warning: empty batch (${emptyBatches} in a row).`);
        if (emptyBatches >= 3) {
          console.warn("Too many consecutive empty batches; stopping.");
          break;
        }
      }

      cursor = tweets.next ? tweets.next.value : undefined;

      await new Promise(resolve => setTimeout(resolve, 1000)); // Wait for 1 second (adjust if needed)

    } while (cursor);

    console.log(`Final tweet count: ${allTweets.length}`);
    console.log("All fetched tweets:", allTweets);

  } catch (err) {
    console.error("Error fetching tweets:", err);
    if (err.response) {
      console.error("API Response Status:", err.response.status);
      console.error("API Response Data:", err.response.data);
    }
  }
}

fetchTweetsFromDateRange();

This is the log I get from it:

Starting tweet fetch process...
Start Date: 2022-01-01T00:00:00.000Z
End Date: 2022-12-31T23:59:59.000Z
Fetching tweets with cursor: undefined
[Rettiwt-API] [2024-07-28T11:12:13.307Z] [REQUEST] {"resource":"TWEET_SEARCH","args":{"filter":{"fromUsers":["579067205"],"startDate":"2022-01-01T00:00:00.000Z","endDate":"2022-12-31T23:59:59.000Z"},"count":20}}
[Rettiwt-API] [2024-07-28T11:12:13.307Z] [AUTHORIZATION] {"authenticated":true}
[Rettiwt-API] [2024-07-28T11:12:13.307Z] [VALIDATE] {"target":"FETCH_ARGS"}
[Rettiwt-API] [2024-07-28T11:12:13.309Z] [GET] {"target":"HTTPS_AGENT"}
[Rettiwt-API] [2024-07-28T11:12:13.310Z] [GET] {"target":"USER_CREDENTIAL"}
Tweets object received: CursoredData {
  list: [],
  next: Cursor {
    value: 'DAADDAABAAgAAgAAAAIIAAMAAAAACAAEAAAAAAoABRk5H1-wgCcQCgAGGTkfX7B_2PAAAA'
  }
}
Fetched 0 tweets in this batch. Total: 0
Fetching tweets with cursor: DAADDAABAAgAAgAAAAIIAAMAAAAACAAEAAAAAAoABRk5H1-wgCcQCgAGGTkfX7B_2PAAAA
[Rettiwt-API] [2024-07-28T11:12:14.675Z] [REQUEST] {"resource":"TWEET_SEARCH","args":{"filter":{"fromUsers":["579067205"],"startDate":"2022-01-01T00:00:00.000Z","endDate":"2022-12-31T23:59:59.000Z"},"count":20,"cursor":"DAADDAABAAgAAgAAAAIIAAMAAAAACAAEAAAAAAoABRk5H1-wgCcQCgAGGTkfX7B_2PAAAA"}}
[Rettiwt-API] [2024-07-28T11:12:14.675Z] [AUTHORIZATION] {"authenticated":true}
[Rettiwt-API] [2024-07-28T11:12:14.675Z] [VALIDATE] {"target":"FETCH_ARGS"}
[Rettiwt-API] [2024-07-28T11:12:14.676Z] [GET] {"target":"HTTPS_AGENT"}
[Rettiwt-API] [2024-07-28T11:12:14.676Z] [GET] {"target":"USER_CREDENTIAL"}
Tweets object received: CursoredData {
  list: [],
  next: Cursor {
    value: 'DAADDAABAAgAAgAAAAIIAAMAAAAACAAEAAAAAQoABRk5H1-wgCcQCgAGGTkfX7B_seAAAA'
  }
}
Fetched 0 tweets in this batch. Total: 0

Again, I tried a bunch of variations but I can't seem to get tweets from a given date, much less being able to get them all. I don't know if I am making some obvious mistake, so I'm sorry if it's something trivial that I am not seeing, but in any case thanks for taking the time to help.

Rishikant181 commented 3 months ago

fromUsers: ['579067205']

Here, it should be username and not id