Russell-Newton / TikTokPy

Extract data from TikTok without needing any login information or API keys.
https://pypi.org/project/tiktokapipy/
MIT License
214 stars 28 forks source link

[QUESTION] My script runs slow — how can I improve it? #69

Closed Kokoabassplayer closed 1 year ago

Kokoabassplayer commented 1 year ago

Ask your question Explain what you would like to do/know. Specific questions are likely to be answered sooner than vague ones.

{My script} runs slow when this {warning} happens. I manually checked the {post} and it said "Video currently unavailable". I have tried my best to improve it, but I can't.

What do I need to do to make my script run faster and not get stuck on this type of warning?

my script=

#pip install pytz
#pip install tqdm
#pip install tiktokapipy
#pip install tiktokapipy --upgrade
#python -m playwright install

from tiktokapipy.api import TikTokAPI
from tqdm import tqdm
from datetime import datetime, timedelta
import csv
import concurrent.futures
import time
import threading

def get_video_stat(username, start_date, end_date):
    """Collect per-video statistics for a TikTok user within a date range.

    Fetches the user's videos via TikTokPy, keeps only those created between
    start_date and end_date (inclusive), gathers stats for each on a thread
    pool under a client-side rate limit, and writes the results to a CSV file
    named "{username}_{start_date}_{end_date}.csv".

    Args:
        username: TikTok handle (without the leading "@").
        start_date: Inclusive range start in "YYYY-MM-DD" format.
        end_date: Inclusive range end in "YYYY-MM-DD" format.
    """
    RATE_LIMIT = 550
    RATE_LIMIT_INTERVAL = 60  # seconds
    SAFETY_MARGIN = 0.9  # stay at 90% of the assumed rate limit
    ADJUSTED_RATE_LIMIT = int(RATE_LIMIT * SAFETY_MARGIN)
    requests_made = 0
    last_request_time = datetime.now()
    start_date_timestamp = datetime.strptime(start_date, "%Y-%m-%d").timestamp()
    end_date_timestamp = datetime.strptime(end_date, "%Y-%m-%d").timestamp()

    # Define the lock before the worker that uses it, so the dependency is
    # visible reading top to bottom (the original defined it after the def,
    # which only worked because closures resolve names at call time).
    rate_limit_lock = threading.Lock()

    print("Initializing...")
    with TikTokAPI() as api:
        print(f"Fetching user info for {username}...")
        user_info = api.user(username)

        profile_name = username

        # Pre-filter videos based on date range.  NOTE(review): iterating
        # user_info.videos fetches every video the account has; presumably
        # slow for large accounts — see the cursor-based approach suggested
        # later in this thread.
        videos_list = [video for video in user_info.videos if start_date_timestamp <= video.create_time.timestamp() <= end_date_timestamp]

        print(f"Number of videos to process: {len(videos_list)}")

        total_videos = len(videos_list)
        print(f"Processing {total_videos} videos...")
        csv_filename = f"{profile_name}_{start_date}_{end_date}.csv"

        video_stats_list = []
        errors = []

        def process_video(video):
            """Apply rate limiting, then build a flat stats dict for one video.

            Returns None (and records a message in ``errors``) when any
            attribute access fails, e.g. for unavailable/private videos.
            """
            nonlocal requests_made, last_request_time
            try:
                # Client-side rate limiting shared across worker threads.
                with rate_limit_lock:
                    elapsed_time = (datetime.now() - last_request_time).total_seconds()
                    if requests_made >= ADJUSTED_RATE_LIMIT and elapsed_time < RATE_LIMIT_INTERVAL:
                        time.sleep(RATE_LIMIT_INTERVAL - elapsed_time)
                        requests_made = 0
                        # BUG FIX: the sleep has already elapsed, so the new
                        # rate window starts *now*.  The original added the
                        # sleep duration on top of datetime.now(), pushing
                        # last_request_time into the future and over-throttling
                        # every subsequent batch.
                        last_request_time = datetime.now()
                    else:
                        time.sleep(0.1)  # Introducing a small delay for safety
                    requests_made += 1

                stats = video.stats
                interactions = stats.comment_count + stats.digg_count + stats.share_count
                return {
                    'user_id': user_info.id,
                    'username': profile_name,
                    'nickname': user_info.nickname,
                    'private_account': user_info.private_account,
                    'verified_account': user_info.verified,
                    'description': video.desc,
                    'tags': ", ".join(video.diversification_labels) if video.diversification_labels else None,
                    'challenges': ", ".join([challenge.title for challenge in video.challenges]) if video.challenges else None,
                    'video_id': video.id,
                    'video_url': video.url,
                    'create_time': video.create_time.replace(tzinfo=None),
                    'music_title': video.music.title if video.music else None,
                    'num_comments': stats.comment_count,
                    'num_likes': stats.digg_count,
                    'num_views': stats.play_count,
                    'num_shares': stats.share_count,
                    'engagement_rate': (interactions / stats.play_count) * 100 if stats.play_count > 0 else 0,
                    'image_post': str(video.image_post) if video.image_post else None
                }
            except Exception as e:
                errors.append(f"Error processing video with ID {video.id}: {e}")
                return None

        start_time = time.time()  # Start time for rate calculation

        # BUG FIX: use the executor as a context manager so its worker threads
        # are joined and released; the original constructed the pool inline
        # and never shut it down.
        with concurrent.futures.ThreadPoolExecutor() as executor, \
                tqdm(total=total_videos, desc="Processing videos") as pbar:
            for stats in executor.map(process_video, videos_list):
                # Skip failed videos up front instead of appending None and
                # filtering the list afterwards.
                if stats is not None:
                    video_stats_list.append(stats)
                elapsed = time.time() - start_time
                script_rate = (requests_made / elapsed) * 60 if elapsed > 0 else 0.0  # requests per minute
                remaining_requests = RATE_LIMIT - script_rate
                pbar.set_postfix(Remaining_Requests=f"{remaining_requests:.2f} req/min", Script_Rate=f"{script_rate:.2f} req/min", refresh=True)
                pbar.update(1)

        with open(csv_filename, mode='w', newline='', encoding='utf-8-sig') as file:
            fieldnames = ['user_id', 'username', 'nickname', 'private_account', 'verified_account', 'description', 'tags', 'challenges', 'video_id', 'video_url', 'create_time', 'music_title', 'num_comments', 'num_likes', 'num_views', 'num_shares', 'engagement_rate', 'image_post']
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(video_stats_list)

        print(f"Saved data to {csv_filename}")
        for error in errors:
            print(error)

# Example usage: pull one day's worth of posts for @thestandardth.
username = 'thestandardth'
start_date, end_date = '2023-07-01', '2023-07-02'
get_video_stat(username, start_date, end_date)

warning=""" C:\Users\nuttapong.but\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\tiktokapipy\util\deferred_collectors.py:62: TikTokAPIWarning: Unable to grab video with id 7153277589048610075
self._fetch_sync() """

post="""https://www.tiktok.com/@/video/7153277589048610075"""

Version Information Please include what versions of pydantic, playwright, and tiktokapipy you have installed (can be found with pip freeze).

pydantic==2.1.1 pydantic_core==2.4.0 playwright==1.36.0 tiktokapipy==0.2.3

System Information Please provide information about the system you're running the library on. What's the OS? Is it a desktop, server, VM, or container?

Windows 10, HP laptop

Processor 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz 2.80 GHz Installed RAM 16.0 GB (15.8 GB usable) Device ID C5D971F5-E166-4233-B1BB-1C35DE90741D Product ID 00330-80023-38617-AA510 System type 64-bit operating system, x64-based processor Pen and touch Touch support with 10 touch points

Region Information Where in the world are you located? If you're not in the US, it's likely that TikTok treats your region differently, which may present some challenges.

Thailand

Additional context Add any other context or screenshots about the question here.

terminal image: image

problematic post image: image

Russell-Newton commented 1 year ago

The slowest part of your code is collecting the videos_list. The user.videos iterator fetches all of a user's video data from the API, which is inherently slow. If all you want is a date range, you can take advantage of the fact that user posts are served most recent first:

videos_list = []

# Start collecting videos created before end_date_timestamp (you may want to
# convert this to a UTC timestamp).
user.videos._cursor = int(end_date_timestamp * 1000)  # TikTok's API expects timestamps in milliseconds

for video in user.videos:
    # Stop collecting once you've reached videos older than start_date_timestamp.
    # BUG FIX: datetime.timestamp is a method and must be *called* — comparing
    # the bound method itself against a float raises TypeError at runtime.
    if video.create_time.timestamp() < start_date_timestamp:
        break
    videos_list.append(video)

This will keep you from collecting every single video.

As another note, the warnings you see about unavailable videos are bound to happen. The API provides some information about private videos, so TikTokPy will try to collect information about them and spits out a warning when it realizes it's private. You can suppress all warnings of category TikTokAPIWarning if you don't want to see them.

Kokoabassplayer commented 1 year ago

Perfect! Thank you very much for the solution — I appreciate your work.

here is my final working code:

### doc: https://tiktokpy.readthedocs.io/en/latest/users/usage.html#examples
### issues: รันช้าถ้ามีโพสที่ลบไปหรือซ่อนอยู่ เปิดทิคเก็ตไปแล้ว รอเค้าตอบ https://github.com/Russell-Newton/TikTokPy/issues/69
### tiktok มี rate limit ด้วยว่าห้ามดึงเกินเท่าไหร่ https://developers.tiktok.com/doc/tiktok-api-v2-rate-limit?enter_method=left_navigation

### install ให้ครบด้วยนะครับ
#pip install pytz
#pip install tqdm
#pip install tiktokapipy
#pip install tiktokapipy --upgrade
#python -m playwright install

from tiktokapipy.api import TikTokAPI
from tqdm import tqdm
from datetime import datetime, timedelta
import csv
import time
from time import sleep  # Added missing import

def get_video_stat(username, start_date, end_date):
    """Collect per-video statistics for a TikTok user within a date range.

    Uses the cursor trick suggested by the maintainer: user posts are served
    most-recent-first, so seeding the iterator's cursor at end_date and
    stopping at the first video older than start_date avoids fetching the
    account's entire history.  Results are written to
    "{username}_{start_date}_{end_date}.csv" and the total runtime is printed.

    Args:
        username: TikTok handle (without the leading "@").
        start_date: Inclusive range start in "YYYY-MM-DD" format.
        end_date: Inclusive range end in "YYYY-MM-DD" format.
    """
    # Record the start time for rate display and the final runtime report.
    overall_start_time = time.time()
    RATE_LIMIT = 550
    RATE_LIMIT_INTERVAL = 60  # seconds
    SAFETY_MARGIN = 0.9  # stay at 90% of the assumed rate limit
    ADJUSTED_RATE_LIMIT = int(RATE_LIMIT * SAFETY_MARGIN)
    requests_made = 0
    last_request_time = datetime.now()
    start_date_timestamp = datetime.strptime(start_date, "%Y-%m-%d").timestamp()
    end_date_timestamp = datetime.strptime(end_date, "%Y-%m-%d").timestamp()

    print("Initializing...")

    with TikTokAPI() as api:
        print(f"Fetching user info for {username}...")
        user_info = api.user(username)
        profile_name = username

        # Optimized video collection: seed the iterator's cursor so fetching
        # starts at end_date instead of at the account's newest post.
        videos_list = []
        user_info.videos._cursor = int(end_date_timestamp * 1000)  # TikTok's API expects timestamps in milliseconds
        for video in user_info.videos:
            if video.create_time.timestamp() < start_date_timestamp:
                break
            videos_list.append(video)

        print(f"Number of videos to process: {len(videos_list)}")

        total_videos = len(videos_list)
        print(f"Processing {total_videos} videos...")
        csv_filename = f"{profile_name}_{start_date}_{end_date}.csv"

        video_stats_list = []
        errors = []

        with tqdm(total=total_videos, desc="Processing videos") as pbar:
            for video in videos_list:
                try:
                    # Client-side rate limiting.
                    current_time = datetime.now()
                    elapsed_time = (current_time - last_request_time).total_seconds()
                    if requests_made >= ADJUSTED_RATE_LIMIT and elapsed_time < RATE_LIMIT_INTERVAL:
                        time.sleep(RATE_LIMIT_INTERVAL - elapsed_time)
                        requests_made = 0
                        # BUG FIX: the sleep has already elapsed, so the new
                        # rate window starts *now*.  The original added the
                        # sleep duration on top of datetime.now(), pushing
                        # last_request_time into the future and over-throttling
                        # every subsequent batch.
                        last_request_time = datetime.now()
                    else:
                        time.sleep(0.1)  # Introducing a small delay for safety
                    requests_made += 1

                    stats = video.stats
                    interactions = stats.comment_count + stats.digg_count + stats.share_count
                    video_stats = {
                        'user_id': user_info.id,
                        'username': profile_name,
                        'nickname': user_info.nickname,
                        'private_account': user_info.private_account,
                        'verified_account': user_info.verified,
                        'description': video.desc,
                        'tags': ", ".join(video.diversification_labels) if video.diversification_labels else None,
                        'challenges': ", ".join([challenge.title for challenge in video.challenges]) if video.challenges else None,
                        'video_id': video.id,
                        'video_url': video.url,
                        'create_time': video.create_time.replace(tzinfo=None),
                        'music_title': video.music.title if video.music else None,
                        'num_comments': stats.comment_count,
                        'num_likes': stats.digg_count,
                        'num_views': stats.play_count,
                        'num_shares': stats.share_count,
                        'engagement_rate': (interactions / stats.play_count) * 100 if stats.play_count > 0 else 0,
                        'image_post': str(video.image_post) if video.image_post else None
                    }
                    video_stats_list.append(video_stats)

                    # Update tqdm postfix with the observed request rate.
                    elapsed_since_start = time.time() - overall_start_time
                    script_rate = (requests_made / elapsed_since_start) * 60 if elapsed_since_start > 0 else 0.0  # req/min
                    remaining_requests = RATE_LIMIT - script_rate
                    pbar.set_postfix(Remaining_Requests=f"{remaining_requests:.2f} req/min", Script_Rate=f"{script_rate:.2f} req/min", refresh=True)
                    pbar.update(1)
                except Exception as e:
                    errors.append(f"Error processing video with ID {video.id}: {e}")

        # NOTE: failed videos are never appended, so no None-filter pass is
        # needed here (the original filtered a list that could not contain
        # None).

        with open(csv_filename, mode='w', newline='', encoding='utf-8-sig') as file:
            fieldnames = ['user_id', 'username', 'nickname', 'private_account', 'verified_account', 'description', 'tags', 'challenges', 'video_id', 'video_url', 'create_time', 'music_title', 'num_comments', 'num_likes', 'num_views', 'num_shares', 'engagement_rate', 'image_post']
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(video_stats_list)

        # Calculate and report the total elapsed runtime.
        overall_elapsed_time = time.time() - overall_start_time
        minutes, seconds = divmod(overall_elapsed_time, 60)

        print(f"Saved data to {csv_filename}")
        for error in errors:
            print(error)

        print(f"Total time of running: {int(minutes)} minutes and {int(seconds)} seconds")

# Example usage.  Other handles tried: thestandardth, ondemandacademy,
# kokoabassplayer0.
username = 'kokoabassplayer0'
start_date, end_date = '2021-01-01', '2023-08-02'
get_video_stat(username, start_date, end_date)