Open ttx99 opened 9 months ago
Yeah, it looks like the actual URL you're trying errors out — I got 502'd and then 404'd trying to access it. Very strange that fmoviesz has it; I'll investigate. Thanks for the report!
As far as I can tell vidsrc.to is allowing other sites to host their content with them without actually indexing the movies, this particular movie doesn't have any link to any IMDB/TMDB code and as such cannot be retrieved by this scraper. I have written a small scraper for this site, I'll maybe add support for it in future.
import os
import re
import json
import httpx
import base64
import asyncio
import questionary
import urllib.parse
from bs4 import BeautifulSoup
from typing import Optional, Dict
class VRF:
    """Builds the "vrf" request tokens that fmoviesz/vidsrc ajax endpoints
    require: RC4 under a site key, two rounds of url-safe base64 interleaved
    with rot13 and a fixed per-position byte shift, then url-quoting.
    """

    # RC4 key for vrf tokens sent to the fmoviesz ajax endpoints.
    DEFAULT_ENCRYPTION_KEY = b"FWsfu0KQd9vxYGNB"
    # RC4 key used to decode the provider (vidplay) playback url.
    VIDPLAY_ENCRYPTION_KEY = b"8z5Ag5wgagfsOuhz"

    @staticmethod
    def rc4(key: bytes, data: "str | bytes | bytearray") -> bytearray:
        """RC4-encrypt *data* with *key* and return the raw ciphertext.

        RC4 is its own inverse, so applying this twice with the same key
        yields the original data.  Fix: the previous annotation claimed
        ``data: str`` only, but the body explicitly handles int elements
        (bytes-like input) and ``decode_playback_url`` passes raw bytes.
        Text input must consist of chars below U+0100 to fit a byte.
        """
        # Key-scheduling algorithm (KSA).
        s = bytearray(range(256))
        j = 0
        for i in range(256):
            j = (j + s[i] + key[i % len(key)]) & 0xff
            s[i], s[j] = s[j], s[i]
        # Pseudo-random generation algorithm (PRGA) + xor keystream.
        encrypted = bytearray(len(data))
        i = 0
        k = 0
        for index in range(len(data)):
            i = (i + 1) & 0xff
            k = (k + s[i]) & 0xff
            s[i], s[k] = s[k], s[i]
            t = (s[i] + s[k]) & 0xff
            if isinstance(data[index], str):
                # str indexing yields 1-char strings; xor the code point.
                encrypted[index] = ord(data[index]) ^ s[t]
            elif isinstance(data[index], int):
                # bytes/bytearray indexing yields ints directly.
                encrypted[index] = data[index] ^ s[t]
        return encrypted

    @staticmethod
    def encrypt(data: str) -> str:
        """Turn *data* (e.g. a media id) into a url-quoted vrf token."""

        def rot13(vrf):
            # Classic rot13 over ASCII letters only; other bytes untouched.
            vrf = bytearray(vrf)
            for i in range(len(vrf)):
                byte = vrf[i]
                if 65 <= byte <= 90:      # 'A'-'Z'
                    vrf[i] = ((byte - 65 + 13) % 26 + 65)
                elif 97 <= byte <= 122:   # 'a'-'z'
                    vrf[i] = ((byte - 97 + 13) % 26 + 97)
            return vrf

        def vrf_shift(vrf):
            # Site-specific obfuscation: repeating 5-step additive offsets.
            vrf = bytearray(vrf)
            shifts = [-4, -2, -6, 5, -2]
            for i in range(len(vrf)):
                shift = shifts[i % 5]
                vrf[i] = (vrf[i] + shift) & 0xFF
            return vrf

        vrf = VRF.rc4(VRF.DEFAULT_ENCRYPTION_KEY, data)
        vrf = base64.urlsafe_b64encode(vrf)
        vrf = rot13(vrf)
        vrf = base64.urlsafe_b64encode(vrf)
        vrf = vrf_shift(vrf)
        return urllib.parse.quote(vrf.decode("utf-8"))
class Fmoviesz:
    """Scrapes fmoviesz.to, resolving a search query or media-page url to an
    encrypted provider playback url (Vidplay is the only provider wired up
    by the script's entry point).

    Requires a valid Cloudflare clearance: a ``cookie.json`` file next to
    the script containing ``{"cf_clearance": "..."}`` captured from a
    browser session that used the same user agent as ``USER_AGENT``.
    """

    BASE_URL = "https://fmoviesz.to"
    # NOTE(review): cf_clearance is bound to the UA it was issued for --
    # keep this in sync with the browser the cookie came from.
    USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0"

    def __init__(self, **kwargs) -> None:
        """Accept optional ``source``, ``query`` and ``url`` keyword presets
        so ``resolve`` can run without interactive prompts.

        Raises AssertionError when ``cookie.json`` is missing and KeyError
        when it lacks a ``cf_clearance`` entry.
        """
        self.source = kwargs.pop("source", None)
        self.query = kwargs.pop("query", None)
        self.url = kwargs.pop("url", None)
        assert os.path.exists("cookie.json"), "Please create a cookie.json file and manually input your cf_clearance cookie from your browsers localstorage\ne.g {\"cf_clearance\": \"YOUR_CLOUDFLARE_CLEARANCE\"}"
        with open("cookie.json", "r", encoding="utf-8") as f:
            cookies = json.load(f)
        self.default_cookies = {
            "cf_clearance": cookies.pop("cf_clearance")
        }
        self.default_headers = {
            "User-Agent": Fmoviesz.USER_AGENT,
            # Fix: the HTTP header is spelled "Referer" (single r); the old
            # "Referrer" key was an unknown header the server ignored.
            "Referer": Fmoviesz.BASE_URL,
        }

    @staticmethod
    def dump_cloudflare_cookies(cookies: httpx.Cookies) -> None:
        """Persist the client's cookie jar back to cookie.json so a
        refreshed cf_clearance survives between runs."""
        with open("cookie.json", "w", encoding="utf-8") as f:
            json.dump(dict(cookies), f, indent=2)

    async def test_cloudflare(self, response_text: Optional[str] = None) -> bool:
        """Return True when *response_text* is not a Cloudflare challenge
        page, fetching the landing page first when no text is supplied."""
        if not response_text:
            response_text = (await self.client.get(Fmoviesz.BASE_URL)).text
        # Fix: Cloudflare's interstitial titles itself "Just a moment...";
        # the old case-sensitive lowercase check could never match it.
        return "just a moment" not in response_text.lower()

    async def resolve_query(self, query: str) -> Dict:
        """Search for *query*; return {display name: media page url}."""
        media_hits = {}
        req = await self.client.get(f"{Fmoviesz.BASE_URL}/filter?{urllib.parse.urlencode({'keyword': query})}")
        valid = await self.test_cloudflare(req.text)
        if req.status_code != 200 or not valid:
            raise ValueError("Bad Response when attempting to query fmoviez (Bad status code or cloudflare clearance has expired)")
        soup = BeautifulSoup(req.text, "html.parser")
        for media in soup.find_all("div", {"class": "meta"}):
            media_data = media.find("a")
            if not media_data:
                continue
            media_info = media.find("div")
            if not media_info:
                continue
            media_metadata = media_info.find_all("span")
            # Fix: both spans are interpolated below; fewer than two would
            # previously raise IndexError mid-scrape.
            if len(media_metadata) < 2:
                continue
            # Disambiguate identically named hits with an incrementing [i].
            i = 0
            while True:
                media_name = f"{media_metadata[1].text} - {media_data.text} ({media_metadata[0].text}) [{i}]"
                if media_name in media_hits:
                    i += 1
                else:
                    break
            media_hits[media_name] = f"{Fmoviesz.BASE_URL}{media_data.get('href')}"
        return media_hits

    async def fetch_source_id(self, data_id: str) -> str:
        """Exchange a media data-id for the id of its source/episode list."""
        req = await self.client.get(f"{Fmoviesz.BASE_URL}/ajax/episode/list/{data_id}", params={"vrf": VRF.encrypt(data_id)})
        # The ajax payload is json-escaped html, hence the escaped quotes.
        match = re.search(r"data-id=\\\"(\d+)\\\"", req.text)
        # Fix: fail with a descriptive error (consistent with fetch_media)
        # instead of an AttributeError on .group(1).
        if not match:
            raise ValueError("Could not match data-id in episode list response")
        return match.group(1)

    async def fetch_sources(self, source_id: str) -> Dict:
        """Return {server name: link id} for every server of *source_id*."""
        req = await self.client.get(f"{Fmoviesz.BASE_URL}/ajax/server/list/{source_id}", params={"vrf": VRF.encrypt(source_id)})
        return {name: id for id, name in re.findall(r"data-link-id=\\\"(\d+)\\\">\s+<span>(\w+)", req.text)}

    async def fetch_playback_data(self, file_id: str) -> Dict:
        """Fetch the playback descriptor (holds the encoded provider url)."""
        req = await self.client.get(f"{Fmoviesz.BASE_URL}/ajax/server/{file_id}", params={"vrf": VRF.encrypt(file_id)})
        return req.json().get("result")

    async def decode_playback_url(self, encoded: str) -> str:
        """Decode the url-safe-base64 + RC4 obfuscated provider url."""
        standardized_input = encoded.replace("_", "/").replace("-", "+")
        # Fix: tolerate stripped base64 padding (no-op on padded input).
        standardized_input += "=" * (-len(standardized_input) % 4)
        b64_data = base64.b64decode(standardized_input)
        decoded = VRF.rc4(VRF.VIDPLAY_ENCRYPTION_KEY, b64_data)
        return urllib.parse.unquote(decoded.decode("utf-8"))

    async def fetch_media(self, url: str) -> str:
        """Scrape a media page, pick a server, return its playback url."""
        req = await self.client.get(url)
        valid = await self.test_cloudflare(req.text)
        if req.status_code != 200 or not valid:
            raise ValueError("Bad response when attempting to scrape media page (Bad status code or cloudflare clearance has expired)")
        data_id_match = re.search(r"data-id=\"(\d+)\"", req.text)
        if not data_id_match:
            raise ValueError("Could not match data-id")
        data_id = data_id_match.group(1)
        source_id = await self.fetch_source_id(data_id)
        sources = await self.fetch_sources(source_id)
        selection = self.source or await questionary.select("Select the media you want to watch", choices=sources).unsafe_ask_async()
        source = sources.get(selection)
        # Fix: a preset self.source not offered by this page previously fell
        # through as None and crashed deeper in the request chain.
        if source is None:
            raise ValueError(f"Source {selection!r} not available, choices: {list(sources)}")
        playback_data = await self.fetch_playback_data(source)
        playback_url = await self.decode_playback_url(playback_data.get("url"))
        return playback_url

    async def resolve(self, url: Optional[str] = None, query: Optional[str] = None) -> str:
        """Entry point: resolve a url (or, interactively, a query) to a
        playback url.

        Arguments fall back to the presets given to ``__init__``.  The
        cookie jar is written back to cookie.json even when scraping fails,
        so a rotated cf_clearance is never lost.
        """
        assert url or query or self.url or self.query, "Please pass a url or query."
        async with httpx.AsyncClient() as self.client:
            try:
                self.client.cookies.update(self.default_cookies)
                self.client.headers.update(self.default_headers)
                query = query or self.query
                url = url or self.url
                valid = await self.test_cloudflare()
                assert valid, "cf_clearance is invalid or impl is broken, please check your clearance hasnt changed"
                if not url and query:
                    media_hits = await self.resolve_query(query)
                    selection = await questionary.select("Select the media you want to watch", choices=media_hits).unsafe_ask_async()
                    url = media_hits.get(selection)
                return await self.fetch_media(url)
            finally:
                self.dump_cloudflare_cookies(self.client.cookies)
if __name__ == "__main__":
    # Vidplay is the only provider implemented so far.
    scraper = Fmoviesz(source="Vidplay", query="Thor: The Dark World")
    playback_url = asyncio.run(scraper.resolve())
    # See https://github.com/Ciarands/vidsrc-to-resolver/blob/main/sources/vidplay.py
    # for reference on extracting the m3u8 from this url.
    print(playback_url)
I don't know how to go about this; I don't know coding. I just know how to follow instructions.
I saved it as fmv.py in a text editor and ran it; it shows this:
python fmv.py Traceback (most recent call last): File "/home/tv/fmv.py", line 189, in <module> fmz = Fmoviesz( ^^^^^^^^^ File "/home/tv/fmv.py", line 78, in __init__ assert os.path.exists("cookie.json"), "Please create a cookie.json file and manually input your cf_clearance cookie from your browsers localstorage\ne.g {\"cf_clearance\": \"YOUR_CLOUDFLARE_CLEARANCE\"}" AssertionError: Please create a cookie.json file and manually input your cf_clearance cookie from your browsers localstorage e.g {"cf_clearance": "YOUR_CLOUDFLARE_CLEARANCE"}
I dont know how to go about this, dont know coding. I just know how to follow instructions.
Ah fair enough, no worries then. This movie is hosted by fmoviez on vidsrc.to's backend, it can only be scraped by scraping fmoviez, which due to cloudflare is kinda inconvenient.
I save it as fmv.py in a text editor and run , it shows this: python fmv.py Traceback (most recent call last): File "/home/tv/fmv.py", line 189, in
fmz = Fmoviesz( ^^^^^^^^^ File "/home/tv/fmv.py", line 78, in init assert os.path.exists("cookie.json"), "Please create a cookie.json file and manually input your cf_clearance cookie from your browsers localstorage\ne.g {\"cf_clearance\": \"YOUR_CLOUDFLARE_CLEARANCE\"}" AssertionError: Please create a cookie.json file and manually input your cf_clearance cookie from your browsers localstorage e.g {"cf_clearance": "YOUR_CLOUDFLARE_CLEARANCE"}
Also, yeah, that was purposeful — the error message explains how to fix it.
Please create a cookie.json file and manually input your cf_clearance cookie from your browsers localstorage e.g {"cf_clearance": "YOUR_CLOUDFLARE_CLEARANCE"}
The only way to scrape a site with cloudflare protections is with a cf_clearance, you can find it in your browsers cookies if you open up devtools.
However the scraper isn't super useful if you aren't able to program as you'll need to combine it with this, in order to actually extract the .m3u8 to be able to play it in something like mpv. I could always introduce something that allows the user to manually solve the captcha in future in order to generate a cf_clearance, but not super keen on that idea, i'll look into it though.
yikes, sorry this is beyond my understanding
all good, ill leave this open for anyone else curious, im working on a project in my free time which will be wrapped by a CLI which will be able to scrape this. https://github.com/movie-cat/providers
:rocket: :+1: :heart: :100: :cat:
With fmoviesz.to as a reference, Thor: The Dark World doesn't play.