Open ttx99 opened 9 months ago
Yeah, it looks like the actual URL you're trying errors out — I got 502'd and then 404'd trying to access it. Very strange that fmoviesz has it; I'll investigate. Thanks for the report!
As far as I can tell vidsrc.to is allowing other sites to host their content with them without actually indexing the movies, this particular movie doesn't have any link to any IMDB/TMDB code and as such cannot be retrieved by this scraper. I have written a small scraper for this site, I'll maybe add support for it in future.
import os
import re
import json
import httpx
import base64
import asyncio
import questionary
import urllib.parse
from bs4 import BeautifulSoup
from typing import Optional, Dict
class VRF:
    """Builds the "vrf" request tokens that fmoviesz/vidsrc ajax endpoints
    require: RC4 under a site key, two rounds of url-safe base64 interleaved
    with rot13 and a fixed per-position byte shift, then url-quoting.
    """

    # RC4 key for vrf tokens sent to the fmoviesz ajax endpoints.
    DEFAULT_ENCRYPTION_KEY = b"FWsfu0KQd9vxYGNB"
    # RC4 key used to decode the provider (vidplay) playback url.
    VIDPLAY_ENCRYPTION_KEY = b"8z5Ag5wgagfsOuhz"

    @staticmethod
    def rc4(key: bytes, data: "str | bytes | bytearray") -> bytearray:
        """RC4-encrypt *data* with *key* and return the raw ciphertext.

        RC4 is its own inverse, so applying this twice with the same key
        yields the original data.  Fix: the previous annotation claimed
        ``data: str`` only, but the body explicitly handles int elements
        (bytes-like input) and ``decode_playback_url`` passes raw bytes.
        Text input must consist of chars below U+0100 to fit a byte.
        """
        # Key-scheduling algorithm (KSA).
        s = bytearray(range(256))
        j = 0
        for i in range(256):
            j = (j + s[i] + key[i % len(key)]) & 0xff
            s[i], s[j] = s[j], s[i]
        # Pseudo-random generation algorithm (PRGA) + xor keystream.
        encrypted = bytearray(len(data))
        i = 0
        k = 0
        for index in range(len(data)):
            i = (i + 1) & 0xff
            k = (k + s[i]) & 0xff
            s[i], s[k] = s[k], s[i]
            t = (s[i] + s[k]) & 0xff
            if isinstance(data[index], str):
                # str indexing yields 1-char strings; xor the code point.
                encrypted[index] = ord(data[index]) ^ s[t]
            elif isinstance(data[index], int):
                # bytes/bytearray indexing yields ints directly.
                encrypted[index] = data[index] ^ s[t]
        return encrypted

    @staticmethod
    def encrypt(data: str) -> str:
        """Turn *data* (e.g. a media id) into a url-quoted vrf token."""

        def rot13(vrf):
            # Classic rot13 over ASCII letters only; other bytes untouched.
            vrf = bytearray(vrf)
            for i in range(len(vrf)):
                byte = vrf[i]
                if 65 <= byte <= 90:      # 'A'-'Z'
                    vrf[i] = ((byte - 65 + 13) % 26 + 65)
                elif 97 <= byte <= 122:   # 'a'-'z'
                    vrf[i] = ((byte - 97 + 13) % 26 + 97)
            return vrf

        def vrf_shift(vrf):
            # Site-specific obfuscation: repeating 5-step additive offsets.
            vrf = bytearray(vrf)
            shifts = [-4, -2, -6, 5, -2]
            for i in range(len(vrf)):
                shift = shifts[i % 5]
                vrf[i] = (vrf[i] + shift) & 0xFF
            return vrf

        vrf = VRF.rc4(VRF.DEFAULT_ENCRYPTION_KEY, data)
        vrf = base64.urlsafe_b64encode(vrf)
        vrf = rot13(vrf)
        vrf = base64.urlsafe_b64encode(vrf)
        vrf = vrf_shift(vrf)
        return urllib.parse.quote(vrf.decode("utf-8"))
class Fmoviesz:
    """Scrapes fmoviesz.to, resolving a search query or media-page url to an
    encrypted provider playback url (Vidplay is the only provider wired up
    by the script's entry point).

    Requires a valid Cloudflare clearance: a ``cookie.json`` file next to
    the script containing ``{"cf_clearance": "..."}`` captured from a
    browser session that used the same user agent as ``USER_AGENT``.
    """

    BASE_URL = "https://fmoviesz.to"
    # NOTE(review): cf_clearance is bound to the UA it was issued for --
    # keep this in sync with the browser the cookie came from.
    USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0"

    def __init__(self, **kwargs) -> None:
        """Accept optional ``source``, ``query`` and ``url`` keyword presets
        so ``resolve`` can run without interactive prompts.

        Raises AssertionError when ``cookie.json`` is missing and KeyError
        when it lacks a ``cf_clearance`` entry.
        """
        self.source = kwargs.pop("source", None)
        self.query = kwargs.pop("query", None)
        self.url = kwargs.pop("url", None)
        assert os.path.exists("cookie.json"), "Please create a cookie.json file and manually input your cf_clearance cookie from your browsers localstorage\ne.g {\"cf_clearance\": \"YOUR_CLOUDFLARE_CLEARANCE\"}"
        with open("cookie.json", "r", encoding="utf-8") as f:
            cookies = json.load(f)
        self.default_cookies = {
            "cf_clearance": cookies.pop("cf_clearance")
        }
        self.default_headers = {
            "User-Agent": Fmoviesz.USER_AGENT,
            # Fix: the HTTP header is spelled "Referer" (single r); the old
            # "Referrer" key was an unknown header the server ignored.
            "Referer": Fmoviesz.BASE_URL,
        }

    @staticmethod
    def dump_cloudflare_cookies(cookies: httpx.Cookies) -> None:
        """Persist the client's cookie jar back to cookie.json so a
        refreshed cf_clearance survives between runs."""
        with open("cookie.json", "w", encoding="utf-8") as f:
            json.dump(dict(cookies), f, indent=2)

    async def test_cloudflare(self, response_text: Optional[str] = None) -> bool:
        """Return True when *response_text* is not a Cloudflare challenge
        page, fetching the landing page first when no text is supplied."""
        if not response_text:
            response_text = (await self.client.get(Fmoviesz.BASE_URL)).text
        # Fix: Cloudflare's interstitial titles itself "Just a moment...";
        # the old case-sensitive lowercase check could never match it.
        return "just a moment" not in response_text.lower()

    async def resolve_query(self, query: str) -> Dict:
        """Search for *query*; return {display name: media page url}."""
        media_hits = {}
        req = await self.client.get(f"{Fmoviesz.BASE_URL}/filter?{urllib.parse.urlencode({'keyword': query})}")
        valid = await self.test_cloudflare(req.text)
        if req.status_code != 200 or not valid:
            raise ValueError("Bad Response when attempting to query fmoviez (Bad status code or cloudflare clearance has expired)")
        soup = BeautifulSoup(req.text, "html.parser")
        for media in soup.find_all("div", {"class": "meta"}):
            media_data = media.find("a")
            if not media_data:
                continue
            media_info = media.find("div")
            if not media_info:
                continue
            media_metadata = media_info.find_all("span")
            # Fix: both spans are interpolated below; fewer than two would
            # previously raise IndexError mid-scrape.
            if len(media_metadata) < 2:
                continue
            # Disambiguate identically named hits with an incrementing [i].
            i = 0
            while True:
                media_name = f"{media_metadata[1].text} - {media_data.text} ({media_metadata[0].text}) [{i}]"
                if media_name in media_hits:
                    i += 1
                else:
                    break
            media_hits[media_name] = f"{Fmoviesz.BASE_URL}{media_data.get('href')}"
        return media_hits

    async def fetch_source_id(self, data_id: str) -> str:
        """Exchange a media data-id for the id of its source/episode list."""
        req = await self.client.get(f"{Fmoviesz.BASE_URL}/ajax/episode/list/{data_id}", params={"vrf": VRF.encrypt(data_id)})
        # The ajax payload is json-escaped html, hence the escaped quotes.
        match = re.search(r"data-id=\\\"(\d+)\\\"", req.text)
        # Fix: fail with a descriptive error (consistent with fetch_media)
        # instead of an AttributeError on .group(1).
        if not match:
            raise ValueError("Could not match data-id in episode list response")
        return match.group(1)

    async def fetch_sources(self, source_id: str) -> Dict:
        """Return {server name: link id} for every server of *source_id*."""
        req = await self.client.get(f"{Fmoviesz.BASE_URL}/ajax/server/list/{source_id}", params={"vrf": VRF.encrypt(source_id)})
        return {name: id for id, name in re.findall(r"data-link-id=\\\"(\d+)\\\">\s+<span>(\w+)", req.text)}

    async def fetch_playback_data(self, file_id: str) -> Dict:
        """Fetch the playback descriptor (holds the encoded provider url)."""
        req = await self.client.get(f"{Fmoviesz.BASE_URL}/ajax/server/{file_id}", params={"vrf": VRF.encrypt(file_id)})
        return req.json().get("result")

    async def decode_playback_url(self, encoded: str) -> str:
        """Decode the url-safe-base64 + RC4 obfuscated provider url."""
        standardized_input = encoded.replace("_", "/").replace("-", "+")
        # Fix: tolerate stripped base64 padding (no-op on padded input).
        standardized_input += "=" * (-len(standardized_input) % 4)
        b64_data = base64.b64decode(standardized_input)
        decoded = VRF.rc4(VRF.VIDPLAY_ENCRYPTION_KEY, b64_data)
        return urllib.parse.unquote(decoded.decode("utf-8"))

    async def fetch_media(self, url: str) -> str:
        """Scrape a media page, pick a server, return its playback url."""
        req = await self.client.get(url)
        valid = await self.test_cloudflare(req.text)
        if req.status_code != 200 or not valid:
            raise ValueError("Bad response when attempting to scrape media page (Bad status code or cloudflare clearance has expired)")
        data_id_match = re.search(r"data-id=\"(\d+)\"", req.text)
        if not data_id_match:
            raise ValueError("Could not match data-id")
        data_id = data_id_match.group(1)
        source_id = await self.fetch_source_id(data_id)
        sources = await self.fetch_sources(source_id)
        selection = self.source or await questionary.select("Select the media you want to watch", choices=sources).unsafe_ask_async()
        source = sources.get(selection)
        # Fix: a preset self.source not offered by this page previously fell
        # through as None and crashed deeper in the request chain.
        if source is None:
            raise ValueError(f"Source {selection!r} not available, choices: {list(sources)}")
        playback_data = await self.fetch_playback_data(source)
        playback_url = await self.decode_playback_url(playback_data.get("url"))
        return playback_url

    async def resolve(self, url: Optional[str] = None, query: Optional[str] = None) -> str:
        """Entry point: resolve a url (or, interactively, a query) to a
        playback url.

        Arguments fall back to the presets given to ``__init__``.  The
        cookie jar is written back to cookie.json even when scraping fails,
        so a rotated cf_clearance is never lost.
        """
        assert url or query or self.url or self.query, "Please pass a url or query."
        async with httpx.AsyncClient() as self.client:
            try:
                self.client.cookies.update(self.default_cookies)
                self.client.headers.update(self.default_headers)
                query = query or self.query
                url = url or self.url
                valid = await self.test_cloudflare()
                assert valid, "cf_clearance is invalid or impl is broken, please check your clearance hasnt changed"
                if not url and query:
                    media_hits = await self.resolve_query(query)
                    selection = await questionary.select("Select the media you want to watch", choices=media_hits).unsafe_ask_async()
                    url = media_hits.get(selection)
                return await self.fetch_media(url)
            finally:
                self.dump_cloudflare_cookies(self.client.cookies)
if __name__ == "__main__":
    # Vidplay is the only provider implemented so far.
    scraper = Fmoviesz(source="Vidplay", query="Thor: The Dark World")
    playback_url = asyncio.run(scraper.resolve())
    # See https://github.com/Ciarands/vidsrc-to-resolver/blob/main/sources/vidplay.py
    # for reference on extracting the m3u8 from this url.
    print(playback_url)
I don't know how to go about this; I don't know coding. I just know how to follow instructions.
I saved it as fmv.py in a text editor and ran it; it shows this:
python fmv.py Traceback (most recent call last): File "/home/tv/fmv.py", line 189, in <module> fmz = Fmoviesz( ^^^^^^^^^ File "/home/tv/fmv.py", line 78, in __init__ assert os.path.exists("cookie.json"), "Please create a cookie.json file and manually input your cf_clearance cookie from your browsers localstorage\ne.g {\"cf_clearance\": \"YOUR_CLOUDFLARE_CLEARANCE\"}" AssertionError: Please create a cookie.json file and manually input your cf_clearance cookie from your browsers localstorage e.g {"cf_clearance": "YOUR_CLOUDFLARE_CLEARANCE"}
I dont know how to go about this, dont know coding. I just know how to follow instructions.
Ah fair enough, no worries then. This movie is hosted by fmoviez on vidsrc.to's backend, it can only be scraped by scraping fmoviez, which due to cloudflare is kinda inconvenient.
I save it as fmv.py in a text editor and run , it shows this: python fmv.py Traceback (most recent call last): File "/home/tv/fmv.py", line 189, in
fmz = Fmoviesz( ^^^^^^^^^ File "/home/tv/fmv.py", line 78, in init assert os.path.exists("cookie.json"), "Please create a cookie.json file and manually input your cf_clearance cookie from your browsers localstorage\ne.g {\"cf_clearance\": \"YOUR_CLOUDFLARE_CLEARANCE\"}" AssertionError: Please create a cookie.json file and manually input your cf_clearance cookie from your browsers localstorage e.g {"cf_clearance": "YOUR_CLOUDFLARE_CLEARANCE"}
Also, yeah, that was purposeful — the error message explains how to fix it.
Please create a cookie.json file and manually input your cf_clearance cookie from your browsers localstorage e.g {"cf_clearance": "YOUR_CLOUDFLARE_CLEARANCE"}
The only way to scrape a site with cloudflare protections is with a cf_clearance, you can find it in your browsers cookies if you open up devtools.
However the scraper isn't super useful if you aren't able to program as you'll need to combine it with this, in order to actually extract the .m3u8 to be able to play it in something like mpv. I could always introduce something that allows the user to manually solve the captcha in future in order to generate a cf_clearance, but not super keen on that idea, i'll look into it though.
yikes, sorry this is beyond my understanding
all good, ill leave this open for anyone else curious, im working on a project in my free time which will be wrapped by a CLI which will be able to scrape this. https://github.com/movie-cat/providers
:rocket: :+1: :heart: :100: :cat:
With fmoviesz.to as a reference, Thor: The Dark World doesn't play.