Closed zrose584 closed 2 years ago
This commit (https://github.com/user234683/youtube-local/commit/21fda2d569c84285a4cfd8eb7660e061f7742437) was supposed to fix it, but maybe they changed the JSON again. Do they come back if you reload the page? If you have debugging_save_responses (I think that's the setting name) set to True, the watch page will be saved in the data/debug folder. Then you can run this script with "watch" as the argument to extract the JSON and examine it:
import json
import os
import re
import sys
# Name of the saved watch-page HTML file to process, taken from the first
# command-line argument. Raises IndexError if no argument was given.
file_name = sys.argv[1]
# Maps the character that follows a backslash in a JavaScript string literal
# to the text that the escape sequence stands for. Characters not listed here
# are handled by the caller (js_escape_replace falls back to the character
# itself).
single_char_codes = {
    'n': '\n',
    '\\': '\\',
    '"': '"',
    "'": "'",
    'b': '\b',
    'f': '\f',
    'r': '\r',
    't': '\t',
    'v': '\x0b',
    '0': '\x00',
    '\n': '',  # backslash followed by literal newline joins lines
}
def js_escape_replace(match):
    r'''Resolves javascript string escape sequences such as \x..

    Written as a replacement callback for a regex substitution.

    match: a regex match object whose group 1 holds the text that followed
        the backslash: either a single character, or 'x' / 'u' followed by
        hexadecimal digits.

    Returns the string that the escape sequence stands for.

    Raises ValueError if the text after 'x' or 'u' is not valid hexadecimal.
    '''
    # some js-strings in the watch page html include them for no reason
    # https://mathiasbynens.be/notes/javascript-escapes
    escaped_sequence = match.group(1)
    if escaped_sequence[0] in ('x', 'u'):
        # \xHH or \uHHHH: the digits are the code point in base 16
        return chr(int(escaped_sequence[1:], base=16))

    # In javascript, if it's not one of those escape codes, it's just the
    # literal character. e.g., "\a" = "a"
    return single_char_codes.get(escaped_sequence, escaped_sequence)
# Group 1: everything from the opening brace of the object assigned to
# ytInitialPlayerResponse up to the closing </script> tag. This may include
# trailing text after the end of the JSON object.
PLAYER_RESPONSE_RE = re.compile(
    r'<script[^>]*?>[^<]*?var ytInitialPlayerResponse = ({.*?)</script>')

# Group 1: the contents of the single-quoted javascript string assigned to
# ytInitialData, with its escape sequences still unresolved. The [^\\] stops
# the match from ending on an escaped quote.
INITIAL_DATA_RE = re.compile(
    r"<script[^>]*?>var ytInitialData = '(.+?[^\\])';")

# Group 1: the path of base.js as given by the "jsUrl" key.
BASE_JS_RE = re.compile(r'jsUrl":\s*"([\w\-\./]+?/base.js)"')

# Group 1: the text following a backslash in a javascript string: a single
# character, 'x' plus two characters, or 'u' plus four characters.
JS_STRING_ESCAPE_RE = re.compile(r'\\([^xu]|x..|u....)')
def extract_watch_info_from_html(watch_html):
    '''Extracts the JSON embedded in a watch page and saves it to a file.

    Looks for three things in the page: the base.js url, the
    ytInitialPlayerResponse object, and the ytInitialData string. Whatever
    is found is combined into one dictionary and written as JSON to
    "<file_name without extension>_extracted_response.json", where file_name
    is the module-level name of the input file. Anything not found is
    stored as None, and a message is printed for the two JSON parts.

    watch_html: the html of the watch page, as a string.

    Returns None; the result is the file that is written.

    Raises json.JSONDecodeError if one of the embedded values is found but
    is not valid JSON.
    '''
    base_js_match = BASE_JS_RE.search(watch_html)
    player_response_match = PLAYER_RESPONSE_RE.search(watch_html)
    initial_data_match = INITIAL_DATA_RE.search(watch_html)

    if base_js_match is not None:
        base_js_url = base_js_match.group(1)
    else:
        base_js_url = None

    if player_response_match is not None:
        decoder = json.JSONDecoder()
        # this will make it ignore extra stuff after end of object
        player_response = decoder.raw_decode(
            player_response_match.group(1))[0]
    else:
        print('Cannot find ytInitialPlayerResponse')
        player_response = None

    if initial_data_match is not None:
        # ytInitialData is a javascript string literal containing JSON, so
        # the javascript escapes must be resolved before parsing it
        initial_data = initial_data_match.group(1)
        initial_data = JS_STRING_ESCAPE_RE.sub(js_escape_replace,
                                               initial_data)
        initial_data = json.loads(initial_data)
    else:
        print('extract_watch_info_from_html: failed to find initialData')
        initial_data = None

    # imitate old format expected by extract_watch_info
    fake_polymer_json = {
        'player': {
            'args': {},
            'assets': {
                'js': base_js_url
            }
        },
        'playerResponse': player_response,
        'response': initial_data,
    }

    # Output goes next to the input: same name, extension replaced
    new_file_name = os.path.splitext(file_name)[0]
    new_file_name += '_extracted_response.json'
    with open(os.path.join('./', new_file_name), 'w') as f:
        f.write(json.dumps(fake_polymer_json))
# Load the saved watch page and run the extraction on it.
# NOTE(review): the encoding is given explicitly so the read does not depend
# on the platform default; this assumes the saved page is UTF-8 - confirm
# against how the debug responses are written.
with open(os.path.join('./', file_name), 'r', encoding='utf-8') as f:
    watch_html = f.read()

extract_watch_info_from_html(watch_html)
Thanks, I didn't have https://github.com/user234683/youtube-local/commit/21fda2d569c84285a4cfd8eb7660e061f7742437 yet; it works now!
I noticed that subtitles are often missing, even manually created (i.e. not auto-translated) ones. Any ideas?