[YOUTUBE]Fix Youtube VIDEO_ID

Hola, compañeros de Código Downloader. Después de estar probando una y otra vez por qué los videos de este curso (Enlace) no se descargaban, me puse a investigar un poco y encontré que algunos cursos tienen un enlace de YouTube y no de codigofacilito, con el siguiente código (youtube_video_id):

<div class=" " id="my-player"><input type="hidden" name="youtube_video_id" id="youtube_video_id" value="b2PW-BHwTxU" /><div class=" video-container video_player"><div id="player"></div></div></div> Entonces, cuando collector.py revisaba esto, daba error y se cerraba, ya que no eran los enlaces típicos para completar https://video-storage.codigofacilito.com/. Por eso modifiqué un poco el código y pude hacerlo funcionar con un try, validando si la página tiene el ID de YouTube para mandar el enlace a ytb_dl, quedando de la siguiente manera:

def get_video_detail_sync(url: str, page: Page) -> Video:
    """Retrieve detailed information for a video from its URL.

    When the page contains a hidden ``input#youtube_video_id`` with a
    non-empty value, the returned ``m3u8_url`` is the YouTube watch URL for
    that id. Otherwise it is the HLS playlist URL on the Codigo Facilito
    video storage, built from the page's course and video ids.

    Args:
        url (str): The URL of the video page.
        page (Page): The Playwright page object.

    Returns:
        Video: An instance of the Video model with the retrieved details.

    Raises:
        URLError: If ``url`` is not a valid video URL.
        VideoError: If the title or the video/course ids cannot be found.
    """

    if not is_video_url(url):
        error_message = f"[VIDEO] Invalid video URL: {url}"
        logger.error(error_message)
        raise URLError(error_message)

    # Navigate to the URL and wait until the DOM is ready.
    page.goto(url=url, wait_until="domcontentloaded")

    # Look up the video title.
    try:
        title = page.locator(
            """
            h1[class='ibm bold-600 no-margin f-text-22'], 
            h1[class='ibm bold-600 no-margin f-text-48']
            """
        ).inner_text(timeout=1000)
        title = clean_new_line(title)
        title = clean_string(title)
    except Exception as e:
        error_message = f"[VIDEO] Title not found: {url}"
        logger.error(error_message)
        raise VideoError(error_message) from e

    # The YouTube input is optional. Calling get_attribute() on a locator
    # that matches nothing auto-waits for the default timeout (30 s) and
    # then raises, so check how many elements match first: count() returns
    # immediately.
    youtube_id: Optional[str] = None
    try:
        youtube_input = page.locator("input#youtube_video_id")
        if youtube_input.count() > 0:
            youtube_id = youtube_input.first.get_attribute("value")
    except Exception as e:
        # Best effort: fall back to the regular storage URL below.
        logger.warning("[YOUTUBE] Error retrieving ID: %s", e)
        youtube_id = None

    if youtube_id:
        logger.info("[YOUTUBE] ID found: %s", youtube_id)

    # Get the video and course ids.
    video_id = page.locator("input[name='video_id']").first.get_attribute("value")
    course_id = page.locator("input[name='course_id']").first.get_attribute("value")

    if not video_id or not course_id:
        error_message = f"[VIDEO] IDs not found: {url}"
        logger.error(error_message)
        raise VideoError(error_message)

    # Pick the media URL depending on whether a YouTube id is present.
    if youtube_id:
        m3u8_url = f"https://www.youtube.com/watch?v={youtube_id}"
    else:
        # No trailing slash on the base: the path below already starts with
        # one, and a trailing slash would produce ".com//hls/...".
        base_m3u8_url = "https://video-storage.codigofacilito.com"
        m3u8_url = f"{base_m3u8_url}/hls/{course_id}/{video_id}/playlist.m3u8"

    # Determine the media type from the URL path.
    media_type: Optional[MediaType] = None
    if "/videos/" in url:
        media_type = MediaType.STREAMING
    elif "/articulos/" in url:
        media_type = MediaType.READING

    return Video(
        id=video_id,
        url=url,
        m3u8_url=m3u8_url,
        title=title,
        media_type=media_type,
        description=None,
    )

Pero tengo un error, que es "[YOUTUBE] Error retrieving ID: Timeout 30000ms exceeded.", y creo que tiene que ver con helpers.py. Creo que ustedes podrían ayudar a solucionarlo y a darle un mejor formato al código que medio implementé. Saludos.

De igual manera, ya está arreglado el nuevo método, en el cual ahora el video se obtiene de esta parte:

"""Collectors for Facilito API""" import os import re from typing import Optional from playwright.sync_api import Page from rich import print as tprint from .. import consts from ..errors import BootcampError, CourseError, URLError, VideoError from ..helpers import ( clean_bootcamp_title, clean_new_line, clean_string, is_bootcamp_url, is_course_url, is_video_url, ) from ..models.bootcamp import Bootcamp, BootcampClass, BootcampModule, BootcampVideo from ..models.course import Course, CourseSection, VideoURL from ..models.video import MediaType, Video from ..utils import expanders from ..utils.logger import logger def get_article_sync(url: str, page: Page) -> Page: """ Get info by article """ page.goto(url=url, wait_until=None) page.evaluate( """ let elements_to_delete = document.querySelectorAll("div[class='player-header']"); for (let element of elements_to_delete) { element.parentNode.removeChild(element); } """ ) page.evaluate( """ elements_to_delete = document.querySelectorAll("div[class='row f-gap-medium middle-xs']"); for (let element of elements_to_delete) { element.parentNode.removeChild(element); } """ ) page.evaluate( """ elements_to_delete = document.querySelectorAll("div[class='player-sidebar relative']"); for (let element of elements_to_delete) { element.parentNode.removeChild(element); } """ ) return page def get_video_detail_sync(url: str, page: Page) -> Video: """Retrieve detailed information for a video from its URL. Args: url (str): The URL of the video page. page (Page): The Playwright page object. Returns: Video: An instance of the Video model with the retrieved details. """ if not is_video_url(url): error_message = f"[VIDEO] Invalid video URL: {url}" logger.error(error_message) raise URLError(error_message) # Navegar a la URL y esperar que el DOM esté listo. page.goto(url=url, wait_until="domcontentloaded") # Buscar el título del video. 
try: title = page.locator( """ h1[class='ibm bold-600 no-margin f-text-22'], h1[class='ibm bold-600 no-margin f-text-48'] """ ).inner_text(timeout=1000) title = clean_new_line(title) title = clean_string(title) except Exception as e: error_message = f"[VIDEO] Title not found: {url}" logger.error(error_message) raise VideoError(error_message) from e # Obtener el valor del input de YouTube. youtube_id = None try: youtube_id = page.locator("input#youtube_video_id").get_attribute("value") if youtube_id: print(f"[YOUTUBE] ID found: {youtube_id}") else: print("[YOUTUBE] ID is empty.") except Exception as e: print(f"[YOUTUBE] Error retrieving ID: {e}") youtube_id = None # Extraer el videoUrl del script en la página. video_url_js = None try: video_url_js = page.evaluate( """() => { const scriptContent = Array.from(document.querySelectorAll('script')) .map(script => script.textContent) .find(content => content.includes('let videoUrl')); const match = scriptContent.match(/let videoUrl = "(.*?)"/); return match ? match[1] : null; }""" ) if video_url_js: print(f"[VIDEO] videoUrl found: {video_url_js}") else: print("[VIDEO] videoUrl not found.") except Exception as e: print(f"[VIDEO] Error extracting videoUrl: {e}") # Obtener los IDs del video y curso. video_id = page.locator("input[name='video_id']").first.get_attribute("value") course_id = page.locator("input[name='course_id']").first.get_attribute("value") if not video_id or not course_id: error_message = f"[VIDEO] IDs not found: {url}" logger.error(error_message) raise VideoError(error_message) # Determinar la URL base del m3u8 según la presencia del ID de YouTube o videoUrl. 
if youtube_id: base_m3u8_url = "https://www.youtube.com/watch?v=" m3u8_url = f"{base_m3u8_url}{youtube_id}" elif video_url_js: m3u8_url = f"https://video-storage.codigofacilito.com{video_url_js}" else: base_m3u8_url = "https://video-storage.codigofacilito.com/" m3u8_url = f"{base_m3u8_url}/hls/{course_id}/{video_id}/playlist.m3u8" # Determinar el tipo de media. media_type: Optional[MediaType] = None if "/videos/" in url: media_type = MediaType.STREAMING elif "/articulos/" in url: media_type = MediaType.READING # Retornar el objeto Video con los detalles obtenidos. return Video( id=video_id, url=url, m3u8_url=m3u8_url, title=title, media_type=media_type, description=None, ) # TODO: improve this function, handles more error cases 👇 def get_course_detail_sync(url: str, page: Page) -> Course: """ Retrieves detailed information about a course from a given URL. Args: url (str): The URL of the course to be detailed. page (Page): The playwright page object to interact with the webpage. Returns: Course: An object containing the course details.""" if not is_course_url(url): error_message = f"[COURSE] Invalid course URL: {url}" logger.error(error_message) raise URLError(error_message) page.goto(url=url, wait_until=None) # expand collapsed sections expanders.expand_course_sections(page) # get course title title = page.title() # get course sections try: sections = _get_sections(page) except Exception as e: error_message = f"[COURSE] an error occurred: {url}" logger.error(error_message) raise CourseError(error_message) from e course = Course( url=url, title=title, sections=sections, ) return course def get_bootcamp_detail_sync(url: str, page: Page) -> Bootcamp: """ Retrieves detailed information about a bootcamp from a given URL. Args: url (str): The URL of the bootcamp to be detailed. page (Page): The playwright page object to interact with the webpage. 
Returns: Bootcamp: An object containing the bootcamp details.""" if not is_bootcamp_url(url): error_message = f"[BOOTCAMP] Invalid course URL: {url}" logger.error(error_message) raise URLError(error_message) page.goto(url=url, wait_until=None) expanders.expand_bootcamps_modules(page) # get bootcamp title bootcamp_title = clean_bootcamp_title(page.title()) tprint( f"[bold red]Bootcamp title:[/bold red] [bright_red]{bootcamp_title}[/bright_red]" ) path = f"{consts.DOWNLOADS_DIR}/" path += f"Bootcamp - {bootcamp_title}/" # get bootcamp modules try: all_modules = _get_modules(page=page, path=path) bootcamp_obj = Bootcamp(url=url, title=bootcamp_title, modules=all_modules) return bootcamp_obj except Exception as e: error_message = f"[BOOTCAMP] an error occurred: {url}" logger.error(error_message) raise BootcampError(error_message) from e def _get_sections(page: Page) -> list[CourseSection]: """Get course sections from a page. This function collects all course sections from the given page by looking for specific HTML div elements with class 'f-top-16' and extracts their corresponding titles. Args: page (Page): The playwright page object representing the web page. Returns: list[CourseSection]: A list of CourseSection objects. 
""" sections: list[CourseSection] = [] # possibly some containers are empty sections_container_divs = page.query_selector_all("div[class='f-top-16']") for div in sections_container_divs: title_match = div.query_selector("h4") if title_match is None: continue logger.debug("[Section Title] %s", title_match.inner_text()) a_tags = div.query_selector_all("a") all_videos: list[VideoURL] = [] for a_tag in a_tags: p_element_title = a_tag.query_selector( "p[class='ibm f-text-16 bold no-margin-bottom f-top-small']" ) if p_element_title is not None: video_title = p_element_title.inner_text() video_url = a_tag.get_attribute("href") video_url_obj = VideoURL( title=video_title, url=f"{consts.BASE_URL}{video_url}", ) all_videos.append(video_url_obj) logger.debug("This section has %s videos", len(all_videos)) sections.append( CourseSection( title=title_match.inner_text(), videos_url=all_videos, ), ) return sections def _get_videos(url: str, page: Page, dict_info: dict) -> list[BootcampVideo]: try: page.goto(url=url, wait_until=None) div_collapsible = page.query_selector( "div[class*='collapsible-body no-border topics-li']" ) a_tags = div_collapsible.query_selector_all("a") videos_class: list[BootcampVideo] = [] for a_tag in a_tags: video_url = a_tag.get_attribute("href") video_sequence = int( clean_new_line( a_tag.query_selector( "p[class*='no-margin h5 bold f-blues-text']" ).inner_html() ) .strip() .split(" ")[1] ) video_title = a_tag.query_selector( "p[class*='ibm f-text-16 bold no-margin-bottom f-top-small']" ).inner_html() video_title = clean_new_line(video_title) if video_title.upper() != "CLASE COMPLETA": pattern = re.compile( r"Clase Completa(?:\s*de\s*|\s*-\s*)?(.*)", re.IGNORECASE ) match = re.search(pattern, video_title) if match: video_title = match.group(1) video_title = clean_string(video_title) tprint(" [bold green]Video title:[/bold green]", end=" ") tprint(f"[green]{video_sequence:02d}. 
{video_title}[/green]") video_obj = BootcampVideo( url=f"{consts.BASE_URL}{video_url}", sequence=video_sequence, title=video_title, ) videos_class.append(video_obj) return videos_class except Exception as e: error_message = ( f"[MODULE] {dict_info['module_sequence']:02d}. {dict_info['module_title']} " ) error_message += ( f"[CLASS] {dict_info['class_sequence']:02d}. {dict_info['class_title']} " ) error_message += f"=> {e}" logger.error(error_message) tprint(f"Exception: {e}") def _get_classes(a_tags, path: str, page: Page, dict_info: dict) -> list[BootcampClass]: """ Get information about the classes that make up a bootcamp module Args: a_tags (_type_): All hypelinks into page / website path (str): Father dir path page (Page): Page dict_info (dict): Dictionary with information to use Raises: ClassErrorName: When a class has no name Returns: list[BootcampClass]: List with all information of classes """ all_classes: list[BootcampClass] = [] for a_tag in a_tags: p_title = a_tag.query_selector( "p[class='ibm f-text-16 bold no-margin-bottom f-top-small']" ) p_tag = ( a_tag.query_selector("p[class*='no-margin h5 bold f-blues-text--2']") ).inner_html() match = re.search(consts.CLASS_NAME, p_tag) p_sequence = [int(match.group(1)), match.group(2)] if p_title is not None: class_title = clean_string(p_title.inner_html()) class_url = consts.BASE_URL + a_tag.get_attribute("href") if p_sequence[1] != "Curso": tprint(" [bold magenta]Class Title:[/bold magenta]", end=" ") tprint( f"[bright_magenta]{p_sequence[0]:02d}. 
{class_title}[/bright_magenta]" ) new_page = page.context.new_page() all_videos = _get_videos( class_url, new_page, dict_info={ "module_sequence": dict_info["module_sequence"], "module_title": dict_info["module_title"], "class_sequence": p_sequence[0], "class_title": class_title, }, ) while not new_page.is_closed(): new_page.close() all_classes.append( BootcampClass( sequence=p_sequence[0], title=class_title, url=f"{consts.BASE_URL}{class_url}", videos=all_videos, ) ) else: _print_error_class( path=path, dict_info={ "module_sequence": dict_info["module_sequence"], "module_title": dict_info["module_title"], "class_sequence": p_sequence[0], "class_title": class_title, "class_url": class_url, }, ) else: error_message = ( f"[MODULE] {dict_info['module_title']} [CLASS] {p_sequence[0]:02d}" ) logger.error(error_message) return all_classes def _get_modules(page: Page, path: str) -> list[BootcampModule]: """ Get bootcamp modules from a page. Args: page (Page): The playwright page object representing the web page. Returns: list[BootcampModule]: A list of BootcampModule objects. """ all_modules: list[BootcampModule] = [] all_ul = page.query_selector_all( "ul[class='collapsible no-box-shadow no-border f-topics flex-column f-top f-gap-medium flex-block f-dark-mode']" )[0] modules_li = all_ul.query_selector_all("li[class*='f-radius-small']") for module in modules_li: module_title = clean_string(module.query_selector("h4").inner_html()) span_module = ( module.query_selector("span[class='f-green-text f-green-text--2 bold h5']") .inner_html() .split("\n") ) span_module = list(filter(None, span_module)) module_sequence = int(span_module[1]) if module_title is None: continue logger.debug("[Module Title] %s", module_title) tprint(" [bold cyan]Module Title:[/bold cyan]", end=" ") tprint(f"[bright_cyan]{module_sequence:02d}. {module_title}[/bright_cyan]") path += f"{module_sequence:02d}. 
{module_title}/" a_tags = module.query_selector_all("a") all_classes = _get_classes( a_tags=a_tags, path=path, page=page, dict_info={ "module_sequence": module_sequence, "module_title": module_title, }, ) bootcamp_module_obj = BootcampModule( sequence=module_sequence, title=module_title, classes=all_classes ) all_modules.append(bootcamp_module_obj) return all_modules def _generate_file(path: str, file_name: str, type_file: str, url: str) -> bool: if not os.path.exists(path): os.makedirs(path, exist_ok=True) with open(f"{path}{file_name}.{type_file}", "w", encoding="utf-8") as f: f.write(" -- CURSO NO CLASE -- \n") f.write("En este bootcamp, esta no es una clase sino un curso completo,") f.write(" puede revisarlo:\n") f.write(f"Curso: {file_name} => {url}\n") f.write(" -- CURSO NO CLASE -- \n") return os.path.exists(f"{path}{file_name}.{type_file}") def _print_error_class(path: str, dict_info: dict): tprint("[blink bold red] -- CURSO NO CLASE -- [/blink bold red]") tprint("La siguiente no es una clase sino un curso completo, revisarlo:") tprint("[bold bright_blue][MODULO][/bold bright_blue]", end=" ") tprint(f"[bright_blue]{dict_info['module_sequence']:02d}. [/bright_blue]", end="") tprint(f"[bright_blue]{dict_info['module_title']}[/bright_blue]") tprint("[bold yellow][CURSO][/bold yellow]", end=" ") tprint(f"[yellow]{dict_info['class_sequence']:02d}. [/yellow]", end="") tprint(f"[yellow]{dict_info['class_title']}[/yellow]", end=" ") tprint(f"=> [link]{dict_info['class_url']}[/link]") tprint("[blink bold red] -- CURSO NO CLASE -- [/blink bold red]") if not _generate_file( path=path, file_name=f"{dict_info['class_sequence']:02d}. {dict_info['class_title']}", type_file="txt", url=dict_info["class_url"], ): tprint("[bold red]Error ![/bold red] generating file => ", end="") tprint(f"{dict_info['class_sequence']:02d}. {dict_info['class_title']}") __all__ = [ "get_video_detail_sync", "get_course_detail_sync", "get_bootcamp_detail_sync", ]

ivansaul / codigo_facilito_downloader

[YOUTUBE]Fix Youtube VIDEO_ID #42