ivansaul / codigo_facilito_downloader

Descarga automatizada de Codigo Facilito
141 stars 45 forks source link

[YOUTUBE]Fix Youtube VIDEO_ID #42

Open Slider32MX opened 1 month ago

Slider32MX commented 1 month ago

Hola compañeros de Codigo Downloader, después de estar probando una y otra vez por qué los videos de este curso (Enlace) no se descargaban, me puse a investigar un poco y encontré que algunos cursos tienen un enlace de YouTube y no de codigofacilito, identificado por el campo youtube_video_id en el siguiente código:

<div class=" " id="my-player"><input type="hidden" name="youtube_video_id" id="youtube_video_id" value="b2PW-BHwTxU" /><div class=" video-container video_player"><div id="player"></div></div></div>

Entonces, cuando collector.py revisaba esta página, daba error y se cerraba, ya que no eran los enlaces típicos que se completan con https://video-storage.codigofacilito.com/. Por eso me puse a modificar un poco el código y pude hacerlo funcionar con un try, validando si la página tiene el ID de YouTube para mandar el enlace a ytb_dl, quedando de la siguiente manera:

def get_video_detail_sync(url: str, page: Page) -> Video:
    """Retrieve detailed information for a video from its URL.

    When the page carries a non-empty ``input#youtube_video_id`` the returned
    ``m3u8_url`` is the YouTube watch URL for that ID; otherwise it is the HLS
    playlist URL built from the page's course and video IDs.

    Args:
        url (str): The URL of the video page.
        page (Page): The Playwright page object.

    Returns:
        Video: An instance of the Video model with the retrieved details.

    Raises:
        URLError: If ``url`` is not a video URL.
        VideoError: If the title cannot be located, or the video/course ID
            values are empty.
    """

    if not is_video_url(url):
        error_message = f"[VIDEO] Invalid video URL: {url}"
        logger.error(error_message)
        raise URLError(error_message)

    # Navigate to the URL and wait until the DOM is ready.
    page.goto(url=url, wait_until="domcontentloaded")

    # Look up the video title (either of the two heading variants).
    try:
        title = page.locator(
            """
            h1[class='ibm bold-600 no-margin f-text-22'],
            h1[class='ibm bold-600 no-margin f-text-48']
            """
        ).inner_text(timeout=1000)
        title = clean_new_line(title)
        title = clean_string(title)
    except Exception as e:
        error_message = f"[VIDEO] Title not found: {url}"
        logger.error(error_message)
        raise VideoError(error_message) from e

    # Read the optional YouTube ID. Most pages do not have this input, and
    # calling get_attribute() on a locator with no match waits for the full
    # default timeout ("Timeout 30000ms exceeded"). count() answers
    # immediately, so the attribute is only read when the element exists.
    # NOTE(review): assumes the hidden input, when present, is already in the
    # DOM at DOMContentLoaded - confirm against a YouTube-backed lesson.
    youtube_id: Optional[str] = None
    try:
        youtube_input = page.locator("input#youtube_video_id")
        if youtube_input.count() > 0:
            youtube_id = youtube_input.first.get_attribute("value")
    except Exception as e:
        # Best effort: fall back to the regular HLS URL below.
        logger.error("[YOUTUBE] Error retrieving ID: %s", e)
        youtube_id = None

    if youtube_id:
        logger.debug("[YOUTUBE] ID found: %s", youtube_id)

    # Read the video and course IDs.
    video_id = page.locator("input[name='video_id']").first.get_attribute("value")
    course_id = page.locator("input[name='course_id']").first.get_attribute("value")

    if not video_id or not course_id:
        error_message = f"[VIDEO] IDs not found: {url}"
        logger.error(error_message)
        raise VideoError(error_message)

    # Pick the media URL: YouTube when an ID was found, HLS storage otherwise.
    if youtube_id:
        m3u8_url = f"https://www.youtube.com/watch?v={youtube_id}"
    else:
        # The host has no trailing slash so the path is joined with exactly one.
        m3u8_url = (
            "https://video-storage.codigofacilito.com"
            f"/hls/{course_id}/{video_id}/playlist.m3u8"
        )

    # Derive the media type from the URL path.
    media_type: Optional[MediaType] = None
    if "/videos/" in url:
        media_type = MediaType.STREAMING
    elif "/articulos/" in url:
        media_type = MediaType.READING

    return Video(
        id=video_id,
        url=url,
        m3u8_url=m3u8_url,
        title=title,
        media_type=media_type,
        description=None,
    )

Pero tengo un error: "[YOUTUBE] Error retrieving ID: Timeout 30000ms exceeded.", y creo que tiene que ver con helpers.py. Creo que ustedes podrían ayudar a solucionarlo y a darle un mejor formato al código que medio implementé. Saludos.

De igual manera, ya está arreglado el nuevo método, con el cual ahora el video se obtiene de esta parte:

--   |

quedando el collector.py así:

"""Collectors for Facilito API"""

import os
import re
from typing import Optional

from playwright.sync_api import Page
from rich import print as tprint

from .. import consts
from ..errors import BootcampError, CourseError, URLError, VideoError
from ..helpers import (
    clean_bootcamp_title,
    clean_new_line,
    clean_string,
    is_bootcamp_url,
    is_course_url,
    is_video_url,
)
from ..models.bootcamp import Bootcamp, BootcampClass, BootcampModule, BootcampVideo
from ..models.course import Course, CourseSection, VideoURL
from ..models.video import MediaType, Video
from ..utils import expanders
from ..utils.logger import logger

def get_article_sync(url: str, page: Page) -> Page:
    """Load an article page and remove the player chrome around it.

    After navigating to ``url``, every ``div`` whose ``class`` attribute is
    exactly ``player-header``, ``row f-gap-medium middle-xs`` or
    ``player-sidebar relative`` is removed from the DOM.

    Args:
        url (str): The URL of the article page.
        page (Page): The Playwright page object used for navigation.

    Returns:
        Page: The same ``page`` object, after the elements were removed.
    """
    page.goto(url=url, wait_until=None)

    selectors = [
        "div[class='player-header']",
        "div[class='row f-gap-medium middle-xs']",
        "div[class='player-sidebar relative']",
    ]
    # One round trip for all selectors. Passing a function keeps every
    # variable local to the call instead of assigning to an undeclared name
    # on the page's global object.
    page.evaluate(
        """(selectors) => {
            for (const selector of selectors) {
                for (const element of document.querySelectorAll(selector)) {
                    element.parentNode.removeChild(element);
                }
            }
        }""",
        selectors,
    )
    return page

def get_video_detail_sync(url: str, page: Page) -> Video:
    """Retrieve detailed information for a video from its URL.

    The media URL stored in ``m3u8_url`` is chosen in this order:

    1. the YouTube watch URL, when the page has a non-empty
       ``input#youtube_video_id``;
    2. the storage host plus the ``videoUrl`` value declared in one of the
       page's inline scripts;
    3. the HLS playlist URL built from the page's course and video IDs.

    Args:
        url (str): The URL of the video page.
        page (Page): The Playwright page object.

    Returns:
        Video: An instance of the Video model with the retrieved details.

    Raises:
        URLError: If ``url`` is not a video URL.
        VideoError: If the title cannot be located, or the video/course ID
            values are empty.
    """

    if not is_video_url(url):
        error_message = f"[VIDEO] Invalid video URL: {url}"
        logger.error(error_message)
        raise URLError(error_message)

    # Navigate to the URL and wait until the DOM is ready.
    page.goto(url=url, wait_until="domcontentloaded")

    # Look up the video title (either of the two heading variants).
    try:
        title = page.locator(
            """
            h1[class='ibm bold-600 no-margin f-text-22'],
            h1[class='ibm bold-600 no-margin f-text-48']
            """
        ).inner_text(timeout=1000)
        title = clean_new_line(title)
        title = clean_string(title)
    except Exception as e:
        error_message = f"[VIDEO] Title not found: {url}"
        logger.error(error_message)
        raise VideoError(error_message) from e

    # Read the optional YouTube ID. Most pages do not have this input, and
    # calling get_attribute() on a locator with no match waits for the full
    # default timeout ("Timeout 30000ms exceeded"). count() answers
    # immediately, so the attribute is only read when the element exists.
    # NOTE(review): assumes the hidden input, when present, is already in the
    # DOM at DOMContentLoaded - confirm against a YouTube-backed lesson.
    youtube_id: Optional[str] = None
    try:
        youtube_input = page.locator("input#youtube_video_id")
        if youtube_input.count() > 0:
            youtube_id = youtube_input.first.get_attribute("value")
    except Exception as e:
        # Best effort: fall through to the other URL sources below.
        logger.error("[YOUTUBE] Error retrieving ID: %s", e)
        youtube_id = None

    if youtube_id:
        logger.debug("[YOUTUBE] ID found: %s", youtube_id)

    # Extract the videoUrl declared in an inline <script>. Every script is
    # tested and null is returned when none matches, so a page without the
    # declaration no longer throws inside the browser.
    video_url_js: Optional[str] = None
    try:
        video_url_js = page.evaluate(
            """() => {
                for (const script of document.querySelectorAll('script')) {
                    const match = script.textContent.match(/let videoUrl = "(.*?)"/);
                    if (match) {
                        return match[1];
                    }
                }
                return null;
            }"""
        )
    except Exception as e:
        logger.error("[VIDEO] Error extracting videoUrl: %s", e)
        video_url_js = None

    if video_url_js:
        logger.debug("[VIDEO] videoUrl found: %s", video_url_js)

    # Read the video and course IDs.
    video_id = page.locator("input[name='video_id']").first.get_attribute("value")
    course_id = page.locator("input[name='course_id']").first.get_attribute("value")

    if not video_id or not course_id:
        error_message = f"[VIDEO] IDs not found: {url}"
        logger.error(error_message)
        raise VideoError(error_message)

    # Pick the media URL in priority order: YouTube, page-declared videoUrl,
    # then the conventional HLS path.
    storage_host = "https://video-storage.codigofacilito.com"
    if youtube_id:
        m3u8_url = f"https://www.youtube.com/watch?v={youtube_id}"
    elif video_url_js:
        # NOTE(review): assumes videoUrl is a host-relative path starting
        # with "/" - confirm against a real page.
        m3u8_url = f"{storage_host}{video_url_js}"
    else:
        # The host has no trailing slash so the path is joined with exactly one.
        m3u8_url = f"{storage_host}/hls/{course_id}/{video_id}/playlist.m3u8"

    # Derive the media type from the URL path.
    media_type: Optional[MediaType] = None
    if "/videos/" in url:
        media_type = MediaType.STREAMING
    elif "/articulos/" in url:
        media_type = MediaType.READING

    return Video(
        id=video_id,
        url=url,
        m3u8_url=m3u8_url,
        title=title,
        media_type=media_type,
        description=None,
    )

# TODO: improve this function, handles more error cases 👇
def get_course_detail_sync(url: str, page: Page) -> Course:
    """
    Collect the title and sections of a course page.

    Args:
        url (str): The course URL to load.
        page (Page): The playwright page object used to load and query the page.

    Returns:
        Course: The course URL, the page title and the parsed sections.

    Raises:
        URLError: If ``url`` is not a course URL.
        CourseError: If reading the sections fails for any reason.
    """
    if not is_course_url(url):
        invalid_url_message = f"[COURSE] Invalid course URL: {url}"
        logger.error(invalid_url_message)
        raise URLError(invalid_url_message)

    page.goto(url=url, wait_until=None)

    # Expand collapsed sections before reading the page.
    expanders.expand_course_sections(page)

    # The browser page title is used as the course title.
    course_title = page.title()

    try:
        course_sections = _get_sections(page)
    except Exception as exc:
        failure_message = f"[COURSE] an error occurred: {url}"
        logger.error(failure_message)
        raise CourseError(failure_message) from exc

    return Course(url=url, title=course_title, sections=course_sections)

def get_bootcamp_detail_sync(url: str, page: Page) -> Bootcamp:
    """
    Retrieves detailed information about a bootcamp from a given URL.

    The bootcamp title is printed to the console, and the download folder
    ``<DOWNLOADS_DIR>/Bootcamp - <title>/`` is passed down as the base path
    for the modules.

    Args:
        url (str): The URL of the bootcamp to be detailed.
        page (Page): The playwright page object to interact with the webpage.

    Returns:
        Bootcamp: An object containing the bootcamp details.

    Raises:
        URLError: If ``url`` is not a bootcamp URL.
        BootcampError: If collecting the modules or building the result fails.
    """

    if not is_bootcamp_url(url):
        # The message names the bootcamp (it previously said "course").
        error_message = f"[BOOTCAMP] Invalid bootcamp URL: {url}"
        logger.error(error_message)
        raise URLError(error_message)

    page.goto(url=url, wait_until=None)
    expanders.expand_bootcamps_modules(page)

    # get bootcamp title
    bootcamp_title = clean_bootcamp_title(page.title())

    tprint(
        f"[bold red]Bootcamp title:[/bold red] [bright_red]{bootcamp_title}[/bright_red]"
    )

    # Base folder for this bootcamp; the trailing slash is required because
    # the path is extended by plain string concatenation further down.
    path = f"{consts.DOWNLOADS_DIR}/Bootcamp - {bootcamp_title}/"

    # get bootcamp modules
    try:
        all_modules = _get_modules(page=page, path=path)
        return Bootcamp(url=url, title=bootcamp_title, modules=all_modules)
    except Exception as e:
        error_message = f"[BOOTCAMP] an error occurred: {url}"
        logger.error(error_message)
        raise BootcampError(error_message) from e

def _get_sections(page: Page) -> list[CourseSection]:
    """Get course sections from a page.

    Every ``div`` whose class is exactly ``f-top-16`` and that contains an
    ``h4`` is treated as a section: the ``h4`` text is the section title, and
    each anchor inside it that has a title paragraph and an ``href`` becomes
    one video entry.

    Args:
        page (Page): The playwright page object representing the web page.

    Returns:
        list[CourseSection]: A list of CourseSection objects, in page order.
    """
    sections: list[CourseSection] = []

    # possibly some containers are empty
    sections_container_divs = page.query_selector_all("div[class='f-top-16']")

    for div in sections_container_divs:
        title_match = div.query_selector("h4")
        if title_match is None:
            continue

        # Read the title once and reuse it for logging and for the model.
        section_title = title_match.inner_text()
        logger.debug("[Section Title] %s", section_title)

        all_videos: list[VideoURL] = []
        for a_tag in div.query_selector_all("a"):
            p_element_title = a_tag.query_selector(
                "p[class='ibm f-text-16 bold no-margin-bottom f-top-small']"
            )
            if p_element_title is None:
                continue

            video_url = a_tag.get_attribute("href")
            if not video_url:
                # Without an href the concatenation below would produce a
                # URL ending in "None"; skip the anchor instead.
                continue

            all_videos.append(
                VideoURL(
                    title=p_element_title.inner_text(),
                    url=f"{consts.BASE_URL}{video_url}",
                )
            )

        logger.debug("This section has %s videos", len(all_videos))

        sections.append(
            CourseSection(
                title=section_title,
                videos_url=all_videos,
            ),
        )

    return sections

def _get_videos(url: str, page: Page, dict_info: dict) -> list[BootcampVideo]:
    """Collect the videos listed on a bootcamp class page.

    Each anchor inside the collapsible topics container yields one video. Its
    sequence number is the second space-separated token of the label
    paragraph. A leading "Clase Completa", optionally followed by "de" or
    "-", is stripped from the title unless the title is exactly
    "Clase Completa".

    Args:
        url (str): The URL of the class page.
        page (Page): The playwright page used to load ``url``.
        dict_info (dict): Must contain ``module_sequence``, ``module_title``,
            ``class_sequence`` and ``class_title``; used only to build the
            error message.

    Returns:
        list[BootcampVideo]: The videos in page order. If anything fails, the
        error is logged and printed and an empty list is returned, so the
        result always matches the annotated type.
    """
    # Compiled once per call rather than once per anchor.
    full_class_prefix = re.compile(
        r"Clase Completa(?:\s*de\s*|\s*-\s*)?(.*)", re.IGNORECASE
    )
    try:
        page.goto(url=url, wait_until=None)
        div_collapsible = page.query_selector(
            "div[class*='collapsible-body no-border topics-li']"
        )
        if div_collapsible is None:
            # Report a readable reason instead of an AttributeError on None.
            raise LookupError("topics container not found in class page")

        videos_class: list[BootcampVideo] = []
        for a_tag in div_collapsible.query_selector_all("a"):
            video_url = a_tag.get_attribute("href")
            sequence_label = clean_new_line(
                a_tag.query_selector(
                    "p[class*='no-margin h5 bold f-blues-text']"
                ).inner_html()
            )
            video_sequence = int(sequence_label.strip().split(" ")[1])

            video_title = a_tag.query_selector(
                "p[class*='ibm f-text-16 bold no-margin-bottom f-top-small']"
            ).inner_html()
            video_title = clean_new_line(video_title)
            if video_title.upper() != "CLASE COMPLETA":
                match = full_class_prefix.search(video_title)
                if match:
                    video_title = match.group(1)
            video_title = clean_string(video_title)

            tprint("      [bold green]Video title:[/bold green]", end=" ")
            tprint(f"[green]{video_sequence:02d}. {video_title}[/green]")
            videos_class.append(
                BootcampVideo(
                    url=f"{consts.BASE_URL}{video_url}",
                    sequence=video_sequence,
                    title=video_title,
                )
            )
        return videos_class
    except Exception as e:
        error_message = (
            f"[MODULE] {dict_info['module_sequence']:02d}. {dict_info['module_title']} "
        )
        error_message += (
            f"[CLASS] {dict_info['class_sequence']:02d}. {dict_info['class_title']} "
        )
        error_message += f"=> {e}"
        logger.error(error_message)
        tprint(f"Exception: {e}")
        return []

def _get_classes(a_tags, path: str, page: Page, dict_info: dict) -> list[BootcampClass]:
    """
    Get information about the classes that make up
    a bootcamp module

    For every anchor, the sequence number and the entry kind are parsed from
    its label paragraph with ``consts.CLASS_NAME``. Entries whose kind is
    "Curso" are not collected: they are reported through
    ``_print_error_class``. For every other entry the class page is opened in
    a temporary page to collect its videos.

    Args:
        a_tags: The anchor elements found inside the module.
        path (str): Module folder path, forwarded to ``_print_error_class``.
        page (Page): Page whose browser context is used to open class pages.
        dict_info (dict): Must contain ``module_sequence`` and
            ``module_title``.

    Raises:
        AttributeError: If an anchor has no label paragraph, or the label
            does not match ``consts.CLASS_NAME``.

    Returns:
        list[BootcampClass]: List with all information of classes
    """

    all_classes: list[BootcampClass] = []
    for a_tag in a_tags:
        p_title = a_tag.query_selector(
            "p[class='ibm f-text-16 bold no-margin-bottom f-top-small']"
        )
        p_tag = (
            a_tag.query_selector("p[class*='no-margin h5 bold f-blues-text--2']")
        ).inner_html()
        match = re.search(consts.CLASS_NAME, p_tag)
        class_sequence = int(match.group(1))
        class_kind = match.group(2)

        if p_title is None:
            # An entry without a title cannot be collected; log and move on.
            error_message = (
                f"[MODULE] {dict_info['module_title']} [CLASS] {class_sequence:02d}"
            )
            logger.error(error_message)
            continue

        class_title = clean_string(p_title.inner_html())
        # class_url is already absolute: BASE_URL is prepended exactly once.
        class_url = consts.BASE_URL + a_tag.get_attribute("href")
        class_info = {
            "module_sequence": dict_info["module_sequence"],
            "module_title": dict_info["module_title"],
            "class_sequence": class_sequence,
            "class_title": class_title,
        }

        if class_kind == "Curso":
            _print_error_class(
                path=path,
                dict_info={**class_info, "class_url": class_url},
            )
            continue

        tprint("    [bold magenta]Class Title:[/bold magenta]", end=" ")
        tprint(
            f"[bright_magenta]{class_sequence:02d}. {class_title}[/bright_magenta]"
        )

        new_page = page.context.new_page()
        try:
            all_videos = _get_videos(class_url, new_page, dict_info=class_info)
        finally:
            # Always release the temporary page, even if collection raised.
            if not new_page.is_closed():
                new_page.close()

        all_classes.append(
            BootcampClass(
                sequence=class_sequence,
                title=class_title,
                # Previously BASE_URL was prefixed a second time here.
                url=class_url,
                videos=all_videos,
            )
        )
    return all_classes

def _get_modules(page: Page, path: str) -> list[BootcampModule]:
    """
    Get bootcamp modules from a page.

    The modules are the ``li`` items of the first collapsible topics list on
    the page. For each one, the title comes from its ``h4`` and the sequence
    number is the second non-empty line of the green label ``span``.

    Args:
        page (Page): The playwright page object representing the web page.
        path (str): Bootcamp folder path ending in "/". Each module gets its
            own ``<path><NN>. <title>/`` folder path, which is forwarded to
            ``_get_classes``.

    Returns:
        list[BootcampModule]: A list of BootcampModule objects.
    """

    all_modules: list[BootcampModule] = []
    all_ul = page.query_selector_all(
        "ul[class='collapsible no-box-shadow no-border f-topics flex-column f-top f-gap-medium flex-block f-dark-mode']"
    )[0]
    modules_li = all_ul.query_selector_all("li[class*='f-radius-small']")
    for module in modules_li:
        # Skip entries without a title before anything else is parsed. The
        # original None check ran only after code that had already raised.
        title_element = module.query_selector("h4")
        if title_element is None:
            continue
        module_title = clean_string(title_element.inner_html())
        if module_title is None:
            continue

        span_module = (
            module.query_selector("span[class='f-green-text f-green-text--2 bold h5']")
            .inner_html()
            .split("\n")
        )
        span_module = list(filter(None, span_module))
        module_sequence = int(span_module[1])

        logger.debug("[Module Title] %s", module_title)
        tprint("  [bold cyan]Module Title:[/bold cyan]", end=" ")
        tprint(f"[bright_cyan]{module_sequence:02d}. {module_title}[/bright_cyan]")

        # Build the module folder from the unchanged bootcamp path. Using
        # "path +=" here nested every module inside the previous module's
        # folder.
        module_path = f"{path}{module_sequence:02d}. {module_title}/"

        a_tags = module.query_selector_all("a")
        all_classes = _get_classes(
            a_tags=a_tags,
            path=module_path,
            page=page,
            dict_info={
                "module_sequence": module_sequence,
                "module_title": module_title,
            },
        )
        bootcamp_module_obj = BootcampModule(
            sequence=module_sequence, title=module_title, classes=all_classes
        )
        all_modules.append(bootcamp_module_obj)

    return all_modules

def _generate_file(path: str, file_name: str, type_file: str, url: str) -> bool:
    """Write a note file saying that a bootcamp entry is a full course.

    The file ``<file_name>.<type_file>`` is created (or overwritten) inside
    ``path``, and ``path`` is created first when it does not exist.

    Args:
        path (str): Target directory, with or without a trailing separator.
        file_name (str): File name without extension; also written in the note.
        type_file (str): File extension without the leading dot.
        url (str): URL of the course, written in the note.

    Returns:
        bool: True if the file exists after writing.
    """
    if path:
        # exist_ok makes a prior existence check unnecessary.
        os.makedirs(path, exist_ok=True)

    # os.path.join gives the same result as plain concatenation when ``path``
    # ends with a separator, and still targets the directory when it does not.
    file_path = os.path.join(path, f"{file_name}.{type_file}")
    with open(file_path, "w", encoding="utf-8") as f:
        f.writelines(
            (
                "  -- CURSO NO CLASE --  \n",
                "En este bootcamp, esta no es una clase sino un curso completo,",
                " puede revisarlo:\n",
                f"Curso: {file_name} => {url}\n",
                "  -- CURSO NO CLASE --  \n",
            )
        )
    return os.path.exists(file_path)

def _print_error_class(path: str, dict_info: dict):
    """Report a bootcamp entry that is a full course instead of a class.

    Prints a banner with the module and course details to the console, then
    writes a ``.txt`` note into ``path`` through ``_generate_file``. A second
    message is printed when that file could not be generated.

    Args:
        path (str): Directory in which the note file is written.
        dict_info (dict): Must contain ``module_sequence``, ``module_title``,
            ``class_sequence``, ``class_title`` and ``class_url``.
    """
    banner = "[blink bold red]  -- CURSO NO CLASE --  [/blink bold red]"

    tprint(banner)
    tprint("La siguiente no es una clase sino un curso completo, revisarlo:")

    # Module line.
    tprint("[bold bright_blue][MODULO][/bold bright_blue]", end=" ")
    tprint(f"[bright_blue]{dict_info['module_sequence']:02d}. [/bright_blue]", end="")
    tprint(f"[bright_blue]{dict_info['module_title']}[/bright_blue]")

    # Course line, followed by its link.
    tprint("[bold yellow][CURSO][/bold yellow]", end=" ")
    tprint(f"[yellow]{dict_info['class_sequence']:02d}. [/yellow]", end="")
    tprint(f"[yellow]{dict_info['class_title']}[/yellow]", end=" ")
    tprint(f"=> [link]{dict_info['class_url']}[/link]")

    tprint(banner)

    # The same "<NN>. <title>" label names the file and the failure message.
    note_name = f"{dict_info['class_sequence']:02d}. {dict_info['class_title']}"
    created = _generate_file(
        path=path,
        file_name=note_name,
        type_file="txt",
        url=dict_info["class_url"],
    )
    if created:
        return

    tprint("[bold red]Error ![/bold red] generating file => ", end="")
    tprint(note_name)

# Names exported by ``from <this module> import *``. get_article_sync and the
# underscore-prefixed helpers are defined in this module but are not listed.
__all__ = [
    "get_video_detail_sync",
    "get_course_detail_sync",
    "get_bootcamp_detail_sync",
]
ivansaul commented 1 month ago

Hola @Slider32MX . Muchas gracias por reportar el problema y por tomarte el tiempo de investigar una solución. Es muy útil saber que algunos cursos tienen enlaces a YouTube en lugar de los típicos de CF. Aprecio mucho que hayas compartido el código modificado para manejar estos casos, lo revisaré y consideraré cómo integrarlo en la próxima actualización.

Gracias de nuevo por tu contribución. Saludos! ✌️