interlock-network / bouncer

Discord bot to scan & neutralize malicious links
GNU General Public License v3.0
5 stars 0 forks source link

Refactor URL identification to be more savvy #171

Closed DecentralizedDan closed 2 years ago

DecentralizedDan commented 2 years ago

Please attribute to Le0Developer in the README (or elsewhere)

import typing

# URL schemes that has_url() and get_urls() look for; each one is matched
# as the literal text "<scheme>://".
SCHEMES = ("http", "https")

def has_url(content: str) -> bool:
    """Return True if *content* contains "<scheme>://" for any known scheme.

    This is a plain substring check against every entry of ``SCHEMES``; it
    does not validate that what follows the marker is a well-formed URL.
    """
    return any(f"{scheme}://" in content for scheme in SCHEMES)

def get_urls(content: str) -> typing.Generator[str, None, None]:
    """Yield the URLs found in *content*, at most one per word.

    *content* is split on whitespace. For every word that contains a
    "<scheme>://" marker for one of ``SCHEMES``, everything from the
    earliest such marker to the end of the word is treated as the URL,
    lightly cleaned up (see the inline comments) and yielded as
    ``scheme + "://" + rest``.

    Args:
        content: Arbitrary text, e.g. a chat message.

    Yields:
        One URL string per word that contains a known scheme marker.
    """
    for word in content.split():
        # cheap pre-filter: no scheme marker can match without "://"
        if "://" not in word:
            continue

        # urls can start anywhere in a word
        # eg: testhttp://test.com is valid
        #
        # Use the scheme whose marker occurs EARLIEST in the word, not the
        # first scheme in SCHEMES that happens to occur somewhere. Otherwise
        # "https://a.com/?r=http://b.com" would be reported as
        # "http://b.com" and the real (outer) url would be missed.
        start, scheme = -1, ""
        for candidate in SCHEMES:
            position = word.find(f"{candidate}://")
            if position >= 0 and (start < 0 or position < start):
                start, scheme = position, candidate
        if start < 0:
            continue

        # everything after "<scheme>://"
        url = word[start + len(scheme) + 3:]
        if url.endswith(")"):
            # handle urls with () in their path correctly: drop the final
            # ")" only when it has no matching "(" inside the url
            # test.com/)   -> test.com/
            # test.com/()  -> test.com/()
            # test.com/()) -> test.com/()
            count = url.count("(") - url.count(")")
            url = url[:-1] if count < 0 else url
        else:
            # trailing "]" characters are not treated as part of the url
            url = url.rstrip("]")

        # protocol://\test.com embeds but doesnt appear clickable,
        # but lets still parse that correctly
        if url.startswith("\\"):
            url = url.lstrip("\\/")

        yield scheme + "://" + url

def remove_urls(content: str) -> str:
    """Return *content* with every URL reported by ``get_urls`` removed.

    Each URL is deleted with a plain ``str.replace``, so all verbatim
    occurrences of it disappear.

    NOTE: a URL that ``get_urls`` cleaned up (e.g. stripped a leading
    backslash from) no longer appears verbatim in *content* and is
    therefore left in place.
    """
    # Remove the longest urls first. Given
    # "http://test.com/ http://test.com/abcdef", deleting the shorter url
    # first would also eat the front of the longer one and leave "abcdef"
    # behind.
    for found in sorted(get_urls(content), key=len, reverse=True):
        content = content.replace(found, "")
    return content
jmercouris commented 2 years ago

Completed!