Source code for fortrace.utility.web_scraper

from typing import Literal
from urllib.parse import urljoin

import requests
import urllib3
from bs4 import BeautifulSoup

from fortrace.utility.logger_helper import setup_logger

logger = setup_logger(__name__)



[docs]
def scrape_web_page(
    url: str | urllib3.util.Url, element: Literal["a", "img"]
) -> list[urllib3.util.Url]:
    """Scrape a web page for all urls behind certain elements.

    The page is fetched with a 10 second timeout and parsed with the
    ``html.parser`` backend of BeautifulSoup.

    Args:
        url: the web page to be scraped
        element: which kind of element links should be scraped from the page;
            ``"a"`` collects the ``href`` of anchors, ``"img"`` collects the
            ``src`` of images. Any other value yields an empty list.

    Returns:
        list of all urls from the web page (relative ones will be made
        absolute); an empty list if the request timed out
    """
    start_url = (
        url if isinstance(url, urllib3.util.Url) else urllib3.util.parse_url(url)
    )
    try:
        page = requests.get(start_url, timeout=10)
    except requests.Timeout:
        logger.error("Received timeout exception for request to %s", url)
        return []
    soup = BeautifulSoup(page.text, "html.parser")

    urls = []
    if element == "a":
        # href=True skips anchors that have no href attribute at all
        for a in soup.find_all("a", href=True):
            a_url = urllib3.util.parse_url(a["href"])
            if a_url.scheme == "javascript":
                continue  # TODO: support scraping of javascript pages with webdriver of selenium package
            urls.append(_absolute_url(start_url, a_url))
    elif element == "img":
        # src=True skips images that have no src attribute at all
        for img in soup.find_all("img", src=True):
            # parse the src attribute value, not the tag object itself
            img_url = urllib3.util.parse_url(img["src"])
            urls.append(_absolute_url(start_url, img_url))
            # images can then be downloaded by browsing to the url and pressing ctrl-s
    return urls


def _absolute_url(
    base_url: urllib3.util.Url, link: urllib3.util.Url
) -> urllib3.util.Url:
    """Return ``link`` unchanged if it is http(s), else resolve it against ``base_url``."""
    if link.scheme in ("http", "https"):
        return link
    # link is a relative url (or uses a scheme urljoin leaves untouched)
    return urllib3.util.parse_url(urljoin(str(base_url), str(link)))