# Source code for fortrace.utility.web_scraper

from typing import Literal
from urllib.parse import urljoin

import requests
import urllib3
from bs4 import BeautifulSoup

from fortrace.utility.logger_helper import setup_logger

logger = setup_logger(__name__)


def _resolve_link(base_url: str, raw_link: str) -> urllib3.util.Url | None:
    """Turn a raw ``href``/``src`` attribute value into an absolute, parsed url.

    Args:
        base_url: url of the page the link was found on, used to resolve
            relative links
        raw_link: attribute value exactly as it appears in the document

    Returns:
        the parsed absolute url, or ``None`` if the link is a ``javascript:``
        pseudo-link or cannot be parsed by urllib3
    """
    # Surrounding whitespace is not part of the url and would hide the scheme.
    raw_link = raw_link.strip()
    try:
        parsed = urllib3.util.parse_url(raw_link)
        if parsed.scheme == "javascript":
            # TODO: support scraping of javascript pages with webdriver of selenium package
            return None
        if parsed.scheme in ("http", "https"):
            return parsed
        # Anything else is treated as relative. Join the raw attribute value
        # (not the re-serialised parse result) so the link text reaches
        # urljoin unmodified by any normalisation parse_url applies.
        return urllib3.util.parse_url(urljoin(base_url, raw_link))
    except urllib3.exceptions.LocationParseError:
        # A single malformed link should not abort scraping the whole page.
        logger.warning("Skipping unparsable link %r found on %s", raw_link, base_url)
        return None


def scrape_web_page(
    url: str | urllib3.util.Url, element: Literal["a", "img"]
) -> list[urllib3.util.Url]:
    """Scrape a web page for all urls behind certain elements.

    The page is fetched with a 10 second timeout and parsed with
    BeautifulSoup's ``html.parser``. Links are taken from the ``href``
    attribute of ``<a>`` elements or from the ``src`` attribute of ``<img>``
    elements, depending on ``element``.

    Args:
        url: the web page to be scraped
        element: which kind of element links should be scraped from the page

    Returns:
        list of all urls from the web page (relative ones will be made
        absolute). The list is empty if the request timed out or if
        ``element`` is not one of the supported values. ``javascript:`` links
        and links that cannot be parsed are left out.
    """
    start_url = (
        url if isinstance(url, urllib3.util.Url) else urllib3.util.parse_url(url)
    )
    base_url = str(start_url)

    try:
        page = requests.get(base_url, timeout=10)
    except requests.Timeout:
        logger.error("Received timeout exception for request to %s", url)
        return []

    # Attribute that carries the link for each supported element.
    link_attribute = {"a": "href", "img": "src"}.get(element)
    if link_attribute is None:
        return []

    soup = BeautifulSoup(page.text, "html.parser")

    urls = []
    # Only consider tags that actually have the link attribute set.
    for tag in soup.find_all(element, attrs={link_attribute: True}):
        link = _resolve_link(base_url, tag[link_attribute])
        if link is not None:
            urls.append(link)

    # images can then be downloaded by browsing to the url and pressing ctrl-s
    return urls