from typing import Literal
from urllib.parse import urljoin
import requests
import urllib3
from bs4 import BeautifulSoup
from fortrace.utility.logger_helper import setup_logger
# Module-level logger, created through fortrace's logger helper and named after this module.
logger = setup_logger(__name__)
[docs]
def scrape_web_page(
    url: str | urllib3.util.Url, element: Literal["a", "img"]
) -> list[urllib3.util.Url]:
    """Scrape a web page for all urls behind certain elements.

    Args:
        url: the web page to be scraped
        element: which kind of element links should be scraped from the page
            ("a" collects the ``href`` targets, "img" collects the ``src`` targets)

    Returns:
        list of all urls from the web page (relative ones will be made absolute).
        The list is empty if the request timed out or ``element`` is not one of
        the supported values. ``javascript:`` links and links that cannot be
        parsed are skipped.
    """
    start_url = (
        url if isinstance(url, urllib3.util.Url) else urllib3.util.parse_url(url)
    )
    # requests works on the string form; it is also the base for resolving
    # relative links further down.
    base_url = str(start_url)

    try:
        page = requests.get(base_url, timeout=10)
    except requests.Timeout:
        logger.error("Received timeout exception for request to %s", url)
        return []

    # Attribute that carries the link for each supported element.
    link_attribute = {"a": "href", "img": "src"}.get(element)
    if link_attribute is None:
        return []

    soup = BeautifulSoup(page.text, "html.parser")
    urls = []
    for tag in soup.find_all(element, **{link_attribute: True}):
        # Read the attribute value; passing the tag object itself to
        # parse_url (as the img branch used to) does not yield a usable url.
        raw_link = tag[link_attribute]
        try:
            link = urllib3.util.parse_url(raw_link)
            if link.scheme == "javascript":
                continue  # TODO: support scraping of javascript pages with webdriver of selenium package
            if link.scheme not in ("http", "https"):
                # Relative url: resolve the *raw* attribute value against the
                # page url. Round-tripping it through parse_url first would
                # let urllib3 normalise (e.g. lower-case) a relative path
                # that it mistakes for a host name.
                link = urllib3.util.parse_url(urljoin(base_url, raw_link))
        except urllib3.exceptions.LocationParseError:
            # One malformed link should not abort scraping the rest of the page.
            logger.warning("Skipping unparsable url %r found on %s", raw_link, url)
            continue
        urls.append(link)

    # images can then be downloaded by browsing to the url and pressing ctrl-s
    return urls