From 038b8ce6e979c8c986105a4a30039d30a3e7a34c Mon Sep 17 00:00:00 2001
From: xoy
Date: Wed, 20 Dec 2023 21:12:40 +0100
Subject: [PATCH] First commit

---
 .gitignore    |   3 +++
 linkscrape.py | 104 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 107 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 linkscrape.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..061ef89
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+venv/
+__pycache__/
+main.py
\ No newline at end of file
diff --git a/linkscrape.py b/linkscrape.py
new file mode 100644
--- /dev/null
+++ b/linkscrape.py
@@ -0,0 +1,104 @@
+"""Collect the href/src URLs of an HTML page and sort them by hostname."""
+
+import re
+from urllib.parse import urlsplit
+
+import requests
+
+
+class LinkScrape:
+    """Sort the URLs found in HTML into known and unknown hosts.
+
+    A URL is "known" when it is a root-relative path or when its host
+    contains one of the configured hostnames; every other URL is "unknown".
+    """
+
+    hostnames: list[str]
+    _known_urls: list[str]  # URLs including hostname
+    _unknown_urls: list[str]  # URLs excluding hostname
+
+    def __init__(self, hostnames: str | list[str]):
+        """Store the hostnames to match and start with empty URL lists.
+
+        A bare string is wrapped in a list; iterating it directly would
+        compare single characters against the host.
+        """
+        if isinstance(hostnames, str):
+            hostnames = [hostnames]
+        self.hostnames = list(hostnames)
+        self._known_urls = []
+        self._unknown_urls = []
+
+    def get_html(self, url: str, timeout: float = 10.0) -> str | None:
+        """Return the body of a 200 response for url, or None on failure.
+
+        Failures are printed rather than raised. The timeout (seconds)
+        keeps an unresponsive server from blocking forever.
+        """
+        try:
+            response = requests.get(url, timeout=timeout)
+        except requests.RequestException as e:
+            print(f"An error occurred: {str(e)}")
+            return None
+
+        if response.status_code == 200:
+            return response.text
+        print(f"Error: {response.status_code} - Unable to fetch HTML content")
+        return None
+
+    def url_contains_hostname(self, url: str) -> bool:
+        """Return True if url points at one of the configured hostnames.
+
+        Root-relative paths ("/a/b") count as the current host. Absolute
+        http(s) and protocol-relative ("//host/a") URLs are matched by
+        host. Anything else (other schemes, "page.html", "#top", or the
+        empty string) returns False.
+        """
+        if url.startswith('/') and not url.startswith('//'):
+            return True
+
+        try:
+            parts = urlsplit(url)
+        except ValueError:  # e.g. a malformed IPv6 literal
+            return False
+        if parts.scheme not in ('', 'http', 'https') or not parts.hostname:
+            return False
+
+        # NOTE: substring match, so "example.com" also accepts
+        # "cdn.example.com" (and "notexample.com").
+        host = parts.hostname
+        return any(check.lower() in host for check in self.hostnames)
+
+    def extract_urls(self, html: str) -> list[str]:
+        """Record every href/src attribute URL found in html.
+
+        Each URL is added once to the known or the unknown list. The
+        returned list holds the URLs of this call in document order,
+        without duplicates. Only double-quoted attributes are matched.
+        """
+        pattern = re.compile(r'href="([^"]+)"|src="([^"]+)"')
+        found: list[str] = []
+
+        for line in html.splitlines():
+            for match in pattern.finditer(line):
+                # Exactly one of the two groups is set for any match.
+                url = match.group(1) or match.group(2)
+                print(f'Found URL: {url}')
+                if url not in found:
+                    found.append(url)
+                if self.url_contains_hostname(url):
+                    bucket = self._known_urls
+                else:
+                    bucket = self._unknown_urls
+                if url not in bucket:
+                    bucket.append(url)
+
+        return found
+
+    def get_known_urls(self) -> list[str]:
+        """Return the URLs classified as belonging to a known host."""
+        return self._known_urls
+
+    def get_unknown_urls(self) -> list[str]:
+        """Return the URLs classified as not belonging to a known host."""
+        return self._unknown_urls
\ No newline at end of file