LinkScrape/linkscrape.py

62 lines
2.1 KiB
Python
Raw Permalink Normal View History

2023-12-20 20:12:40 +00:00
import requests
import re
class LinkScrape:
hostname: str
_known_urls: list[str] # URLs including hostname
_unknown_urls: list[str] # URLs excluding hostname
def __init__(self, hostnames: str):
self.hostnames = hostnames
self._known_urls = list()
self._unknown_urls = list()
def get_html(self, url: str) -> str | bool:
try:
response = requests.get(url)
if response.status_code == 200:
return response.text
else:
print(f"Error: {response.status_code} - Unable to fetch HTML content")
return None
except Exception as e:
print(f"An error occurred: {str(e)}")
return None
def url_contains_hostname(self, url: str) -> bool:
if url[0] == '/': return True
pattern = r'https://([^/]+)/'
match = re.search(pattern, url)
if match is not None:
hostname = match.group(1)
else:
return False
for check in self.hostnames:
if check in hostname: return True
return False
def extract_urls(self, html: str) -> list[str]:
lines = html.splitlines()
pattern = re.compile(r'href="([^"]+)"|src="([^"]+)"')
for line in lines:
if 'href=' in line or 'src=' in line:
for match in pattern.finditer(line):
url = match.group(1) if (match.group(1) is not None) else match.group(2)
if url is not None:
print(f'Found URL: {url}')
if self.url_contains_hostname(url):
if not url in self._known_urls:
self._known_urls.append(url)
else:
if not url in self._unknown_urls:
self._unknown_urls.append(url)
def get_known_urls(self) -> list[str]:
return self._known_urls
def get_unknown_urls(self) -> list[str]:
return self._unknown_urls