import re

import requests


class LinkScrape:
    """Collects URLs found in HTML and sorts them by whether they point at known hostnames."""

    hostnames: list[str]
    _known_urls: list[str]    # URLs whose host matches one of `hostnames` (or relative URLs)
    _unknown_urls: list[str]  # URLs pointing at any other host

    def __init__(self, hostnames: list[str]):
        self.hostnames = hostnames
        self._known_urls = []
        self._unknown_urls = []

    def get_html(self, url: str) -> str | None:
        """Fetch a page and return its HTML, or None if the request fails."""
        try:
            # A timeout keeps a stalled request from hanging the scraper indefinitely.
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                return response.text
            print(f"Error: {response.status_code} - Unable to fetch HTML content")
            return None
        except Exception as e:
            print(f"An error occurred: {e}")
            return None

    def url_contains_hostname(self, url: str) -> bool:
        """Return True if `url` is relative or its host matches one of the known hostnames."""
        # Root-relative URLs implicitly belong to the site they were scraped from.
        if url.startswith('/'):
            return True

        # Extract the host from an absolute URL; http, https, and hosts without a
        # trailing slash all match.
        match = re.search(r'https?://([^/]+)', url)
        if match is None:
            return False
        hostname = match.group(1)

        return any(check in hostname for check in self.hostnames)

    def extract_urls(self, html: str) -> None:
        """Scan the HTML line by line and record every href/src URL it finds."""
        pattern = re.compile(r'href="([^"]+)"|src="([^"]+)"')

        for line in html.splitlines():
            if 'href=' not in line and 'src=' not in line:
                continue
            for match in pattern.finditer(line):
                # Exactly one of the two alternation groups is populated per match.
                url = match.group(1) if match.group(1) is not None else match.group(2)
                if url is None:
                    continue
                print(f'Found URL: {url}')
                if self.url_contains_hostname(url):
                    if url not in self._known_urls:
                        self._known_urls.append(url)
                elif url not in self._unknown_urls:
                    self._unknown_urls.append(url)

    def get_known_urls(self) -> list[str]:
        return self._known_urls

    def get_unknown_urls(self) -> list[str]:
        return self._unknown_urls
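

# Minimal usage sketch (not part of the original class): the hostname list and URL
# below are hypothetical placeholders, and a real run depends on network access.
if __name__ == "__main__":
    scraper = LinkScrape(hostnames=["example.com"])   # assumed example hostname
    html = scraper.get_html("https://example.com/")   # assumed example URL
    if html is not None:
        scraper.extract_urls(html)
        print("Known URLs:", scraper.get_known_urls())      # same-host or relative links
        print("Unknown URLs:", scraper.get_unknown_urls())  # links to other hosts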