Skip to content

Commit 1844640

Browse files
committed
Remove python-bs4 dependency
1 parent 0c15f86 commit 1844640

2 files changed

Lines changed: 116 additions & 107 deletions

File tree

biglinux-webapps/usr/share/biglinux/webapps/webapps/utils/url_utils.py

Lines changed: 115 additions & 106 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,89 @@
1010
import os
1111
import io
1212
from urllib.parse import urlparse, urljoin
13-
from bs4 import BeautifulSoup
13+
from html.parser import HTMLParser
1414
from PIL import Image # Add Pillow import
1515

1616
gi.require_version("Gtk", "4.0")
1717
from gi.repository import GLib
1818

1919

20+
class WebsiteMetadataParser(HTMLParser):
    """Parser for extracting title and icons from HTML.

    Collects, in a single pass over the document:
      - the text of the <title> element,
      - Open Graph / Twitter card titles and images from <meta> tags,
      - icon URLs from <link rel="icon"/"shortcut icon"/"apple-touch-icon"/"mask-icon">.
    Use get_best_title() and get_all_icons() after feed() to read the results.
    """

    # rel substrings that identify an icon <link> tag
    _ICON_RELS = ("icon", "shortcut icon", "apple-touch-icon", "mask-icon")

    def __init__(self):
        super().__init__()
        self.title = None          # text accumulated from the <title> element
        self.icons = []            # hrefs of icon <link> tags, in document order
        self.og_title = None       # content of <meta property="og:title">
        self.twitter_title = None  # content of <meta name="twitter:title">
        self.og_image = None       # content of <meta property="og:image">
        self.twitter_image = None  # content of <meta name="twitter:image">
        self._in_title = False     # True while parsing inside <title>...</title>

    def handle_starttag(self, tag, attrs):
        """Collect metadata from <title>, <meta>, and <link> start tags."""
        attrs_dict = dict(attrs)

        if tag == "title":
            self._in_title = True

        elif tag == "meta":
            # Handle Open Graph and Twitter metadata.
            # NOTE: HTMLParser reports a valueless attribute as None, and
            # dict.get's default does not replace an explicit None value,
            # so normalize with `or ""` before comparing.
            property_attr = attrs_dict.get("property") or ""
            name_attr = attrs_dict.get("name") or ""
            content = attrs_dict.get("content")

            if content:
                if property_attr == "og:title":
                    self.og_title = content
                elif name_attr == "twitter:title":
                    self.twitter_title = content
                elif property_attr == "og:image":
                    self.og_image = content
                elif name_attr == "twitter:image":
                    self.twitter_image = content

        elif tag == "link":
            # BUGFIX: a bare `rel` attribute (e.g. <link rel href=...>)
            # yields None, which previously crashed on .lower().
            rel = (attrs_dict.get("rel") or "").lower()
            href = attrs_dict.get("href")

            if href:
                # Match common icon rel types
                if any(x in rel for x in self._ICON_RELS):
                    self.icons.append(href)

    def handle_endtag(self, tag):
        if tag == "title":
            self._in_title = False

    def handle_data(self, data):
        # The parser may deliver the title text in several chunks
        # (e.g. around character references), so accumulate them.
        if self._in_title:
            if self.title is None:
                self.title = data
            else:
                self.title += data

    def get_best_title(self):
        """Return the best available title, or None.

        Preference order: <title> text, then og:title, then twitter:title.
        The returned string is stripped of surrounding whitespace.
        """
        if self.title:
            return self.title.strip()
        if self.og_title:
            return self.og_title.strip()
        if self.twitter_title:
            return self.twitter_title.strip()
        return None

    def get_all_icons(self):
        """Return all discovered icon URLs.

        <link>-declared icons come first (document order), followed by the
        og:image and twitter:image URLs when present. URLs may be relative;
        the caller is responsible for resolving them.
        """
        all_icons = self.icons.copy()
        if self.og_image:
            all_icons.append(self.og_image)
        if self.twitter_image:
            all_icons.append(self.twitter_image)
        return all_icons
94+
95+
2096
class WebsiteInfoFetcher:
2197
"""Class for fetching website information like title and favicons"""
2298

@@ -56,14 +132,48 @@ def _fetch_info_thread(self, url, callback):
56132
response = session.get(url, timeout=10)
57133
response.raise_for_status()
58134

59-
# Parse the HTML
60-
soup = BeautifulSoup(response.text, "html.parser")
135+
# Parse the HTML with our custom parser
136+
parser = WebsiteMetadataParser()
137+
parser.feed(response.text)
61138

62139
# Get the title
63-
title = self._extract_title(soup, url)
140+
title = parser.get_best_title()
141+
if title:
142+
# Clean up the title
143+
title = re.sub(r"\s+", " ", title)
144+
else:
145+
# Fallback: Use domain name
146+
domain = urlparse(url).netloc.replace("www.", "")
147+
title = domain.capitalize()
64148

65149
# Get favicons
66-
icons = self._extract_favicons(soup, url, session)
150+
raw_icons = parser.get_all_icons()
151+
icons = []
152+
153+
# Normalize icon URLs
154+
base_url = url
155+
for icon_href in raw_icons:
156+
if icon_href:
157+
if not icon_href.startswith(("http://", "https://")):
158+
icon_href = urljoin(base_url, icon_href)
159+
if icon_href not in icons:
160+
icons.append(icon_href)
161+
162+
# Look for favicon in common locations (root)
163+
parsed_url = urlparse(url)
164+
domain_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
165+
favicon_url = urljoin(domain_url, "/favicon.ico")
166+
167+
# Check if we already have this specific favicon
168+
if favicon_url not in icons:
169+
try:
170+
head_response = session.head(favicon_url, timeout=5)
171+
if head_response.status_code == 200:
172+
icons.insert(
173+
0, favicon_url
174+
) # Prioritize default favicon if found
175+
except Exception:
176+
pass
67177

68178
# Save icons to temporary files
69179
icon_paths = []
@@ -82,107 +192,6 @@ def _fetch_info_thread(self, url, callback):
82192
print(f"Error fetching website info: {e}")
83193
GLib.idle_add(callback, "", [])
84194

85-
def _extract_title(self, soup, url):
86-
"""
87-
Extract title from HTML
88-
89-
Parameters:
90-
soup (BeautifulSoup): Parsed HTML
91-
url (str): Website URL
92-
93-
Returns:
94-
str: Website title
95-
"""
96-
# Try to get the title tag
97-
title_tag = soup.find("title")
98-
if title_tag and title_tag.text:
99-
# Clean up the title
100-
title = title_tag.text.strip()
101-
title = re.sub(r"\s+", " ", title)
102-
return title
103-
104-
# Alternative: Try Open Graph title
105-
og_title = soup.find("meta", property="og:title")
106-
if og_title and og_title.get("content"):
107-
return og_title.get("content").strip()
108-
109-
# Alternative: Try Twitter title
110-
twitter_title = soup.find("meta", attrs={"name": "twitter:title"})
111-
if twitter_title and twitter_title.get("content"):
112-
return twitter_title.get("content").strip()
113-
114-
# Fallback: Use domain name
115-
domain = urlparse(url).netloc.replace("www.", "")
116-
return domain.capitalize()
117-
118-
def _extract_favicons(self, soup, url, session):
119-
"""
120-
Extract favicon URLs from HTML
121-
122-
Parameters:
123-
soup (BeautifulSoup): Parsed HTML
124-
url (str): Website URL
125-
session (requests.Session): Requests session
126-
127-
Returns:
128-
list: List of favicon URLs
129-
"""
130-
base_url = url
131-
parsed_url = urlparse(url)
132-
domain = f"{parsed_url.scheme}://{parsed_url.netloc}"
133-
134-
icons = []
135-
136-
# Look for favicon in common locations first
137-
favicon_url = urljoin(domain, "/favicon.ico")
138-
try:
139-
response = session.head(favicon_url, timeout=5)
140-
if response.status_code == 200:
141-
icons.append(favicon_url)
142-
except Exception:
143-
pass
144-
145-
# Find link rel="icon" and rel="shortcut icon" tags
146-
icon_links = soup.find_all(
147-
"link",
148-
rel=re.compile(r"(shortcut icon|icon|apple-touch-icon|mask-icon)", re.I),
149-
)
150-
151-
for link in icon_links:
152-
href = link.get("href")
153-
if href:
154-
# Make sure the URL is absolute
155-
if not href.startswith(("http://", "https://")):
156-
href = urljoin(base_url, href)
157-
icons.append(href)
158-
159-
# Find Apple touch icons
160-
apple_icons = soup.find_all("link", rel=re.compile(r"apple-touch-icon", re.I))
161-
for link in apple_icons:
162-
href = link.get("href")
163-
if href:
164-
if not href.startswith(("http://", "https://")):
165-
href = urljoin(base_url, href)
166-
icons.append(href)
167-
168-
# Find Open Graph images
169-
og_image = soup.find("meta", property="og:image")
170-
if og_image and og_image.get("content"):
171-
icons.append(og_image.get("content"))
172-
173-
# Find Twitter images
174-
twitter_image = soup.find("meta", attrs={"name": "twitter:image"})
175-
if twitter_image and twitter_image.get("content"):
176-
icons.append(twitter_image.get("content"))
177-
178-
# Remove duplicates while maintaining order
179-
unique_icons = []
180-
for icon in icons:
181-
if icon not in unique_icons:
182-
unique_icons.append(icon)
183-
184-
return unique_icons
185-
186195
def _download_icon(self, icon_url, session):
187196
"""
188197
Download an icon to a temporary file and convert non-PNG/SVG to PNG

pkgbuild/PKGBUILD

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ license=('GPL')
88
url="https://github.com/biglinux/$pkgname"
99
source=("git+${url}.git")
1010
pkgdesc="Installs and removes BigLinux WebApps"
11-
depends=('python-bs4' 'python-requests' 'gettext' 'python-pillow' 'python-gobject')
11+
depends=('python-requests' 'gettext' 'python-pillow' 'python-gobject')
1212
md5sums=(SKIP)
1313
if [ -e "${pkgname}.install" ]; then
1414
install=${pkgname}.install

0 commit comments

Comments
 (0)