1010import os
1111import io
1212from urllib .parse import urlparse , urljoin
13- from bs4 import BeautifulSoup
13+ from html . parser import HTMLParser
1414from PIL import Image # Add Pillow import
1515
1616gi .require_version ("Gtk" , "4.0" )
1717from gi .repository import GLib
1818
1919
class WebsiteMetadataParser(HTMLParser):
    """Collect the title, icon links and social-card metadata of an HTML page.

    Feed markup through ``feed()`` and read the results with
    ``get_best_title()`` and ``get_all_icons()``.

    Attributes:
        title: Text of the first ``<title>`` element, or None if none was seen.
        icons: ``href`` values of ``<link>`` tags whose ``rel`` contains
            "icon", in document order.
        og_title: Content of the first ``<meta property="og:title">``.
        twitter_title: Content of the first ``<meta name="twitter:title">``.
        og_image: Content of the first ``<meta property="og:image">``.
        twitter_image: Content of the first ``<meta name="twitter:image">``.
    """

    def __init__(self):
        super().__init__()
        self.title = None
        self.icons = []
        self.og_title = None
        self.twitter_title = None
        self.og_image = None
        self.twitter_image = None
        self._in_title = False
        # Set once the first <title> element has been closed.
        self._title_done = False

    def handle_starttag(self, tag, attrs):
        """Dispatch ``<title>``, ``<meta>`` and ``<link>`` start tags."""
        attrs_dict = dict(attrs)

        if tag == "title":
            # Only the first <title> is the document title; later ones
            # (for example inside an inline <svg>) must not be appended.
            self._in_title = not self._title_done
        elif tag == "meta":
            self._handle_meta(attrs_dict)
        elif tag == "link":
            self._handle_link(attrs_dict)

    def _handle_meta(self, attrs_dict):
        """Record Open Graph / Twitter card values from a ``<meta>`` tag."""
        content = attrs_dict.get("content")
        if not content:
            return

        # Valueless attributes are reported as None, hence the ``or ""``.
        property_attr = attrs_dict.get("property") or ""
        name_attr = attrs_dict.get("name") or ""

        if property_attr == "og:title":
            slot = "og_title"
        elif name_attr == "twitter:title":
            slot = "twitter_title"
        elif property_attr == "og:image":
            slot = "og_image"
        elif name_attr == "twitter:image":
            slot = "twitter_image"
        else:
            return

        # When a tag is repeated, the first occurrence wins.
        if getattr(self, slot) is None:
            setattr(self, slot, content)

    def _handle_link(self, attrs_dict):
        """Record the ``href`` of an icon-type ``<link>`` tag."""
        href = attrs_dict.get("href")
        # ``<link rel href=...>`` yields rel=None; treat it as empty rather
        # than crashing on ``None.lower()``.
        rel = (attrs_dict.get("rel") or "").lower()

        # "icon", "shortcut icon", "apple-touch-icon" and "mask-icon" all
        # contain the substring "icon", so a single test covers them.
        if href and "icon" in rel:
            self.icons.append(href)

    def handle_endtag(self, tag):
        """Stop collecting title text when the first ``<title>`` closes."""
        if tag == "title" and self._in_title:
            self._in_title = False
            self._title_done = True

    def handle_data(self, data):
        """Accumulate text found inside the first ``<title>`` element."""
        if self._in_title:
            if self.title is None:
                self.title = data
            else:
                self.title += data

    def get_best_title(self):
        """Return the best available title, stripped, or None.

        Preference order is ``<title>``, then ``og:title``, then
        ``twitter:title``. Candidates that are empty or whitespace-only are
        skipped so that a blank ``<title>`` does not hide a usable fallback.
        """
        for candidate in (self.title, self.og_title, self.twitter_title):
            if candidate:
                stripped = candidate.strip()
                if stripped:
                    return stripped
        return None

    def get_all_icons(self):
        """Return icon hrefs followed by the og and twitter images, if any.

        The values are returned exactly as written in the markup (they may
        be relative URLs); a new list is returned on every call.
        """
        all_icons = self.icons.copy()
        if self.og_image:
            all_icons.append(self.og_image)
        if self.twitter_image:
            all_icons.append(self.twitter_image)
        return all_icons
95+
2096class WebsiteInfoFetcher :
2197 """Class for fetching website information like title and favicons"""
2298
@@ -56,14 +132,48 @@ def _fetch_info_thread(self, url, callback):
56132 response = session .get (url , timeout = 10 )
57133 response .raise_for_status ()
58134
59- # Parse the HTML
60- soup = BeautifulSoup (response .text , "html.parser" )
135+ # Parse the HTML with our custom parser
136+ parser = WebsiteMetadataParser ()
137+ parser .feed (response .text )
61138
62139 # Get the title
63- title = self ._extract_title (soup , url )
140+ title = parser .get_best_title ()
141+ if title :
142+ # Clean up the title
143+ title = re .sub (r"\s+" , " " , title )
144+ else :
145+ # Fallback: Use domain name
146+ domain = urlparse (url ).netloc .replace ("www." , "" )
147+ title = domain .capitalize ()
64148
65149 # Get favicons
66- icons = self ._extract_favicons (soup , url , session )
150+ raw_icons = parser .get_all_icons ()
151+ icons = []
152+
153+ # Normalize icon URLs
154+ base_url = url
155+ for icon_href in raw_icons :
156+ if icon_href :
157+ if not icon_href .startswith (("http://" , "https://" )):
158+ icon_href = urljoin (base_url , icon_href )
159+ if icon_href not in icons :
160+ icons .append (icon_href )
161+
162+ # Look for favicon in common locations (root)
163+ parsed_url = urlparse (url )
164+ domain_url = f"{ parsed_url .scheme } ://{ parsed_url .netloc } "
165+ favicon_url = urljoin (domain_url , "/favicon.ico" )
166+
167+ # Check if we already have this specific favicon
168+ if favicon_url not in icons :
169+ try :
170+ head_response = session .head (favicon_url , timeout = 5 )
171+ if head_response .status_code == 200 :
172+ icons .insert (
173+ 0 , favicon_url
174+ ) # Prioritize default favicon if found
175+ except Exception :
176+ pass
67177
68178 # Save icons to temporary files
69179 icon_paths = []
@@ -82,107 +192,6 @@ def _fetch_info_thread(self, url, callback):
82192 print (f"Error fetching website info: { e } " )
83193 GLib .idle_add (callback , "" , [])
84194
def _extract_title(self, soup, url):
    """
    Work out a display title for a page

    The <title> tag is preferred, then the Open Graph title, then the
    Twitter title; if none is usable the capitalized domain name is used.

    Parameters:
        soup (BeautifulSoup): Parsed HTML
        url (str): Website URL

    Returns:
        str: Website title
    """
    title_tag = soup.find("title")
    if title_tag and title_tag.text:
        # Collapse internal runs of whitespace after trimming the ends
        return re.sub(r"\s+", " ", title_tag.text.strip())

    # Social-card fallbacks, tried lazily in order of preference
    meta_queries = (
        {"property": "og:title"},
        {"attrs": {"name": "twitter:title"}},
    )
    for query in meta_queries:
        meta_tag = soup.find("meta", **query)
        if meta_tag and meta_tag.get("content"):
            return meta_tag.get("content").strip()

    # Last resort: the host name without a "www." prefix
    host = urlparse(url).netloc
    return host.replace("www.", "").capitalize()
117-
def _extract_favicons(self, soup, url, session):
    """
    Extract favicon URLs from HTML

    Candidates are collected in this order: the site's /favicon.ico (only
    if a HEAD request returns 200), <link> tags whose rel matches an icon
    type, the Open Graph image, and the Twitter image. Relative URLs are
    resolved against the page URL and duplicates are dropped, keeping the
    first occurrence.

    Parameters:
        soup (BeautifulSoup): Parsed HTML
        url (str): Website URL
        session (requests.Session): Requests session

    Returns:
        list: List of absolute favicon URLs, without duplicates
    """
    parsed_url = urlparse(url)
    domain = f"{parsed_url.scheme}://{parsed_url.netloc}"

    def make_absolute(href):
        # Leave http(s) URLs untouched; resolve everything else
        # (relative paths, protocol-relative URLs) against the page URL.
        if href.startswith(("http://", "https://")):
            return href
        return urljoin(url, href)

    icons = []

    # Look for favicon in the conventional root location first
    favicon_url = urljoin(domain, "/favicon.ico")
    try:
        response = session.head(favicon_url, timeout=5)
    except Exception:
        # Deliberately best-effort: a failed probe only means there is no
        # default favicon to offer, it must not abort the extraction.
        pass
    else:
        if response.status_code == 200:
            icons.append(favicon_url)

    # One scan covers every icon rel type: "apple-touch-icon" and
    # "mask-icon" already match here, so no separate pass is needed.
    icon_links = soup.find_all(
        "link",
        rel=re.compile(r"(shortcut icon|icon|apple-touch-icon|mask-icon)", re.I),
    )
    for link in icon_links:
        href = link.get("href")
        if href:
            icons.append(make_absolute(href))

    # Social-card images; these may be relative too, so resolve them the
    # same way as the <link> hrefs.
    og_image = soup.find("meta", property="og:image")
    if og_image and og_image.get("content"):
        icons.append(make_absolute(og_image.get("content")))

    twitter_image = soup.find("meta", attrs={"name": "twitter:image"})
    if twitter_image and twitter_image.get("content"):
        icons.append(make_absolute(twitter_image.get("content")))

    # Remove duplicates while maintaining order
    return list(dict.fromkeys(icons))
185-
186195 def _download_icon (self , icon_url , session ):
187196 """
188197 Download an icon to a temporary file and convert non-PNG/SVG to PNG
0 commit comments