@@ -1234,53 +1234,181 @@ async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1, ma
12341234 viewport_height = viewport_size .get (
12351235 "height" , self .browser_config .viewport_height
12361236 )
1237+
1238+ # Snapshot visible elements before first scroll so we capture
1239+ # the initial viewport on virtual-scroll pages.
1240+ await page .evaluate ("""() => {
1241+ window.__c4ai_snapshot = new Map();
1242+ window.__c4ai_snapshot_fn = (function() {
1243+ const MAX_ITEMS = 10000;
1244+ function fingerprint(el) {
1245+ // Build the dedup key for el; the first rule that matches wins. Rule 1: a non-empty id attribute.
1246+ if (el.id) return 'id:' + el.id;
1247+ // Rule 2: the first non-empty data-* attribute whose name ends in
1248+ // '-id' or '-key' (data-id, data-item-id, data-user-id, etc.).
1249+ // data-testid lacks the '-id' suffix so it is ignored; NOTE(review): data-test-id WOULD match - confirm intended.
1250+ for (const attr of el.attributes) {
1251+ if (attr.name.startsWith('data-')
1252+ && (attr.name === 'data-id'
1253+ || attr.name.endsWith('-id')
1254+ || attr.name.endsWith('-key'))
1255+ && attr.value) {
1256+ return 'data:' + attr.name + '=' + attr.value;
1257+ }
1258+ }
1259+ // Fallback: tagName and href verbatim, plus two 32-bit hashes of the
1260+ // trimmed textContent (h1 = 31*h1 + c; h2 = shift/xor mix) joined into one string.
1261+ const tag = el.tagName;
1262+ const href = el.getAttribute('href') || '';
1263+ const text = (el.textContent || '').trim();
1264+ let h1 = 0, h2 = 0x9e3779b9;
1265+ for (let i = 0; i < text.length; i++) {
1266+ const c = text.charCodeAt(i);
1267+ h1 = ((h1 << 5) - h1 + c) | 0;
1268+ h2 = ((h2 << 7) ^ (h2 >>> 3) ^ c) | 0;
1269+ }
1270+ return 'hash:' + tag + ':' + href + ':' + h1 + ':' + h2;
1271+ }
1272+ // Return true when el has a non-empty data-* attribute whose name
1273+ // ends in '-id' or '-key' (data-id, data-item-id, data-user-id, etc.).
1274+ // el.id is deliberately NOT checked: structural containers commonly
1275+ // have ids (id="feed", id="main") without being content items.
1276+ function hasDataId(el) {
1277+ for (const attr of el.attributes) {
1278+ if (attr.name.startsWith('data-')
1279+ && (attr.name === 'data-id'
1280+ || attr.name.endsWith('-id')
1281+ || attr.name.endsWith('-key'))
1282+ && attr.value) {
1283+ return true;
1284+ }
1285+ }
1286+ return false;
1287+ }
1288+ // Detect whether an element is a repeating-item container
1289+ // (a feed/list parent) vs. a content item. A container has at
1290+ // least 3 element children, at least half of them (and >= 3) sharing
1291+ // one tagName, AND carries no data-* identifier (see hasDataId).
1292+ function isItemContainer(el) {
1293+ // Elements matched by hasDataId (data-*-id / data-*-key) are content items, not containers
1294+ if (hasDataId(el)) return false;
1295+ const kids = el.children;
1296+ if (kids.length < 3) return false;
1297+ const tagCounts = {};
1298+ for (const c of kids) {
1299+ tagCounts[c.tagName] = (tagCounts[c.tagName] || 0) + 1;
1300+ }
1301+ const maxSame = Math.max(...Object.values(tagCounts));
1302+ // The most common tag must cover at least half of the children (minimum 3) to count as a list/feed.
1303+ return maxSame >= kids.length * 0.5 && maxSame >= 3;
1304+ }
1305+ return function snapshot() {
1306+ const map = window.__c4ai_snapshot;
1307+ if (map.size >= MAX_ITEMS) return;
1308+ const walk = (parent, depth) => {
1309+ if (depth > 30) return;
1310+ for (const el of parent.children) {
1311+ if (map.size >= MAX_ITEMS) return;
1312+ const text = (el.textContent || '').trim();
1313+ if (text.length <= 5) continue;
1314+ // A list container is not stored itself: recurse so each item gets
1315+ // its own fingerprint. Any other element is stored whole (outerHTML), once per fingerprint.
1316+ if (isItemContainer(el)) {
1317+ walk(el, depth + 1);
1318+ continue;
1319+ }
1320+ const fp = fingerprint(el);
1321+ if (!map.has(fp)) {
1322+ map.set(fp, el.outerHTML);
1323+ }
1324+ }
1325+ };
1326+ walk(document.body, 0);
1327+ };
1328+ })();
1329+ window.__c4ai_snapshot_fn();
1330+ }""" )
1331+
12371332 current_position = viewport_height
12381333
1239- # await page.evaluate(f"window.scrollTo(0, {current_position})")
12401334 await self .safe_scroll (page , 0 , current_position , delay = scroll_delay )
1241- # await self.csp_scroll_to(page, 0, current_position)
1242- # await asyncio.sleep(scroll_delay)
12431335
1244- # total_height = await page.evaluate("document.documentElement.scrollHeight")
12451336 dimensions = await self .get_page_dimensions (page )
12461337 total_height = dimensions ["height" ]
12471338
12481339 scroll_step_count = 0
12491340 while current_position < total_height :
1250- ####
1251- # NEW FEATURE: Check if we've reached the maximum allowed scroll steps
1252- # This prevents infinite scrolling on very long pages or infinite scroll scenarios
1253- # If max_scroll_steps is None, this check is skipped (unlimited scrolling - original behavior)
1254- ####
12551341 if max_scroll_steps is not None and scroll_step_count >= max_scroll_steps :
12561342 break
1343+
12571344 current_position = min (current_position + viewport_height , total_height )
1345+
1346+ # Use window.scrollBy as fallback if scrollTo doesn't move
1347+ prev_scroll = await page .evaluate ("window.scrollY" )
12581348 await self .safe_scroll (page , 0 , current_position , delay = scroll_delay )
1349+ new_scroll = await page .evaluate ("window.scrollY" )
1350+ if new_scroll == prev_scroll and current_position > prev_scroll :
1351+ await page .evaluate (f"window.scrollBy(0, { viewport_height } )" )
1352+ await asyncio .sleep (scroll_delay )
1353+
1354+ # Snapshot after each scroll step
1355+ await page .evaluate ("window.__c4ai_snapshot_fn && window.__c4ai_snapshot_fn()" )
12591356
1260- # Increment the step counter for max_scroll_steps tracking
12611357 scroll_step_count += 1
1262-
1263- # await page.evaluate(f"window.scrollTo(0, {current_position})")
1264- # await asyncio.sleep(scroll_delay)
12651358
1266- # new_height = await page.evaluate("document.documentElement.scrollHeight")
12671359 dimensions = await self .get_page_dimensions (page )
12681360 new_height = dimensions ["height" ]
12691361
12701362 if new_height > total_height :
12711363 total_height = new_height
12721364
1273- # await page.evaluate("window.scrollTo(0, 0)")
1365+ # Inject accumulated snapshot content into a hidden div so that
1366+ # subsequent page.content() includes all scrolled-through items.
1367+ merge_result = await page .evaluate (r"""() => {
1368+ const map = window.__c4ai_snapshot;
1369+ delete window.__c4ai_snapshot;
1370+ delete window.__c4ai_snapshot_fn;
1371+ if (!map || map.size === 0) {
1372+ return { injected: false, count: 0 };
1373+ }
1374+
1375+ const parts = [];
1376+ for (const html of map.values()) {
1377+ parts.push(html);
1378+ }
1379+
1380+ const container = document.createElement('div');
1381+ container.id = '__c4ai_accumulated_content';
1382+ container.style.display = 'none';
1383+ container.innerHTML = parts.join('\n');
1384+ document.body.appendChild(container);
1385+ return { injected: true, count: map.size };
1386+ }""" )
1387+
1388+ if merge_result and merge_result .get ("injected" ):
1389+ self .logger .info (
1390+ message = "Virtual scroll detected: accumulated {count} unique elements" ,
1391+ tag = "PAGE_SCAN" ,
1392+ params = {"count" : merge_result .get ("count" , 0 )},
1393+ )
1394+
12741395 await self .safe_scroll (page , 0 , 0 )
12751396
12761397 except Exception as e :
1398+ # Clean up snapshot state on error
1399+ try :
1400+ await page .evaluate ("""() => {
1401+ delete window.__c4ai_snapshot;
1402+ delete window.__c4ai_snapshot_fn;
1403+ }""" )
1404+ except Exception :
1405+ pass
12771406 self .logger .warning (
12781407 message = "Failed to perform full page scan: {error}" ,
12791408 tag = "PAGE_SCAN" ,
12801409 params = {"error" : str (e )},
12811410 )
12821411 else :
1283- # await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
12841412 await self .safe_scroll (page , 0 , total_height )
12851413
12861414 async def _handle_virtual_scroll (self , page : Page , config : "VirtualScrollConfig" ):
@@ -1341,15 +1469,22 @@ async def _handle_virtual_scroll(self, page: Page, config: "VirtualScrollConfig"
13411469
13421470 // Perform scrolling
13431471 while (scrollCount < config.scroll_count) {
1344- // Scroll the container
1472+ // Scroll the container; fall back to window if container
1473+ // doesn't scroll (e.g. Twitter scrolls the window, not a
1474+ // container element).
1475+ const prevScrollTop = container.scrollTop;
13451476 container.scrollTop += scrollAmount;
1346-
1477+ const usedWindowScroll = (container.scrollTop === prevScrollTop);
1478+ if (usedWindowScroll) {
1479+ window.scrollBy(0, scrollAmount);
1480+ }
1481+
13471482 // Wait for content to potentially load
13481483 await new Promise(resolve => setTimeout(resolve, config.wait_after_scroll * 1000));
1349-
1484+
13501485 // Get current HTML
13511486 const currentHTML = container.innerHTML;
1352-
1487+
13531488 // Determine what changed
13541489 if (currentHTML === previousHTML) {
13551490 // Case 0: No change - continue scrolling
@@ -1362,13 +1497,15 @@ async def _handle_virtual_scroll(self, page: Page, config: "VirtualScrollConfig"
13621497 console.log(`Scroll ${scrollCount + 1}: Content replaced, capturing chunk`);
13631498 htmlChunks.push(previousHTML);
13641499 }
1365-
1500+
13661501 // Update previous HTML for next iteration
13671502 previousHTML = currentHTML;
13681503 scrollCount++;
1369-
1370- // Check if we've reached the end
1371- if (container.scrollTop + container.clientHeight >= container.scrollHeight - 10) {
1504+
1505+ // Check if we've reached the end of scrollable content
1506+ const atContainerEnd = container.scrollTop + container.clientHeight >= container.scrollHeight - 10;
1507+ const atWindowEnd = window.scrollY + window.innerHeight >= document.documentElement.scrollHeight - 10;
1508+ if (usedWindowScroll ? atWindowEnd : atContainerEnd) {
13721509 console.log(`Reached end of scrollable content at scroll ${scrollCount}`);
13731510 // Capture final chunk if content was replaced
13741511 if (htmlChunks.length > 0) {
0 commit comments