Skip to content

Commit 2535705

Browse files
hafezparast and claude committed
fix: scan_full_page captures all content on virtual-scroll pages (#731)
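Replace the MutationObserver with a scroll-and-snapshot approach that accumulates elements in a Map. Fingerprint each element by its id attribute first, then by data-* ID attributes (data-id, or names ending in -id or -key), with a dual-hash fallback. Cap the accumulator at 10K elements. Add a window.scrollBy fallback for pages where the regular scroll does not move.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>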
1 parent af648e1 commit 2535705

2 files changed

Lines changed: 615 additions & 24 deletions

File tree

crawl4ai/async_crawler_strategy.py

Lines changed: 161 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -1234,53 +1234,181 @@ async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1, ma
12341234
viewport_height = viewport_size.get(
12351235
"height", self.browser_config.viewport_height
12361236
)
1237+
1238+
# Snapshot visible elements before first scroll so we capture
1239+
# the initial viewport on virtual-scroll pages.
1240+
await page.evaluate("""() => {
1241+
window.__c4ai_snapshot = new Map();
1242+
window.__c4ai_snapshot_fn = (function() {
1243+
const MAX_ITEMS = 10000;
1244+
function fingerprint(el) {
1245+
// Prefer id attribute for uniqueness
1246+
if (el.id) return 'id:' + el.id;
1247+
// Look for data-* ID attributes (data-id, data-item-id,
1248+
// data-user-id, etc.) but skip type/role markers like
1249+
// data-testid which are shared across items.
1250+
for (const attr of el.attributes) {
1251+
if (attr.name.startsWith('data-')
1252+
&& (attr.name === 'data-id'
1253+
|| attr.name.endsWith('-id')
1254+
|| attr.name.endsWith('-key'))
1255+
&& attr.value) {
1256+
return 'data:' + attr.name + '=' + attr.value;
1257+
}
1258+
}
1259+
// Fallback: two independent 32-bit hashes combined into
1260+
// a 64-bit-equivalent string to avoid collisions at scale.
1261+
const tag = el.tagName;
1262+
const href = el.getAttribute('href') || '';
1263+
const text = (el.textContent || '').trim();
1264+
let h1 = 0, h2 = 0x9e3779b9;
1265+
for (let i = 0; i < text.length; i++) {
1266+
const c = text.charCodeAt(i);
1267+
h1 = ((h1 << 5) - h1 + c) | 0;
1268+
h2 = ((h2 << 7) ^ (h2 >>> 3) ^ c) | 0;
1269+
}
1270+
return 'hash:' + tag + ':' + href + ':' + h1 + ':' + h2;
1271+
}
1272+
// Check if element carries a data-* unique identifier
1273+
// (data-item-id, data-user-id, etc.). We do NOT check
1274+
// el.id here because structural containers commonly have
1275+
// ids (id="feed", id="main") without being content items.
1276+
function hasDataId(el) {
1277+
for (const attr of el.attributes) {
1278+
if (attr.name.startsWith('data-')
1279+
&& (attr.name === 'data-id'
1280+
|| attr.name.endsWith('-id')
1281+
|| attr.name.endsWith('-key'))
1282+
&& attr.value) {
1283+
return true;
1284+
}
1285+
}
1286+
return false;
1287+
}
1288+
// Detect whether an element is a repeating-item container
1289+
// (a feed/list parent) vs. a content item. A container has
1290+
// many children that share the same tagName AND does not
1291+
// itself carry a unique identifier.
1292+
function isItemContainer(el) {
1293+
// Elements with data-*-id are content items, not containers
1294+
if (hasDataId(el)) return false;
1295+
const kids = el.children;
1296+
if (kids.length < 3) return false;
1297+
const tagCounts = {};
1298+
for (const c of kids) {
1299+
tagCounts[c.tagName] = (tagCounts[c.tagName] || 0) + 1;
1300+
}
1301+
const maxSame = Math.max(...Object.values(tagCounts));
1302+
// If most children share a tag, this is a list/feed.
1303+
return maxSame >= kids.length * 0.5 && maxSame >= 3;
1304+
}
1305+
return function snapshot() {
1306+
const map = window.__c4ai_snapshot;
1307+
if (map.size >= MAX_ITEMS) return;
1308+
const walk = (parent, depth) => {
1309+
if (depth > 30) return;
1310+
for (const el of parent.children) {
1311+
if (map.size >= MAX_ITEMS) return;
1312+
const text = (el.textContent || '').trim();
1313+
if (text.length <= 5) continue;
1314+
// If this looks like a list container, walk
1315+
// into it to capture individual items.
1316+
if (isItemContainer(el)) {
1317+
walk(el, depth + 1);
1318+
continue;
1319+
}
1320+
const fp = fingerprint(el);
1321+
if (!map.has(fp)) {
1322+
map.set(fp, el.outerHTML);
1323+
}
1324+
}
1325+
};
1326+
walk(document.body, 0);
1327+
};
1328+
})();
1329+
window.__c4ai_snapshot_fn();
1330+
}""")
1331+
12371332
current_position = viewport_height
12381333

1239-
# await page.evaluate(f"window.scrollTo(0, {current_position})")
12401334
await self.safe_scroll(page, 0, current_position, delay=scroll_delay)
1241-
# await self.csp_scroll_to(page, 0, current_position)
1242-
# await asyncio.sleep(scroll_delay)
12431335

1244-
# total_height = await page.evaluate("document.documentElement.scrollHeight")
12451336
dimensions = await self.get_page_dimensions(page)
12461337
total_height = dimensions["height"]
12471338

12481339
scroll_step_count = 0
12491340
while current_position < total_height:
1250-
####
1251-
# NEW FEATURE: Check if we've reached the maximum allowed scroll steps
1252-
# This prevents infinite scrolling on very long pages or infinite scroll scenarios
1253-
# If max_scroll_steps is None, this check is skipped (unlimited scrolling - original behavior)
1254-
####
12551341
if max_scroll_steps is not None and scroll_step_count >= max_scroll_steps:
12561342
break
1343+
12571344
current_position = min(current_position + viewport_height, total_height)
1345+
1346+
# Use window.scrollBy as fallback if scrollTo doesn't move
1347+
prev_scroll = await page.evaluate("window.scrollY")
12581348
await self.safe_scroll(page, 0, current_position, delay=scroll_delay)
1349+
new_scroll = await page.evaluate("window.scrollY")
1350+
if new_scroll == prev_scroll and current_position > prev_scroll:
1351+
await page.evaluate(f"window.scrollBy(0, {viewport_height})")
1352+
await asyncio.sleep(scroll_delay)
1353+
1354+
# Snapshot after each scroll step
1355+
await page.evaluate("window.__c4ai_snapshot_fn && window.__c4ai_snapshot_fn()")
12591356

1260-
# Increment the step counter for max_scroll_steps tracking
12611357
scroll_step_count += 1
1262-
1263-
# await page.evaluate(f"window.scrollTo(0, {current_position})")
1264-
# await asyncio.sleep(scroll_delay)
12651358

1266-
# new_height = await page.evaluate("document.documentElement.scrollHeight")
12671359
dimensions = await self.get_page_dimensions(page)
12681360
new_height = dimensions["height"]
12691361

12701362
if new_height > total_height:
12711363
total_height = new_height
12721364

1273-
# await page.evaluate("window.scrollTo(0, 0)")
1365+
# Inject accumulated snapshot content into a hidden div so that
1366+
# subsequent page.content() includes all scrolled-through items.
1367+
merge_result = await page.evaluate(r"""() => {
1368+
const map = window.__c4ai_snapshot;
1369+
delete window.__c4ai_snapshot;
1370+
delete window.__c4ai_snapshot_fn;
1371+
if (!map || map.size === 0) {
1372+
return { injected: false, count: 0 };
1373+
}
1374+
1375+
const parts = [];
1376+
for (const html of map.values()) {
1377+
parts.push(html);
1378+
}
1379+
1380+
const container = document.createElement('div');
1381+
container.id = '__c4ai_accumulated_content';
1382+
container.style.display = 'none';
1383+
container.innerHTML = parts.join('\n');
1384+
document.body.appendChild(container);
1385+
return { injected: true, count: map.size };
1386+
}""")
1387+
1388+
if merge_result and merge_result.get("injected"):
1389+
self.logger.info(
1390+
message="Virtual scroll detected: accumulated {count} unique elements",
1391+
tag="PAGE_SCAN",
1392+
params={"count": merge_result.get("count", 0)},
1393+
)
1394+
12741395
await self.safe_scroll(page, 0, 0)
12751396

12761397
except Exception as e:
1398+
# Clean up snapshot state on error
1399+
try:
1400+
await page.evaluate("""() => {
1401+
delete window.__c4ai_snapshot;
1402+
delete window.__c4ai_snapshot_fn;
1403+
}""")
1404+
except Exception:
1405+
pass
12771406
self.logger.warning(
12781407
message="Failed to perform full page scan: {error}",
12791408
tag="PAGE_SCAN",
12801409
params={"error": str(e)},
12811410
)
12821411
else:
1283-
# await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
12841412
await self.safe_scroll(page, 0, total_height)
12851413

12861414
async def _handle_virtual_scroll(self, page: Page, config: "VirtualScrollConfig"):
@@ -1341,15 +1469,22 @@ async def _handle_virtual_scroll(self, page: Page, config: "VirtualScrollConfig"
13411469
13421470
// Perform scrolling
13431471
while (scrollCount < config.scroll_count) {
1344-
// Scroll the container
1472+
// Scroll the container; fall back to window if container
1473+
// doesn't scroll (e.g. Twitter scrolls the window, not a
1474+
// container element).
1475+
const prevScrollTop = container.scrollTop;
13451476
container.scrollTop += scrollAmount;
1346-
1477+
const usedWindowScroll = (container.scrollTop === prevScrollTop);
1478+
if (usedWindowScroll) {
1479+
window.scrollBy(0, scrollAmount);
1480+
}
1481+
13471482
// Wait for content to potentially load
13481483
await new Promise(resolve => setTimeout(resolve, config.wait_after_scroll * 1000));
1349-
1484+
13501485
// Get current HTML
13511486
const currentHTML = container.innerHTML;
1352-
1487+
13531488
// Determine what changed
13541489
if (currentHTML === previousHTML) {
13551490
// Case 0: No change - continue scrolling
@@ -1362,13 +1497,15 @@ async def _handle_virtual_scroll(self, page: Page, config: "VirtualScrollConfig"
13621497
console.log(`Scroll ${scrollCount + 1}: Content replaced, capturing chunk`);
13631498
htmlChunks.push(previousHTML);
13641499
}
1365-
1500+
13661501
// Update previous HTML for next iteration
13671502
previousHTML = currentHTML;
13681503
scrollCount++;
1369-
1370-
// Check if we've reached the end
1371-
if (container.scrollTop + container.clientHeight >= container.scrollHeight - 10) {
1504+
1505+
// Check if we've reached the end of scrollable content
1506+
const atContainerEnd = container.scrollTop + container.clientHeight >= container.scrollHeight - 10;
1507+
const atWindowEnd = window.scrollY + window.innerHeight >= document.documentElement.scrollHeight - 10;
1508+
if (usedWindowScroll ? atWindowEnd : atContainerEnd) {
13721509
console.log(`Reached end of scrollable content at scroll ${scrollCount}`);
13731510
// Capture final chunk if content was replaced
13741511
if (htmlChunks.length > 0) {

0 commit comments

Comments
 (0)