|
1 | 1 | from __future__ import annotations |
2 | 2 |
|
3 | 3 | import asyncio |
| 4 | +import contextlib |
4 | 5 | import logging |
5 | 6 | import warnings |
6 | 7 | from datetime import timedelta |
@@ -236,6 +237,7 @@ async def _open_page( |
236 | 237 | proxy_info=context.proxy_info, |
237 | 238 | get_key_value_store=context.get_key_value_store, |
238 | 239 | log=context.log, |
| 240 | + register_deferred_cleanup=context.register_deferred_cleanup, |
239 | 241 | page=crawlee_page.page, |
240 | 242 | block_requests=partial(block_requests, page=crawlee_page.page), |
241 | 243 | goto_options=GotoOptions(**self._goto_options), |
@@ -296,63 +298,73 @@ async def _navigate( |
296 | 298 | The enhanced crawling context with the Playwright-specific features (page, response, enqueue_links, |
297 | 299 | infinite_scroll and block_requests). |
298 | 300 | """ |
299 | | - async with context.page: |
300 | | - if context.session: |
301 | | - session_cookies = context.session.cookies.get_cookies_as_playwright_format() |
302 | | - await self._update_cookies(context.page, session_cookies) |
303 | | - |
304 | | - if context.request.headers: |
305 | | - await context.page.set_extra_http_headers(context.request.headers.model_dump()) |
306 | | - # Navigate to the URL and get response. |
307 | | - if context.request.method != 'GET': |
308 | | - # Call the notification only once |
309 | | - warnings.warn( |
310 | | - 'Using other request methods than GET or adding payloads has a high impact on performance' |
311 | | - ' in recent versions of Playwright. Use only when necessary.', |
312 | | - category=UserWarning, |
313 | | - stacklevel=2, |
314 | | - ) |
| 301 | + # Enter the page context manager, but defer its cleanup (page.close()) so the page stays open |
| 302 | + # during error handler execution. |
| 303 | + await context.page.__aenter__() |
315 | 304 |
|
316 | | - route_handler = self._prepare_request_interceptor( |
317 | | - method=context.request.method, |
318 | | - headers=context.request.headers, |
319 | | - payload=context.request.payload, |
320 | | - ) |
| 305 | + async def _close_page() -> None: |
| 306 | + with contextlib.suppress(Exception): |
| 307 | + await context.page.__aexit__(None, None, None) |
321 | 308 |
|
322 | | - # Set route_handler only for current request |
323 | | - await context.page.route(context.request.url, route_handler) |
| 309 | + context.register_deferred_cleanup(_close_page) |
324 | 310 |
|
325 | | - try: |
326 | | - async with self._shared_navigation_timeouts[id(context)] as remaining_timeout: |
327 | | - response = await context.page.goto( |
328 | | - context.request.url, timeout=remaining_timeout.total_seconds() * 1000, **context.goto_options |
329 | | - ) |
330 | | - context.request.state = RequestState.AFTER_NAV |
331 | | - except playwright.async_api.TimeoutError as exc: |
332 | | - raise asyncio.TimeoutError from exc |
333 | | - |
334 | | - if response is None: |
335 | | - raise SessionError(f'Failed to load the URL: {context.request.url}') |
336 | | - |
337 | | - # Set the loaded URL to the actual URL after redirection. |
338 | | - context.request.loaded_url = context.page.url |
339 | | - |
340 | | - yield PlaywrightPostNavCrawlingContext( |
341 | | - request=context.request, |
342 | | - session=context.session, |
343 | | - add_requests=context.add_requests, |
344 | | - send_request=context.send_request, |
345 | | - push_data=context.push_data, |
346 | | - use_state=context.use_state, |
347 | | - proxy_info=context.proxy_info, |
348 | | - get_key_value_store=context.get_key_value_store, |
349 | | - log=context.log, |
350 | | - page=context.page, |
351 | | - block_requests=context.block_requests, |
352 | | - goto_options=context.goto_options, |
353 | | - response=response, |
| 311 | + if context.session: |
| 312 | + session_cookies = context.session.cookies.get_cookies_as_playwright_format() |
| 313 | + await self._update_cookies(context.page, session_cookies) |
| 314 | + |
| 315 | + if context.request.headers: |
| 316 | + await context.page.set_extra_http_headers(context.request.headers.model_dump()) |
| 317 | + # Navigate to the URL and get response. |
| 318 | + if context.request.method != 'GET': |
| 319 | + # Call the notification only once |
| 320 | + warnings.warn( |
| 321 | + 'Using other request methods than GET or adding payloads has a high impact on performance' |
| 322 | + ' in recent versions of Playwright. Use only when necessary.', |
| 323 | + category=UserWarning, |
| 324 | + stacklevel=2, |
354 | 325 | ) |
355 | 326 |
|
| 327 | + route_handler = self._prepare_request_interceptor( |
| 328 | + method=context.request.method, |
| 329 | + headers=context.request.headers, |
| 330 | + payload=context.request.payload, |
| 331 | + ) |
| 332 | + |
| 333 | + # Set route_handler only for current request |
| 334 | + await context.page.route(context.request.url, route_handler) |
| 335 | + |
| 336 | + try: |
| 337 | + async with self._shared_navigation_timeouts[id(context)] as remaining_timeout: |
| 338 | + response = await context.page.goto( |
| 339 | + context.request.url, timeout=remaining_timeout.total_seconds() * 1000, **context.goto_options |
| 340 | + ) |
| 341 | + context.request.state = RequestState.AFTER_NAV |
| 342 | + except playwright.async_api.TimeoutError as exc: |
| 343 | + raise asyncio.TimeoutError from exc |
| 344 | + |
| 345 | + if response is None: |
| 346 | + raise SessionError(f'Failed to load the URL: {context.request.url}') |
| 347 | + |
| 348 | + # Set the loaded URL to the actual URL after redirection. |
| 349 | + context.request.loaded_url = context.page.url |
| 350 | + |
| 351 | + yield PlaywrightPostNavCrawlingContext( |
| 352 | + request=context.request, |
| 353 | + session=context.session, |
| 354 | + add_requests=context.add_requests, |
| 355 | + send_request=context.send_request, |
| 356 | + push_data=context.push_data, |
| 357 | + use_state=context.use_state, |
| 358 | + proxy_info=context.proxy_info, |
| 359 | + get_key_value_store=context.get_key_value_store, |
| 360 | + log=context.log, |
| 361 | + register_deferred_cleanup=context.register_deferred_cleanup, |
| 362 | + page=context.page, |
| 363 | + block_requests=context.block_requests, |
| 364 | + goto_options=context.goto_options, |
| 365 | + response=response, |
| 366 | + ) |
| 367 | + |
356 | 368 | def _create_extract_links_function(self, context: PlaywrightPreNavCrawlingContext) -> ExtractLinksFunction: |
357 | 369 | """Create a callback function for extracting links from context. |
358 | 370 |
|
@@ -508,6 +520,7 @@ async def _create_crawling_context( |
508 | 520 | proxy_info=context.proxy_info, |
509 | 521 | get_key_value_store=context.get_key_value_store, |
510 | 522 | log=context.log, |
| 523 | + register_deferred_cleanup=context.register_deferred_cleanup, |
511 | 524 | page=context.page, |
512 | 525 | goto_options=context.goto_options, |
513 | 526 | response=context.response, |
|
0 commit comments