Commit 4750810

Enhance AsyncWebCrawler with smart waiting and screenshot capabilities

- Implement smart_wait function in AsyncPlaywrightCrawlerStrategy
- Add screenshot support to AsyncCrawlResponse and AsyncWebCrawler
- Improve error handling and timeout management in crawling process
- Fix typo in CrawlResult model (responser_headers -> response_headers)
- Update .gitignore to exclude additional files
- Adjust import path in test_basic_crawling.py
1 parent e0e0db4 commit 4750810

10 files changed, +281 -21 lines
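
From the caller's side, the two headline changes surface on AsyncWebCrawler.arun as the screenshot flag and the wait_for parameter. A minimal usage sketch, based on the tests and examples added in this commit (the URL, selector, and output filename are illustrative):

import asyncio
import base64
import io

from PIL import Image
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        # wait_for accepts "css:<selector>", "js:<function>", or a bare value
        # that smart_wait auto-detects; screenshot=True returns a base64 PNG.
        result = await crawler.arun(
            url="https://example.com",   # illustrative URL
            wait_for="css:body",         # illustrative selector
            screenshot=True,
            bypass_cache=True,
        )
        if result.success and result.screenshot:
            image = Image.open(io.BytesIO(base64.b64decode(result.screenshot)))
            image.save("example_screenshot.png")  # illustrative output path

if __name__ == "__main__":
    asyncio.run(main())

Decoding the screenshot string back into a PNG is exactly what the new tests/async/test_screenshot.py asserts.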

.gitignore (+5 -1)

@@ -196,4 +196,8 @@ docs/.DS_Store
 tmp/
 test_env/
 **/.DS_Store
-**/.DS_Store
+**/.DS_Store
+
+todo.md
+git_changes.py
+git_changes.md

CHANGELOG.md (+9)

@@ -1,5 +1,14 @@
 # Changelog
 
+## [v0.3.5] - 2024-09-02
+
+Enhance AsyncWebCrawler with smart waiting and screenshot capabilities
+
+- Implement smart_wait function in AsyncPlaywrightCrawlerStrategy
+- Add screenshot support to AsyncCrawlResponse and AsyncWebCrawler
+- Improve error handling and timeout management in crawling process
+- Fix typo in CrawlResult model (responser_headers -> response_headers)
+
 ## [v0.2.77] - 2024-08-04
 
 Significant improvements in text processing and performance:

crawl4ai/__init__.py (+1 -1)

@@ -3,7 +3,7 @@
 from .async_webcrawler import AsyncWebCrawler
 from .models import CrawlResult
 
-__version__ = "0.3.4"
+__version__ = "0.3.5"
 
 __all__ = [
     "AsyncWebCrawler",

crawl4ai/async_crawler_strategy.py (+43 -12)

@@ -12,10 +12,12 @@
 from pathlib import Path
 from playwright.async_api import ProxySettings
 from pydantic import BaseModel
+
 class AsyncCrawlResponse(BaseModel):
     html: str
     response_headers: Dict[str, str]
     status_code: int
+    screenshot: Optional[str] = None
 
 class AsyncCrawlerStrategy(ABC):
     @abstractmethod
@@ -139,6 +141,45 @@ def _cleanup_expired_sessions(self):
             asyncio.create_task(self.kill_session(sid))
 
 
+    async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000):
+        wait_for = wait_for.strip()
+
+        if wait_for.startswith('js:'):
+            # Explicitly specified JavaScript
+            js_code = wait_for[3:].strip()
+            return await self.csp_compliant_wait(page, js_code, timeout)
+        elif wait_for.startswith('css:'):
+            # Explicitly specified CSS selector
+            css_selector = wait_for[4:].strip()
+            try:
+                await page.wait_for_selector(css_selector, timeout=timeout)
+            except Error as e:
+                if 'Timeout' in str(e):
+                    raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{css_selector}'")
+                else:
+                    raise ValueError(f"Invalid CSS selector: '{css_selector}'")
+        else:
+            # Auto-detect based on content
+            if wait_for.startswith('()') or wait_for.startswith('function'):
+                # It's likely a JavaScript function
+                return await self.csp_compliant_wait(page, wait_for, timeout)
+            else:
+                # Assume it's a CSS selector first
+                try:
+                    await page.wait_for_selector(wait_for, timeout=timeout)
+                except Error as e:
+                    if 'Timeout' in str(e):
+                        raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{wait_for}'")
+                    else:
+                        # If it's not a timeout error, it might be an invalid selector
+                        # Let's try to evaluate it as a JavaScript function as a fallback
+                        try:
+                            return await self.csp_compliant_wait(page, f"() => {{{wait_for}}}", timeout)
+                        except Error:
+                            raise ValueError(f"Invalid wait_for parameter: '{wait_for}'. "
+                                             "It should be either a valid CSS selector, a JavaScript function, "
+                                             "or explicitly prefixed with 'js:' or 'css:'.")
+
     async def csp_compliant_wait(self, page: Page, user_wait_function: str, timeout: float = 30000):
         wrapper_js = f"""
         async () => {{
@@ -250,19 +291,9 @@ async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
            wait_for = kwargs.get("wait_for")
            if wait_for:
                try:
-                   await self.csp_compliant_wait(page, wait_for, timeout=kwargs.get("timeout", 30000))
+                   await self.smart_wait(page, wait_for, timeout=kwargs.get("timeout", 30000))
                except Exception as e:
-                   raise RuntimeError(f"Custom wait condition failed: {str(e)}")
-               # try:
-               #     await page.wait_for_function(wait_for)
-               #     # if callable(wait_for):
-               #     #     await page.wait_for_function(wait_for)
-               #     # elif isinstance(wait_for, str):
-               #     #     await page.wait_for_selector(wait_for)
-               #     # else:
-               #     #     raise ValueError("wait_for must be either a callable or a CSS selector string")
-               # except Error as e:
-               #     raise Error(f"Custom wait condition failed: {str(e)}")
+                   raise RuntimeError(f"Wait condition failed: {str(e)}")
 
            html = await page.content()
            page = await self.execute_hook('before_return_html', page, html)
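
For reference, the smart_wait added above dispatches on the wait_for string: a 'js:' prefix routes the snippet to csp_compliant_wait, a 'css:' prefix waits on the selector via page.wait_for_selector, and an unprefixed value is tried as a CSS selector first and then retried as a JavaScript function. A short sketch of the three call styles (the URL and selectors are illustrative):

import asyncio
from crawl4ai import AsyncWebCrawler

async def demo_wait_styles():
    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://example.com"  # illustrative target

        # Explicit CSS selector
        await crawler.arun(url, wait_for="css:h1", bypass_cache=True)

        # Explicit JavaScript function, evaluated through csp_compliant_wait
        await crawler.arun(
            url,
            wait_for="js:() => document.querySelectorAll('p').length > 0",
            bypass_cache=True,
        )

        # Unprefixed: treated as a CSS selector, with a JavaScript fallback
        await crawler.arun(url, wait_for="h1", bypass_cache=True)

if __name__ == "__main__":
    asyncio.run(demo_wait_styles())

Inside the strategy, a timeout raises TimeoutError after the timeout kwarg (default 30000 ms) and an argument that is neither a valid selector nor valid JavaScript raises ValueError; crawl() wraps either one as RuntimeError("Wait condition failed: ...").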

crawl4ai/async_webcrawler.py (+4 -5)

@@ -80,7 +80,7 @@ async def arun(
 
        word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD)
 
-       async_response : AsyncCrawlResponse = None
+       async_response: AsyncCrawlResponse = None
        cached = None
        screenshot_data = None
        extracted_content = None
@@ -102,15 +102,14 @@ async def arun(
                t1 = time.time()
                if user_agent:
                    self.crawler_strategy.update_user_agent(user_agent)
-               async_response : AsyncCrawlResponse = await self.crawler_strategy.crawl(url, **kwargs)
+               async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(url, screenshot=screenshot, **kwargs)
                html = sanitize_input_encode(async_response.html)
+               screenshot_data = async_response.screenshot
                t2 = time.time()
                if verbose:
                    print(
                        f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds"
                    )
-               if screenshot:
-                   screenshot_data = await self.crawler_strategy.take_screenshot(url)
 
            crawl_result = await self.aprocess_html(
                url,
@@ -127,7 +126,7 @@ async def arun(
                **kwargs,
            )
            crawl_result.status_code = async_response.status_code if async_response else 200
-           crawl_result.responser_headers = async_response.response_headers if async_response else {}
+           crawl_result.response_headers = async_response.response_headers if async_response else {}
            crawl_result.success = bool(html)
            crawl_result.session_id = kwargs.get("session_id", None)
            return crawl_result
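
With this wiring, the screenshot is captured inside crawler_strategy.crawl() (screenshot=screenshot is forwarded) rather than through a separate take_screenshot() call, and the result now exposes the corrected response_headers field. A small sketch of reading these fields off a result; the URL and header name are illustrative:

import asyncio
from crawl4ai import AsyncWebCrawler

async def report(url: str):
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url=url, screenshot=True, bypass_cache=True)
        print(result.status_code)                                   # copied from AsyncCrawlResponse
        print((result.response_headers or {}).get("content-type"))  # renamed from responser_headers
        print(result.screenshot is not None)                        # base64 PNG when screenshot=True

if __name__ == "__main__":
    asyncio.run(report("https://example.com"))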

crawl4ai/models.py (+1 -1)

@@ -18,5 +18,5 @@ class CrawlResult(BaseModel):
     metadata: Optional[dict] = None
     error_message: Optional[str] = None
     session_id: Optional[str] = None
-    responser_headers: Optional[dict] = None
+    response_headers: Optional[dict] = None
     status_code: Optional[int] = None
async_webcrawler_multiple_urls_example.py (new file, +48)

@@ -0,0 +1,48 @@
+# File: async_webcrawler_multiple_urls_example.py
+import os, sys
+# append 2 parent directories to sys.path to import crawl4ai
+parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.append(parent_dir)
+
+import asyncio
+from crawl4ai import AsyncWebCrawler
+
+async def main():
+    # Initialize the AsyncWebCrawler
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        # List of URLs to crawl
+        urls = [
+            "https://example.com",
+            "https://python.org",
+            "https://github.com",
+            "https://stackoverflow.com",
+            "https://news.ycombinator.com"
+        ]
+
+        # Set up crawling parameters
+        word_count_threshold = 100
+
+        # Run the crawling process for multiple URLs
+        results = await crawler.arun_many(
+            urls=urls,
+            word_count_threshold=word_count_threshold,
+            bypass_cache=True,
+            verbose=True
+        )
+
+        # Process the results
+        for result in results:
+            if result.success:
+                print(f"Successfully crawled: {result.url}")
+                print(f"Title: {result.metadata.get('title', 'N/A')}")
+                print(f"Word count: {len(result.markdown.split())}")
+                print(f"Number of links: {len(result.links.get('internal', [])) + len(result.links.get('external', []))}")
+                print(f"Number of images: {len(result.media.get('images', []))}")
+                print("---")
+            else:
+                print(f"Failed to crawl: {result.url}")
+                print(f"Error: {result.error_message}")
+                print("---")
+
+if __name__ == "__main__":
+    asyncio.run(main())
New example file (+45)

@@ -0,0 +1,45 @@
+import asyncio
+from crawl4ai import AsyncWebCrawler, AsyncPlaywrightCrawlerStrategy
+
+async def main():
+    # Example 1: Setting language when creating the crawler
+    crawler1 = AsyncWebCrawler(
+        crawler_strategy=AsyncPlaywrightCrawlerStrategy(
+            headers={"Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7"}
+        )
+    )
+    result1 = await crawler1.arun("https://www.example.com")
+    print("Example 1 result:", result1.extracted_content[:100])  # Print first 100 characters
+
+    # Example 2: Setting language before crawling
+    crawler2 = AsyncWebCrawler()
+    crawler2.crawler_strategy.headers["Accept-Language"] = "es-ES,es;q=0.9,en-US;q=0.8,en;q=0.7"
+    result2 = await crawler2.arun("https://www.example.com")
+    print("Example 2 result:", result2.extracted_content[:100])
+
+    # Example 3: Setting language when calling arun method
+    crawler3 = AsyncWebCrawler()
+    result3 = await crawler3.arun(
+        "https://www.example.com",
+        headers={"Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7"}
+    )
+    print("Example 3 result:", result3.extracted_content[:100])
+
+    # Example 4: Crawling multiple pages with different languages
+    urls = [
+        ("https://www.example.com", "fr-FR,fr;q=0.9"),
+        ("https://www.example.org", "es-ES,es;q=0.9"),
+        ("https://www.example.net", "de-DE,de;q=0.9"),
+    ]
+
+    crawler4 = AsyncWebCrawler()
+    results = await asyncio.gather(*[
+        crawler4.arun(url, headers={"Accept-Language": lang})
+        for url, lang in urls
+    ])
+
+    for url, result in zip([u for u, _ in urls], results):
+        print(f"Result for {url}:", result.extracted_content[:100])
+
+if __name__ == "__main__":
+    asyncio.run(main())

tests/async/test_basic_crawling.py (+1 -1)

@@ -5,7 +5,7 @@
 import time
 
 # Add the parent directory to the Python path
-parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 sys.path.append(parent_dir)
 
 from crawl4ai.async_webcrawler import AsyncWebCrawler

tests/async/test_screenshot.py (+124)

@@ -0,0 +1,124 @@
+import os
+import sys
+import pytest
+import asyncio
+import base64
+from PIL import Image
+import io
+
+# Add the parent directory to the Python path
+parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(parent_dir)
+
+from crawl4ai.async_webcrawler import AsyncWebCrawler
+
+@pytest.mark.asyncio
+async def test_basic_screenshot():
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        url = "https://example.com"  # A static website
+        result = await crawler.arun(url=url, bypass_cache=True, screenshot=True)
+
+        assert result.success
+        assert result.screenshot is not None
+
+        # Verify the screenshot is a valid image
+        image_data = base64.b64decode(result.screenshot)
+        image = Image.open(io.BytesIO(image_data))
+        assert image.format == "PNG"
+
+@pytest.mark.asyncio
+async def test_screenshot_with_wait_for():
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        # Using a website with dynamic content
+        url = "https://www.youtube.com"
+        wait_for = "css:#content"  # Wait for the main content to load
+
+        result = await crawler.arun(
+            url=url,
+            bypass_cache=True,
+            screenshot=True,
+            wait_for=wait_for
+        )
+
+        assert result.success
+        assert result.screenshot is not None
+
+        # Verify the screenshot is a valid image
+        image_data = base64.b64decode(result.screenshot)
+        image = Image.open(io.BytesIO(image_data))
+        assert image.format == "PNG"
+
+        # You might want to add more specific checks here, like image dimensions
+        # or even use image recognition to verify certain elements are present
+
+@pytest.mark.asyncio
+async def test_screenshot_with_js_wait_for():
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        url = "https://www.amazon.com"
+        wait_for = "js:() => document.querySelector('#nav-logo-sprites') !== null"
+
+        result = await crawler.arun(
+            url=url,
+            bypass_cache=True,
+            screenshot=True,
+            wait_for=wait_for
+        )
+
+        assert result.success
+        assert result.screenshot is not None
+
+        image_data = base64.b64decode(result.screenshot)
+        image = Image.open(io.BytesIO(image_data))
+        assert image.format == "PNG"
+
+@pytest.mark.asyncio
+async def test_screenshot_without_wait_for():
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        url = "https://www.nytimes.com"  # A website with lots of dynamic content
+
+        result = await crawler.arun(url=url, bypass_cache=True, screenshot=True)
+
+        assert result.success
+        assert result.screenshot is not None
+
+        image_data = base64.b64decode(result.screenshot)
+        image = Image.open(io.BytesIO(image_data))
+        assert image.format == "PNG"
+
+@pytest.mark.asyncio
+async def test_screenshot_comparison():
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        url = "https://www.reddit.com"
+        wait_for = "css:#SHORTCUT_FOCUSABLE_DIV"
+
+        # Take screenshot without wait_for
+        result_without_wait = await crawler.arun(
+            url=url,
+            bypass_cache=True,
+            screenshot=True
+        )
+
+        # Take screenshot with wait_for
+        result_with_wait = await crawler.arun(
+            url=url,
+            bypass_cache=True,
+            screenshot=True,
+            wait_for=wait_for
+        )
+
+        assert result_without_wait.success and result_with_wait.success
+        assert result_without_wait.screenshot is not None
+        assert result_with_wait.screenshot is not None
+
+        # Compare the two screenshots
+        image_without_wait = Image.open(io.BytesIO(base64.b64decode(result_without_wait.screenshot)))
+        image_with_wait = Image.open(io.BytesIO(base64.b64decode(result_with_wait.screenshot)))
+
+        # This is a simple size comparison. In a real-world scenario, you might want to use
+        # more sophisticated image comparison techniques.
+        assert image_with_wait.size[0] >= image_without_wait.size[0]
+        assert image_with_wait.size[1] >= image_without_wait.size[1]
+
+# Entry point for debugging
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
