Skip to content

Commit

Permalink
Update documents and README
Browse files · Browse the repository at this point in the history
  • Loading branch information
unclecode committed Sep 25, 2024
1 parent f1eee09 commit 10cdad0
Show file tree
Hide file tree
Showing 4 changed files with 357 additions and 73 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -189,4 +189,6 @@ a.txt
.lambda_function.py
ec2*

update_changelog.sh
update_changelog.sh

.DS_Store
147 changes: 76 additions & 71 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,80 @@ if __name__ == "__main__":
asyncio.run(main())
```

### Extracting Structured Data without LLM

The `JsonCssExtractionStrategy` allows for precise extraction of structured data from web pages using CSS selectors.

```python
import asyncio
import json
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

async def extract_news_teasers():
    """Crawl the NBC News business page and print structured teaser data.

    Uses a CSS-selector schema (no LLM) to pull one record per teaser:
    category, headline, summary, time, nested image attributes, and link.
    """
    # Simple text fields share the same shape, so build them from pairs.
    text_fields = [
        {"name": field_name, "selector": css, "type": "text"}
        for field_name, css in (
            ("category", ".unibrow span[data-testid='unibrow-text']"),
            ("headline", ".wide-tease-item__headline"),
            ("summary", ".wide-tease-item__description"),
            ("time", "[data-testid='wide-tease-date']"),
        )
    ]

    # One extracted record per element matching baseSelector.
    teaser_schema = {
        "name": "News Teaser Extractor",
        "baseSelector": ".wide-tease-item__wrapper",
        "fields": text_fields
        + [
            {
                "name": "image",
                "type": "nested",
                "selector": "picture.teasePicture img",
                "fields": [
                    {"name": "src", "type": "attribute", "attribute": "src"},
                    {"name": "alt", "type": "attribute", "attribute": "alt"},
                ],
            },
            {
                "name": "link",
                "selector": "a[href]",
                "type": "attribute",
                "attribute": "href",
            },
        ],
    }

    strategy = JsonCssExtractionStrategy(teaser_schema, verbose=True)

    async with AsyncWebCrawler(verbose=True) as crawler:
        crawl_result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            extraction_strategy=strategy,
            bypass_cache=True,
        )

        assert crawl_result.success, "Failed to crawl the page"

        # extracted_content is a JSON string; decode before use.
        teasers = json.loads(crawl_result.extracted_content)
        print(f"Successfully extracted {len(teasers)} news teasers")
        print(json.dumps(teasers[0], indent=2))

if __name__ == "__main__":
    asyncio.run(extract_news_teasers())
```

For more advanced usage examples, check out our [Examples](https://crawl4ai.com/mkdocs/full_details/advanced_jsoncss_extraction.md) section in the documentation.

### Extracting Structured Data with OpenAI

```python
Expand Down Expand Up @@ -174,7 +248,7 @@ if __name__ == "__main__":
asyncio.run(main())
```

### Advanced Multi-Page Crawling with JavaScript Execution
### Session Management and Dynamic Content Crawling

Crawl4AI excels at handling complex scenarios, such as crawling multiple pages with dynamic content loaded via JavaScript. Here's an example of crawling GitHub commits across multiple pages:

Expand Down Expand Up @@ -240,77 +314,8 @@ if __name__ == "__main__":

This example demonstrates Crawl4AI's ability to handle complex scenarios where content is loaded asynchronously. It crawls multiple pages of GitHub commits, executing JavaScript to load new content and using custom hooks to ensure data is loaded before proceeding.

### Using JsonCssExtractionStrategy

The `JsonCssExtractionStrategy` allows for precise extraction of structured data from web pages using CSS selectors.

```python
import asyncio
import json
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

async def extract_news_teasers():
schema = {
"name": "News Teaser Extractor",
"baseSelector": ".wide-tease-item__wrapper",
"fields": [
{
"name": "category",
"selector": ".unibrow span[data-testid='unibrow-text']",
"type": "text",
},
{
"name": "headline",
"selector": ".wide-tease-item__headline",
"type": "text",
},
{
"name": "summary",
"selector": ".wide-tease-item__description",
"type": "text",
},
{
"name": "time",
"selector": "[data-testid='wide-tease-date']",
"type": "text",
},
{
"name": "image",
"type": "nested",
"selector": "picture.teasePicture img",
"fields": [
{"name": "src", "type": "attribute", "attribute": "src"},
{"name": "alt", "type": "attribute", "attribute": "alt"},
],
},
{
"name": "link",
"selector": "a[href]",
"type": "attribute",
"attribute": "href",
},
],
}

extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
For more advanced usage examples, check out our [Examples](https://crawl4ai.com/mkdocs/full_details/session_based_crawling.md) section in the documentation.

async with AsyncWebCrawler(verbose=True) as crawler:
result = await crawler.arun(
url="https://www.nbcnews.com/business",
extraction_strategy=extraction_strategy,
bypass_cache=True,
)

assert result.success, "Failed to crawl the page"

news_teasers = json.loads(result.extracted_content)
print(f"Successfully extracted {len(news_teasers)} news teasers")
print(json.dumps(news_teasers[0], indent=2))

if __name__ == "__main__":
asyncio.run(extract_news_teasers())
```

## Speed Comparison 🚀

Expand Down
Loading

0 comments on commit 10cdad0

Please sign in to comment.