Skip to content

Commit

Permalink
Update documents and README
Browse files · Browse the repository at this point in the history
  • Loading branch information
unclecode committed Sep 25, 2024
1 parent f1eee09 commit 10cdad0
Show file tree
Hide file tree
Showing 4 changed files with 357 additions and 73 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -189,4 +189,6 @@ a.txt
.lambda_function.py
ec2*

update_changelog.sh
update_changelog.sh

.DS_Store
147 changes: 76 additions & 71 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,80 @@ if __name__ == "__main__":
asyncio.run(main())
```

### Extracting Structured Data without LLM

The `JsonCssExtractionStrategy` allows for precise extraction of structured data from web pages using CSS selectors.

```python
import asyncio
import json
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

async def extract_news_teasers():
    """Crawl the NBC News business page and print structured teaser data.

    Uses a CSS-selector schema (no LLM) to pull one record per teaser:
    category, headline, summary, time, nested image attributes, and link.
    """
    # Simple text fields share the same shape, so build them from pairs.
    text_fields = [
        {"name": field_name, "selector": css, "type": "text"}
        for field_name, css in (
            ("category", ".unibrow span[data-testid='unibrow-text']"),
            ("headline", ".wide-tease-item__headline"),
            ("summary", ".wide-tease-item__description"),
            ("time", "[data-testid='wide-tease-date']"),
        )
    ]

    # One extracted record per element matching baseSelector.
    teaser_schema = {
        "name": "News Teaser Extractor",
        "baseSelector": ".wide-tease-item__wrapper",
        "fields": text_fields
        + [
            {
                "name": "image",
                "type": "nested",
                "selector": "picture.teasePicture img",
                "fields": [
                    {"name": "src", "type": "attribute", "attribute": "src"},
                    {"name": "alt", "type": "attribute", "attribute": "alt"},
                ],
            },
            {
                "name": "link",
                "selector": "a[href]",
                "type": "attribute",
                "attribute": "href",
            },
        ],
    }

    strategy = JsonCssExtractionStrategy(teaser_schema, verbose=True)

    async with AsyncWebCrawler(verbose=True) as crawler:
        crawl_result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            extraction_strategy=strategy,
            bypass_cache=True,
        )

        assert crawl_result.success, "Failed to crawl the page"

        # extracted_content is a JSON string; decode before use.
        teasers = json.loads(crawl_result.extracted_content)
        print(f"Successfully extracted {len(teasers)} news teasers")
        print(json.dumps(teasers[0], indent=2))

if __name__ == "__main__":
    asyncio.run(extract_news_teasers())
```

For more advanced usage examples, check out our [Examples](https://crawl4ai.com/mkdocs/full_details/advanced_jsoncss_extraction.md) section in the documentation.

### Extracting Structured Data with OpenAI

```python
Expand Down Expand Up @@ -174,7 +248,7 @@ if __name__ == "__main__":
asyncio.run(main())
```

### Advanced Multi-Page Crawling with JavaScript Execution
### Session Management and Dynamic Content Crawling

Crawl4AI excels at handling complex scenarios, such as crawling multiple pages with dynamic content loaded via JavaScript. Here's an example of crawling GitHub commits across multiple pages:

Expand Down Expand Up @@ -240,77 +314,8 @@ if __name__ == "__main__":

This example demonstrates Crawl4AI's ability to handle complex scenarios where content is loaded asynchronously. It crawls multiple pages of GitHub commits, executing JavaScript to load new content and using custom hooks to ensure data is loaded before proceeding.

### Using JsonCssExtractionStrategy

The `JsonCssExtractionStrategy` allows for precise extraction of structured data from web pages using CSS selectors.

```python
import asyncio
import json
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

async def extract_news_teasers():
schema = {
"name": "News Teaser Extractor",
"baseSelector": ".wide-tease-item__wrapper",
"fields": [
{
"name": "category",
"selector": ".unibrow span[data-testid='unibrow-text']",
"type": "text",
},
{
"name": "headline",
"selector": ".wide-tease-item__headline",
"type": "text",
},
{
"name": "summary",
"selector": ".wide-tease-item__description",
"type": "text",
},
{
"name": "time",
"selector": "[data-testid='wide-tease-date']",
"type": "text",
},
{
"name": "image",
"type": "nested",
"selector": "picture.teasePicture img",
"fields": [
{"name": "src", "type": "attribute", "attribute": "src"},
{"name": "alt", "type": "attribute", "attribute": "alt"},
],
},
{
"name": "link",
"selector": "a[href]",
"type": "attribute",
"attribute": "href",
},
],
}

extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
For more advanced usage examples, check out our [Examples](https://crawl4ai.com/mkdocs/full_details/session_based_crawling.md) section in the documentation.

async with AsyncWebCrawler(verbose=True) as crawler:
result = await crawler.arun(
url="https://www.nbcnews.com/business",
extraction_strategy=extraction_strategy,
bypass_cache=True,
)

assert result.success, "Failed to crawl the page"

news_teasers = json.loads(result.extracted_content)
print(f"Successfully extracted {len(news_teasers)} news teasers")
print(json.dumps(news_teasers[0], indent=2))

if __name__ == "__main__":
asyncio.run(extract_news_teasers())
```

## Speed Comparison 🚀

Expand Down
Loading

0 comments on commit 10cdad0

Please sign in to comment.