Skip to content

Commit bccadec

Browse files
committed
Remove dependency on psutil, PyYaml, and extend requests version range
1 parent 0759503 commit bccadec

File tree

5 files changed

+71
-27
lines changed

5 files changed

+71
-27
lines changed

README.md

+14-3
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc
3939
- 🔄 Session management for complex multi-page crawling scenarios
4040
- 🌐 Asynchronous architecture for improved performance and scalability
4141

42-
4342
## Installation 🛠️
4443

4544
Crawl4AI offers flexible installation options to suit various use cases. You can install it as a Python package or use Docker.
@@ -56,9 +55,21 @@ For basic web crawling and scraping tasks:
5655
pip install crawl4ai
5756
```
5857

59-
By default this will install the asynchronous version of Crawl4AI, using Playwright for web crawling.
58+
By default, this will install the asynchronous version of Crawl4AI, using Playwright for web crawling.
59+
60+
👉 Note: When you install Crawl4AI, the setup script should automatically install and set up Playwright. However, if you encounter any Playwright-related errors, you can manually install it using one of these methods:
61+
62+
1. Through the command line:
63+
```bash
64+
playwright install
65+
```
66+
67+
2. If the above doesn't work, try this more specific command:
68+
```bash
69+
python -m playwright install chromium
70+
```
6071

61-
👉 Note: The standard version of Crawl4AI uses Playwright for asynchronous crawling. If you encounter an error saying that Playwright is not installed, you can run playwright install. However, this should be done automatically during the setup process.
72+
This second method has proven to be more reliable in some cases.
6273

6374
#### Installation with Synchronous Version
6475

crawl4ai/async_crawler_strategy.py

+1-10
Original file line numberDiff line numberDiff line change
@@ -3,24 +3,15 @@
33
from abc import ABC, abstractmethod
44
from typing import Callable, Dict, Any, List, Optional
55
import os
6-
import psutil
76
from playwright.async_api import async_playwright, Page, Browser, Error
87
from io import BytesIO
98
from PIL import Image, ImageDraw, ImageFont
10-
from .utils import sanitize_input_encode
9+
from .utils import sanitize_input_encode, calculate_semaphore_count
1110
import json, uuid
1211
import hashlib
1312
from pathlib import Path
1413
from playwright.async_api import ProxySettings
1514
from pydantic import BaseModel
16-
17-
def calculate_semaphore_count():
    """Return how many concurrent instances this host should run.

    The count is half the logical CPU count, capped by total physical
    memory under the assumption that each instance needs about 2 GB.

    Returns:
        int: A concurrency limit that is always at least 1.
    """
    # os.cpu_count() may return None when the count cannot be determined.
    cpu_count = os.cpu_count() or 1
    memory_gb = psutil.virtual_memory().total / (1024 ** 3)  # Convert to GB
    base_count = max(1, cpu_count // 2)
    memory_based_cap = int(memory_gb / 2)  # Assume 2GB per instance
    # On hosts with less than 2 GB the cap truncates to 0; never hand back
    # a zero-permit limit, which would block every waiter forever.
    return max(1, min(base_count, memory_based_cap))
23-
2415
class AsyncCrawlResponse(BaseModel):
2516
html: str
2617
response_headers: Dict[str, str]

crawl4ai/utils.py

+41
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import html
77
import re
88
import os
9+
import platform
910
from html2text import HTML2Text
1011
from .prompts import PROMPT_EXTRACT_BLOCKS
1112
from .config import *
@@ -18,6 +19,46 @@
1819
class InvalidCSSSelectorError(Exception):
    """Exception type named for invalid-CSS-selector failures.

    NOTE(review): the raise sites are outside this excerpt; confirm the
    exact conditions against the callers.
    """
2021

22+
def calculate_semaphore_count():
    """Return how many concurrent instances this host should run.

    The count is half the logical CPU count, capped by total physical
    memory under the assumption that each instance needs about 2 GB.
    If the memory size cannot be determined, only the CPU-based count
    is used.

    Returns:
        int: A concurrency limit that is always at least 1.
    """
    # os.cpu_count() may return None when the count cannot be determined.
    cpu_count = os.cpu_count() or 1
    base_count = max(1, cpu_count // 2)

    try:
        memory_bytes = get_system_memory()
    except OSError:
        # Unsupported platform or unreadable memory info: degrade to the
        # CPU-based estimate instead of failing the caller.
        memory_bytes = None
    if not memory_bytes:
        return base_count

    memory_gb = memory_bytes / (1024 ** 3)  # Convert to GB
    memory_based_cap = int(memory_gb / 2)  # Assume 2GB per instance
    # On hosts with less than 2 GB the cap truncates to 0; never hand back
    # a zero-permit limit, which would block every waiter forever.
    return max(1, min(base_count, memory_based_cap))
28+
29+
def get_system_memory():
    """Return the total physical memory of the host, in bytes.

    The lookup uses only the standard library:

    * Linux: the ``MemTotal`` entry of ``/proc/meminfo``.
    * macOS: ``sysctl -n hw.memsize``.
    * Windows: ``GlobalMemoryStatusEx`` from ``kernel32`` via ``ctypes``.

    Returns:
        int: Total physical memory in bytes.

    Raises:
        OSError: If the platform is not one of the above, if
            ``/proc/meminfo`` has no ``MemTotal`` entry, or if the
            Windows API call reports failure.
    """
    system = platform.system()

    if system == "Linux":
        with open('/proc/meminfo', 'r') as mem:
            for line in mem:
                if line.startswith('MemTotal:'):
                    return int(line.split()[1]) * 1024  # Convert KB to bytes
        # Without this the function would fall through and return None.
        raise OSError("MemTotal entry not found in /proc/meminfo")

    if system == "Darwin":  # macOS
        import subprocess
        output = subprocess.check_output(['sysctl', '-n', 'hw.memsize']).decode('utf-8')
        return int(output.strip())

    if system == "Windows":
        import ctypes
        kernel32 = ctypes.windll.kernel32
        c_ulonglong = ctypes.c_ulonglong

        class MEMORYSTATUSEX(ctypes.Structure):
            _fields_ = [
                ('dwLength', ctypes.c_ulong),
                ('dwMemoryLoad', ctypes.c_ulong),
                ('ullTotalPhys', c_ulonglong),
                ('ullAvailPhys', c_ulonglong),
                ('ullTotalPageFile', c_ulonglong),
                ('ullAvailPageFile', c_ulonglong),
                ('ullTotalVirtual', c_ulonglong),
                ('ullAvailVirtual', c_ulonglong),
                ('ullAvailExtendedVirtual', c_ulonglong),
            ]

        memory_status = MEMORYSTATUSEX()
        # The API requires dwLength to be set before the call.
        memory_status.dwLength = ctypes.sizeof(MEMORYSTATUSEX)
        # A zero return value means failure; the struct would then still
        # hold zeros, so report the error instead of returning 0 bytes.
        if not kernel32.GlobalMemoryStatusEx(ctypes.byref(memory_status)):
            raise ctypes.WinError()
        return memory_status.ullTotalPhys

    raise OSError("Unsupported operating system")
2162

2263
def get_home_folder():
2364
home_folder = os.path.join(Path.home(), ".crawl4ai")

requirements.txt

+2-4
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,5 @@ numpy>=1.26.0,<2.1.1
66
pillow==10.4.0
77
playwright==1.47.0
88
python-dotenv==1.0.1
9-
requests==2.32.3
10-
PyYAML==6.0.2
11-
beautifulsoup4==4.12.3
12-
psutil==6.0.0
9+
requests>=2.26.0,<2.32.3
10+
beautifulsoup4==4.12.3

setup.py

+13-10
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from pathlib import Path
55
import shutil
66
import subprocess
7+
import sys
78

89
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
910
# If the folder already exists, remove the cache folder
@@ -35,21 +36,23 @@
3536
cosine_similarity_requirements = ["torch", "transformers", "nltk", "spacy"]
3637
sync_requirements = ["selenium"]
3738

38-
def install_playwright():
    """Install the Playwright browsers using the current interpreter.

    Runs ``<sys.executable> -m playwright install`` and prints the outcome.
    Any failure is reported on stdout together with the manual command;
    nothing is re-raised.
    """
    manual_hint = "Please run 'python -m playwright install' manually after the installation."
    command = [sys.executable, "-m", "playwright", "install"]

    print("Installing Playwright browsers...")
    try:
        subprocess.check_call(command)
        print("Playwright installation completed successfully.")
    except subprocess.CalledProcessError as exc:
        # The installer ran but exited with a non-zero status.
        print(f"Error during Playwright installation: {exc}")
        print(manual_hint)
    except Exception as exc:
        # Anything else, e.g. the interpreter could not be launched.
        print(f"Unexpected error during Playwright installation: {exc}")
        print(manual_hint)
4750

4851
class PostInstallCommand(install):
    """Install command that also sets up the Playwright browsers."""

    def run(self):
        """Run the base ``install`` step, then call ``install_playwright()``."""
        super().run()
        install_playwright()
55+
5356
setup(
5457
name="Crawl4AI",
5558
version=version,
@@ -61,7 +64,7 @@ def run(self):
6164
author_email="[email protected]",
6265
license="MIT",
6366
packages=find_packages(),
64-
install_requires=default_requirements,
67+
install_requires=default_requirements + ["playwright"], # Add playwright to default requirements
6568
extras_require={
6669
"torch": torch_requirements,
6770
"transformer": transformer_requirements,

0 commit comments

Comments
 (0)