Complete guide for using the Crawl4AI web scraping service on Haiven.
The service runs at https://crawler.haiven.local and is configured via /mnt/apps/docker/ai/crawl4ai/.env.
# Check health endpoint
curl https://crawler.haiven.local/health
# Expected response:
# {"status":"healthy","version":"0.2.x","cache_enabled":true}
All API requests require authentication via a Bearer token:
# Get your API token
grep CRAWL4AI_API_TOKEN /mnt/apps/docker/ai/crawl4ai/.env
# Use in requests
curl -H "Authorization: Bearer YOUR_TOKEN" https://crawler.haiven.local/crawl
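The same authenticated request from Python, reading the token from the environment instead of hard-coding it (a minimal sketch; the CRAWL4AI_API_TOKEN name matches the .env entry above):

```python
import os
import requests

# Token exported from the Crawl4AI .env file
API_TOKEN = os.environ["CRAWL4AI_API_TOKEN"]

response = requests.post(
    "https://crawler.haiven.local/crawl",
    headers={"Authorization": f"Bearer {API_TOKEN}"},
    json={"urls": ["https://example.com"]},
)
response.raise_for_status()
print(response.json()["results"][0]["metadata"]["title"])
```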
Crawl a single URL and get clean markdown output:
curl -X POST https://crawler.haiven.local/crawl \
-H "Content-Type: application/json" \
-H "Authorization: Bearer YOUR_TOKEN" \
-d '{
"urls": ["https://example.com"],
"word_count_threshold": 10
}'
Example response:
{
"results": [
{
"url": "https://example.com",
"success": true,
"markdown": {
"raw_markdown": "# Example Domain\n\nThis domain is for...",
"fit_markdown": "Example Domain - This domain is for..."
},
"metadata": {
"title": "Example Domain",
"description": "Example description",
"author": null
},
"links": ["https://www.iana.org/domains/example"]
}
]
}
# Get just the markdown content
curl -X POST https://crawler.haiven.local/crawl \
-H "Content-Type: application/json" \
-d '{"urls": ["https://example.com"]}' \
| jq -r '.results[0].markdown.raw_markdown'
For SPAs (React, Vue, Angular) that require JavaScript to render:
curl -X POST https://crawler.haiven.local/crawl \
-H "Content-Type: application/json" \
-d '{
"urls": ["https://github.com/trending"],
"js_code": "await new Promise(resolve => setTimeout(resolve, 3000));",
"wait_for": "css:.Box-row",
"word_count_threshold": 10
}'
| Parameter | Description | Example |
|---|---|---|
| `js_code` | JavaScript to execute on the page | `"await new Promise(r => setTimeout(r, 2000));"` |
| `wait_for` | Wait for a CSS selector | `"css:#content"` or `"css:.article"` |
| `word_count_threshold` | Minimum words per block | `10` |
# Wait for AJAX content to load
curl -X POST https://crawler.haiven.local/crawl \
-H "Content-Type: application/json" \
-d '{
"urls": ["https://spa-example.com"],
"js_code": "await new Promise(r => setTimeout(r, 5000));",
"wait_for": "css:[data-loaded=true]"
}'
Crawl multiple URLs in parallel:
curl -X POST https://crawler.haiven.local/crawl \
-H "Content-Type: application/json" \
-d '{
"urls": [
"https://docs.example.com/getting-started",
"https://docs.example.com/api-reference",
"https://docs.example.com/tutorials",
"https://docs.example.com/faq"
],
"word_count_threshold": 10
}'
# Extract titles and word counts from batch
curl -X POST https://crawler.haiven.local/crawl \
-H "Content-Type: application/json" \
-d '{"urls": ["https://example.com", "https://example.org"]}' \
| jq '.results[] | {url: .url, title: .metadata.title, words: (.markdown.raw_markdown | split(" ") | length)}'
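The same batch crawl from Python, skipping any URL that failed (a minimal sketch; per-URL retry logic is left out):

```python
import requests

urls = [
    "https://docs.example.com/getting-started",
    "https://docs.example.com/api-reference",
]

response = requests.post(
    "https://crawler.haiven.local/crawl",
    headers={"Authorization": "Bearer YOUR_TOKEN"},
    json={"urls": urls, "word_count_threshold": 10},
)
response.raise_for_status()

for result in response.json()["results"]:
    if not result["success"]:
        print(f"FAILED: {result['url']}")
        continue
    title = result["metadata"].get("title", "")
    words = len(result["markdown"]["raw_markdown"].split())
    print(f"{result['url']} - {title} ({words} words)")
```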
import requests
import hashlib
from datetime import datetime
CRAWL4AI_URL = "https://crawler.haiven.local"
API_TOKEN = "your_token"
def crawl_for_rag(url, chunk_size=1000):
"""Crawl URL and prepare chunks for RAG ingestion."""
response = requests.post(
f"{CRAWL4AI_URL}/crawl",
headers={
"Content-Type": "application/json",
"Authorization": f"Bearer {API_TOKEN}"
},
json={"urls": [url], "word_count_threshold": 10}
)
result = response.json()["results"][0]
markdown = result["markdown"]["raw_markdown"]
metadata = result["metadata"]
# Split into chunks by paragraphs
paragraphs = markdown.split('\n\n')
chunks = []
current_chunk = []
current_size = 0
for para in paragraphs:
para_size = len(para.split())
if current_size + para_size > chunk_size and current_chunk:
chunks.append('\n\n'.join(current_chunk))
current_chunk = [para]
current_size = para_size
else:
current_chunk.append(para)
current_size += para_size
if current_chunk:
chunks.append('\n\n'.join(current_chunk))
# Prepare documents for vector DB
documents = []
for i, chunk in enumerate(chunks):
doc = {
"id": hashlib.sha256(f"{url}_{i}".encode()).hexdigest(),
"text": chunk,
"metadata": {
"source": url,
"title": metadata.get("title", ""),
"chunk_index": i,
"total_chunks": len(chunks),
"crawled_at": datetime.now().isoformat()
}
}
documents.append(doc)
return documents
# Usage
docs = crawl_for_rag("https://docs.anthropic.com/claude")
print(f"Created {len(docs)} chunks for embedding")
from langchain.document_loaders import BaseLoader
from langchain.schema import Document
import requests
class Crawl4AILoader(BaseLoader):
"""LangChain document loader for Crawl4AI."""
def __init__(self, urls, api_token):
self.urls = urls if isinstance(urls, list) else [urls]
self.api_token = api_token
def load(self):
documents = []
response = requests.post(
"https://crawler.haiven.local/crawl",
headers={
"Content-Type": "application/json",
"Authorization": f"Bearer {self.api_token}"
},
json={"urls": self.urls, "word_count_threshold": 10}
)
for result in response.json()["results"]:
if result["success"]:
doc = Document(
page_content=result["markdown"]["raw_markdown"],
metadata={
"source": result["url"],
"title": result["metadata"].get("title", "")
}
)
documents.append(doc)
return documents
# Usage with LangChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
loader = Crawl4AILoader(
urls=["https://docs.example.com/guide"],
api_token="your_token"
)
docs = loader.load()
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = splitter.split_documents(docs)
import requests
class Crawl4AIClient:
def __init__(self, base_url="https://crawler.haiven.local", token=None):
self.base_url = base_url
self.headers = {"Content-Type": "application/json"}
if token:
self.headers["Authorization"] = f"Bearer {token}"
def crawl(self, urls, **options):
"""Crawl one or more URLs."""
if isinstance(urls, str):
urls = [urls]
payload = {"urls": urls, "word_count_threshold": 10, **options}
response = requests.post(
f"{self.base_url}/crawl",
headers=self.headers,
json=payload
)
response.raise_for_status()
return response.json()["results"]
def health(self):
"""Check service health."""
response = requests.get(f"{self.base_url}/health")
return response.json()
# Usage
client = Crawl4AIClient(token="your_token")
# Check health
print(client.health())
# Crawl single URL
results = client.crawl("https://example.com")
print(results[0]["markdown"]["raw_markdown"])
# Crawl with JS rendering
results = client.crawl(
"https://react-app.com",
js_code="await new Promise(r => setTimeout(r, 2000));",
wait_for="css:#app"
)
from urllib.parse import urljoin, urlparse
from collections import deque
import time
class SiteCrawler:
def __init__(self, client, max_pages=50, delay=0.5):
self.client = client
self.max_pages = max_pages
self.delay = delay
self.visited = set()
self.results = []
def crawl_site(self, start_url, same_domain_only=True):
"""Recursively crawl a website."""
queue = deque([start_url])
start_domain = urlparse(start_url).netloc
while queue and len(self.visited) < self.max_pages:
url = queue.popleft()
if url in self.visited:
continue
if same_domain_only and urlparse(url).netloc != start_domain:
continue
print(f"Crawling ({len(self.visited)+1}/{self.max_pages}): {url}")
try:
results = self.client.crawl(url)
if results and results[0]["success"]:
result = results[0]
self.visited.add(url)
self.results.append(result)
# Queue new links
for link in result.get("links", []):
abs_url = urljoin(url, link)
if abs_url not in self.visited:
queue.append(abs_url)
time.sleep(self.delay)
except Exception as e:
print(f" Error: {e}")
return self.results
# Usage
client = Crawl4AIClient(token="your_token")
crawler = SiteCrawler(client, max_pages=20)
results = crawler.crawl_site("https://docs.example.com")
print(f"\nCrawled {len(results)} pages")
for r in results:
print(f" {r['url']} - {r['metadata'].get('title', 'No title')}")
Force a fresh crawl that ignores cached content:
curl -X POST https://crawler.haiven.local/crawl \
-H "Content-Type: application/json" \
-d '{
"urls": ["https://example.com"],
"bypass_cache": true
}'
Extract only specific elements:
curl -X POST https://crawler.haiven.local/crawl \
-H "Content-Type: application/json" \
-d '{
"urls": ["https://news.ycombinator.com"],
"css_selector": ".athing",
"word_count_threshold": 5
}'
Capture a screenshot of the rendered page:
curl -X POST https://crawler.haiven.local/crawl \
-H "Content-Type: application/json" \
-d '{
"urls": ["https://example.com"],
"screenshot": true
}'
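The screenshot comes back base64-encoded in the result's screenshot field. A minimal Python sketch for saving it to an image file (the filename is illustrative):

```python
import base64
import requests

response = requests.post(
    "https://crawler.haiven.local/crawl",
    headers={"Authorization": "Bearer YOUR_TOKEN"},
    json={"urls": ["https://example.com"], "screenshot": True},
)
result = response.json()["results"][0]

if result.get("screenshot"):
    with open("example.png", "wb") as f:  # illustrative filename
        f.write(base64.b64decode(result["screenshot"]))
```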
1. Connection Refused
# Check if service is running
docker ps | grep crawl4ai
# Check logs
docker logs crawl4ai --tail 50
2. JavaScript Content Not Loading
# Increase wait time
{
"js_code": "await new Promise(r => setTimeout(r, 5000));",
"wait_for": "css:#content"
}
3. Memory Errors During Batch Crawl
# Reduce concurrent requests
# Edit docker-compose.yml:
# CRAWL4AI_MAX_CONCURRENT_REQUESTS=5
docker compose restart crawl4ai
4. Slow First Request
- First request downloads Chromium browser (~200MB)
- Subsequent requests use cached browser
- Allow 5-10 minutes for initial startup
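While the browser downloads, you can poll the health endpoint until it reports healthy (a minimal sketch; the timeout values are illustrative):

```python
import time
import requests

def wait_until_healthy(url="https://crawler.haiven.local/health", timeout=600):
    """Poll /health every 10 seconds until the service reports healthy."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            r = requests.get(url, timeout=5)
            if r.ok and r.json().get("status") == "healthy":
                return True
        except (requests.RequestException, ValueError):
            pass  # service not reachable or not answering with JSON yet
        time.sleep(10)
    return False

print("ready" if wait_until_healthy() else "timed out")
```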
# Container status
docker ps --filter name=crawl4ai
# Resource usage
docker stats crawl4ai --no-stream
# Cache size
du -sh /mnt/storage/crawler/cache
# Clear all cached content
rm -rf /mnt/storage/crawler/cache/*
# Clear old cache (>7 days)
find /mnt/storage/crawler/cache -type f -mtime +7 -delete
| Endpoint | Method | Description |
|---|---|---|
| `/health` | GET | Service health check |
| `/crawl` | POST | Crawl one or more URLs |
Request Body:
{
"urls": ["https://example.com"],
"word_count_threshold": 10,
"bypass_cache": false,
"css_selector": null,
"js_code": null,
"wait_for": null,
"screenshot": false
}
Response:
{
"results": [
{
"url": "string",
"success": true,
"markdown": {
"raw_markdown": "string",
"fit_markdown": "string"
},
"metadata": {
"title": "string",
"description": "string",
"author": "string"
},
"links": ["string"],
"screenshot": "base64_string (if requested)"
}
]
}
For more details, see the Implementation Guide and Crawl4AI Documentation.