# robots.txt
#
# Layout: one consolidated "User-agent: *" group, then one group per fully
# blocked crawler, then the sitemap reference.
#
# A crawler obeys only the most specific group that matches its user-agent
# token, so the named crawlers further down ignore the "*" group and see only
# their own "Disallow: /".

# Global rules for all crawlers
# Every "*" rule lives in this single group. RFC 9309 requires parsers to merge
# repeated groups for the same user-agent, but some parsers honor only the
# first one, which would silently drop any rules placed in a later "*" group.
User-agent: *

# Allow rules are listed first so that first-match parsers honor them as well
# as longest-match parsers.
Allow: /index.html
# Keep the sitemap referenced at the bottom of this file fetchable; without
# this line it would be caught by "Disallow: /*.xml$" below.
Allow: /sitemap.xml

# Rate limiting through crawl-delay (in seconds).
# Crawl-delay is a non-standard extension (not part of RFC 9309); crawlers
# that do not implement it will ignore this line.
Crawl-delay: 10

# Private, administrative and backend areas
Disallow: /private/
Disallow: /admin/
Disallow: /backend/

# API endpoints ("/api/" covers every version prefix: /api/v1/, /api/v2/, /api/v3/, ...)
Disallow: /api/
Disallow: /graphql
Disallow: /rest/
Disallow: /feeds/
Disallow: /rss/

# WordPress backend paths
Disallow: /wp-admin/
Disallow: /wp-includes/
Disallow: /wp-content/plugins/
Disallow: /wp-json/

# Development/staging areas
Disallow: /dev/
Disallow: /staging/
Disallow: /test/
Disallow: /beta/

# User-specific content
Disallow: /users/
Disallow: /profiles/
Disallow: /account/
Disallow: /dashboard/

# Any URL that carries a query string. This also covers tracking parameters
# such as source=, ref=, utm_*, fbclid= and gclid=, in any position.
Disallow: /*?

# File types often used for training ("$" anchors the match to the end of the URL)
# NOTE(review): "/*.txt$" also matches files such as /ads.txt and
# /.well-known/security.txt - add Allow rules for them if the site serves any.
Disallow: /*.md$
Disallow: /*.markdown$
Disallow: /*.txt$
Disallow: /*.doc$
Disallow: /*.docx$
Disallow: /*.pdf$
Disallow: /*.csv$
Disallow: /*.xls$
Disallow: /*.xlsx$
Disallow: /*.xml$
Disallow: /*.json$
Disallow: /*.yaml$
Disallow: /*.yml$

# Known AI training crawlers (including GPT, Claude, and others)
# Each crawler below is denied the entire site.
# NOTE(review): vendors rename their crawlers over time (for example, Anthropic
# now documents "ClaudeBot"); review this list against current vendor docs.

# Common Crawl's crawler identifies itself as CCBot.
User-agent: CCBot
Disallow: /

User-agent: GPTBot
Disallow: /

User-agent: ChatGPT-User
Disallow: /

User-agent: Google-Extended
Disallow: /

User-agent: anthropic-ai
Disallow: /

User-agent: Claude-Web
Disallow: /

User-agent: Omgilibot
Disallow: /

User-agent: Omgili
Disallow: /

User-agent: FacebookBot
Disallow: /

User-agent: cohere-ai
Disallow: /

# Block Common AI Training Data Collectors
# (A user-agent token cannot contain spaces, so Common Crawl is blocked via
# its real token, CCBot, above.)
# NOTE(review): "CommonCrawler" is kept from the previous revision; confirm it
# corresponds to a real crawler token.
User-agent: CommonCrawler
Disallow: /

# NOTE(review): ia_archiver appears to be an archiving crawler rather than an
# AI-training one - confirm that blocking it is intended.
User-agent: ia_archiver
Disallow: /

# Sitemaps
# Sitemap lines are independent of the user-agent groups above.
# TODO: replace the placeholder host with the site's real domain.
Sitemap: https://your-domain.com/sitemap.xml