# robots.txt for hsec.de
# Updated: February 2026
#
# This file controls both traditional search engine crawlers
# and AI/LLM training scrapers.

# =============================================================================
# TRADITIONAL SEARCH ENGINES (Allowed)
# =============================================================================

# Default policy for well-behaved search engines.
# Note: the wildcard (*) and end-anchor ($) path patterns below are
# Google/Bing extensions and are not honored by every crawler.
User-agent: *
Disallow: /admin/
Disallow: /private/
Disallow: /api/
Disallow: /*.json$
Disallow: /*.xml$
Allow: /

# Sitemap location
Sitemap: https://hsec.de/sitemap.xml

# =============================================================================
# AI SCRAPERS & LLM TRAINING BOTS (Blocked)
# =============================================================================

# These bots scrape content for AI model training without permission.
# We block them to protect our intellectual property.

# OpenAI (ChatGPT training)
User-agent: GPTBot
Disallow: /

User-agent: ChatGPT-User
Disallow: /

# Google AI (Bard/Gemini training)
User-agent: Google-Extended
Disallow: /

# Anthropic (Claude training; ClaudeBot is the current crawler,
# anthropic-ai and Claude-Web are older tokens)
User-agent: ClaudeBot
Disallow: /

User-agent: anthropic-ai
Disallow: /

User-agent: Claude-Web
Disallow: /

# Common Crawl (corpus used by many AI companies)
User-agent: CCBot
Disallow: /

# Cohere
User-agent: cohere-ai
Disallow: /

# Perplexity AI
User-agent: PerplexityBot
Disallow: /

# Meta/Facebook AI
User-agent: FacebookBot
Disallow: /

User-agent: Meta-ExternalAgent
Disallow: /

# Amazon
User-agent: Amazonbot
Disallow: /

# Alexa Internet crawler (also used by the Internet Archive's Wayback Machine)
User-agent: ia_archiver
Disallow: /

# ByteDance (TikTok)
User-agent: Bytespider
Disallow: /

# Apple AI
User-agent: Applebot-Extended
Disallow: /

# Diffbot
User-agent: Diffbot
Disallow: /

# Omgili
User-agent: omgili
Disallow: /

User-agent: omgilibot
Disallow: /

# YouBot (You.com AI)
User-agent: YouBot
Disallow: /

# =============================================================================
# AGGRESSIVE/MISBEHAVING CRAWLERS (Blocked)
# =============================================================================

# SEO scrapers known for ignoring rules or for aggressive crawling
User-agent: AhrefsBot
Disallow: /

User-agent: SemrushBot
Disallow: /

User-agent: DotBot
Disallow: /

User-agent: MJ12bot
Disallow: /

User-agent: BLEXBot
Disallow: /

# =============================================================================
# CRAWL RATE LIMITS FOR ALLOWED BOTS
# =============================================================================

# A crawler obeys only the most specific User-agent group that matches it,
# so each named group below must restate the default Disallow rules.
# Without them, these bots would ignore the "User-agent: *" section entirely
# and crawl /admin/, /private/ and /api/.

# Google
# (Googlebot ignores Crawl-delay; its crawl rate is managed via Search
# Console. The directive is kept here for documentation only.)
User-agent: Googlebot
Crawl-delay: 1
Disallow: /admin/
Disallow: /private/
Disallow: /api/
Disallow: /*.json$
Disallow: /*.xml$
Allow: /

# Bing
User-agent: Bingbot
Crawl-delay: 1
Disallow: /admin/
Disallow: /private/
Disallow: /api/
Disallow: /*.json$
Disallow: /*.xml$
Allow: /

# DuckDuckGo
User-agent: DuckDuckBot
Crawl-delay: 2
Disallow: /admin/
Disallow: /private/
Disallow: /api/
Disallow: /*.json$
Disallow: /*.xml$
Allow: /

# Yandex ("Yandex" matches all Yandex crawler variants)
User-agent: Yandex
Crawl-delay: 2
Disallow: /admin/
Disallow: /private/
Disallow: /api/
Disallow: /*.json$
Disallow: /*.xml$
Allow: /

# =============================================================================
# NOTES
# =============================================================================
#
# AI Scraper Policy:
# We block AI training scrapers to protect our intellectual property,
# including proprietary insights, technical documentation, and client work.
# Our content is available for reading by humans via browsers, but not for
# bulk scraping or AI model training without explicit permission.
#
# robots.txt is advisory, not access control: compliant crawlers honor it,
# but bots that ignore it must be blocked at the server level (e.g. by
# user-agent or IP filtering).
#
# For licensing inquiries: info@hsec.de
#
# Traditional search engines (Google Search, Bing, etc.) are allowed to
# index our content for search purposes, as this serves legitimate user needs.
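
# =============================================================================
# VERIFICATION (optional)
# =============================================================================
#
# A minimal sketch for sanity-checking the rules above with Python's standard
# urllib.robotparser; kept commented out so this file remains a plain
# robots.txt. Caveat: urllib.robotparser does not understand the wildcard/$
# path extensions used above, so only plain-prefix rules are exercised here.
#
#   import urllib.robotparser as rp
#
#   p = rp.RobotFileParser()
#   p.set_url("https://hsec.de/robots.txt")
#   p.read()
#
#   # Blocked AI bot: matched by its own group, denied everywhere.
#   print(p.can_fetch("GPTBot", "https://hsec.de/"))           # expected: False
#
#   # Googlebot obeys only its named group, which restates the disallows.
#   print(p.can_fetch("Googlebot", "https://hsec.de/admin/"))  # expected: False
#   print(p.can_fetch("Googlebot", "https://hsec.de/"))        # expected: True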