# Thynk Knowledge Hub — robots.txt
#
# Default rules for traditional search/web crawlers.
# This group is the fallback: it applies only to crawlers that match no named
# `User-agent` group elsewhere in this file. A crawler with its own named group
# follows that group and ignores this one.
User-agent: *
Disallow: /admin/
Disallow: /login
# Explicit "everything else is allowed". Keep it below the Disallow lines:
# parsers that use first-match instead of longest-match would otherwise let
# /admin/ and /login through.
Allow: /

# ---------------------------------------------------------------------------
# AI / LLM crawlers — explicit, granular rules
#
# Strategy: allow bots that surface our content as cited answers in
# AI search products; block bots that exist primarily to scrape
# training data with no traffic-back signal.
#
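# To flip a bot from allow → block: replace the group's rule lines with a
# single `Disallow: /`. Removing `Allow: /` is required, not optional — if it
# stays, it ties with `Disallow: /` and the tie is resolved in favour of Allow.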
# ---------------------------------------------------------------------------

# Every named group in this section repeats the /admin/ and /login rules on
# purpose: a crawler that matches a named group uses only that group and does
# not inherit anything from `User-agent: *`.

# OpenAI search crawler — surfaces pages in ChatGPT search / SearchGPT.
# https://platform.openai.com/docs/bots
User-agent: OAI-SearchBot
Disallow: /admin/
Disallow: /login
Allow: /

# OpenAI on-demand fetch (when a ChatGPT user shares a URL)
User-agent: ChatGPT-User
Disallow: /admin/
Disallow: /login
Allow: /

# Perplexity answer engine
User-agent: PerplexityBot
Disallow: /admin/
Disallow: /login
Allow: /

# Perplexity on-demand fetch (when a Perplexity user references a URL)
User-agent: Perplexity-User
Disallow: /admin/
Disallow: /login
Allow: /

# Anthropic / Claude — three separate user agents, one group each.
# https://support.anthropic.com/en/articles/8896518
# NOTE(review): this group was originally described as "fetches pages cited in
# Claude responses", but Anthropic's crawler docs appear to describe ClaudeBot
# as the training-data crawler. If so, the strategy stated at the top of this
# section would put it with the blocked bots — confirm before relying on it.
User-agent: ClaudeBot
Disallow: /admin/
Disallow: /login
Allow: /

# NOTE(review): presumably the on-demand fetch made when a Claude user
# references a URL — confirm against the Anthropic article linked above.
User-agent: Claude-User
Disallow: /admin/
Disallow: /login
Allow: /

# NOTE(review): presumably the crawler that feeds Claude's search results —
# confirm against the Anthropic article linked above.
User-agent: Claude-SearchBot
Disallow: /admin/
Disallow: /login
Allow: /

# Apple Intelligence / Apple search results
User-agent: Applebot
Disallow: /admin/
Disallow: /login
Allow: /

# Meta AI search
# NOTE(review): Meta's crawler docs appear to list AI model training among
# this agent's uses — confirm it belongs on the "allow" side of the strategy.
User-agent: meta-externalagent
Disallow: /admin/
Disallow: /login
Allow: /

# Amazon Alexa / Bedrock-related crawler
User-agent: Amazonbot
Disallow: /admin/
Disallow: /login
Allow: /

# Mistral AI
User-agent: MistralAI-User
Disallow: /admin/
Disallow: /login
Allow: /

# DuckDuckGo Assist
User-agent: DuckAssistBot
Disallow: /admin/
Disallow: /login
Allow: /

# Cohere
User-agent: cohere-ai
Disallow: /admin/
Disallow: /login
Allow: /

# ---------------------------------------------------------------------------
# Training-only bots — blocked
#
# These crawlers feed model training corpora with little or no
# traffic-back to the site. Keep them out unless you actively want
# to seed training data.
# ---------------------------------------------------------------------------

# OpenAI training crawler (separate from OAI-SearchBot)
User-agent: GPTBot
Disallow: /

# Google's AI training opt-out (Gemini, Vertex AI training)
# https://developers.google.com/search/docs/crawling-indexing/google-extended
# NOTE(review): this looks like a control token honoured by Google's existing
# crawlers rather than a separate bot, so blocking it should not affect
# regular Google Search crawling — confirm against the page linked above.
User-agent: Google-Extended
Disallow: /

# Apple Intelligence training opt-out (separate from Applebot)
# NOTE(review): presumably also a control token rather than a crawler of its
# own; the Applebot group still governs what Apple actually fetches — confirm.
User-agent: Applebot-Extended
Disallow: /

# Meta AI training (separate from meta-externalagent)
User-agent: FacebookBot
Disallow: /

# ByteDance / TikTok crawler — aggressive, low traffic-back
User-agent: Bytespider
Disallow: /

# Common Crawl — used by countless training corpora
User-agent: CCBot
Disallow: /

# Diffbot
# TODO: no rationale recorded for this entry — document why it is blocked.
User-agent: Diffbot
Disallow: /

# Omgili / Webz.io — sells scraped content as datasets
User-agent: Omgilibot
Disallow: /

# Second Omgili token — presumably an older user-agent name for the same
# operator as Omgilibot; TODO confirm.
User-agent: omgili
Disallow: /

# ImagesiftBot
# TODO: no rationale recorded for this entry — document why it is blocked.
User-agent: ImagesiftBot
Disallow: /

# YouBot
# NOTE(review): presumably You.com's crawler. If it backs an AI search product
# that cites sources, the strategy in this file would allow it — confirm.
User-agent: YouBot
Disallow: /

# ---------------------------------------------------------------------------
# Sitemaps
#
# Sitemap lines are not part of any User-agent group; they apply file-wide and
# are visible to every crawler regardless of the allow/block rules.
# NOTE(review): if sitemap-index.xml already lists sitemap.xml and
# image-sitemap.xml, the last two lines are redundant — confirm.
# ---------------------------------------------------------------------------
Sitemap: https://knowledge.thynk.cloud/sitemap-index.xml
Sitemap: https://knowledge.thynk.cloud/sitemap.xml
Sitemap: https://knowledge.thynk.cloud/image-sitemap.xml