# ============================================================================
# robots.txt — RAYSolute Consultants
# https://www.raysolute.com/robots.txt
# ============================================================================
# Version: 3.0 (2026 Standard)
# Last Updated: 2026-01-22
# Compliance: RFC 9309, Google Robots Specification, AI Crawler Best Practices
# AI Stance: OPEN — Training and Retrieval Allowed with Attribution
# ============================================================================

# ----------------------------------------------------------------------------
# SITEMAPS
# Sitemap records are independent of User-agent groups and may appear anywhere
# in this file; they are listed first purely for visibility. RFC 9309 treats
# "Sitemap" as an optional extension record and does not mandate its position.
# Each value is an absolute URL.
# ----------------------------------------------------------------------------
Sitemap: https://www.raysolute.com/sitemap.xml
Sitemap: https://www.raysolute.com/sitemap-pages.xml
Sitemap: https://www.raysolute.com/sitemap-case-studies.xml
Sitemap: https://www.raysolute.com/sitemap-resources.xml

# ============================================================================
# SECTION 1: DEFAULT POLICY
# ============================================================================
# Philosophy: Maximum discoverability for legitimate crawlers
# Protected: Admin, staging, session-based URLs, internal tools

# Default group. It applies ONLY to crawlers that are not matched by a more
# specific User-agent group elsewhere in this file: a crawler that has its own
# group uses that group and ignores everything below (RFC 9309, section 2.2.1).
User-agent: *
Allow: /
Allow: /.well-known/
Allow: /llm/

# Private / non-public areas.
Disallow: /counsellingapp/
Disallow: /admin/
Disallow: /staging/
Disallow: /temp/
Disallow: /private/
Disallow: /internal/

# Direct script URLs. "$" anchors the pattern to the end of the URL, so a
# .php URL that carries a query string is not matched by this rule.
Disallow: /*.php$

# Session and tracking parameters. A "/*?name" pattern matches only when the
# parameter is the FIRST one in the query string, so every parameter is also
# listed in its "&name" form to catch it in any later position.
Disallow: /*?session*
Disallow: /*&session*
Disallow: /*?sessionid*
Disallow: /*&sessionid*
Disallow: /*?utm_*
Disallow: /*&utm_*
Disallow: /*?ref=*
Disallow: /*&ref=*
Disallow: /*?source=*
Disallow: /*&source=*
Disallow: /*?tracking=*
Disallow: /*&tracking=*
Disallow: /*?fbclid=*
Disallow: /*&fbclid=*
Disallow: /*?gclid=*
Disallow: /*&gclid=*

# ============================================================================
# SECTION 2: SEARCH ENGINE CRAWLERS
# ============================================================================

# IMPORTANT: a crawler obeys only the most specific User-agent group that
# matches it and ignores "User-agent: *" (RFC 9309, section 2.2.1). A bare
# "Allow: /" here would therefore open /admin/, /staging/ and the other
# protected areas to every crawler named in this section, so the
# protected-path rules are repeated in each group.
# NOTE(review): the tracking-parameter and .php rules of the default group are
# not repeated here; add them if these crawlers should skip those URLs too.
#
# Crawl-delay is a non-standard extension (not defined in RFC 9309); crawlers
# that do not support it ignore it.

# Google: Search, Images, Video, News, Storebot, URL Inspection tool.
# Google-Extended is a control token (not a separate crawler) that governs use
# of content for Gemini model training and grounding; it does not affect
# Google Search. Consent: training for Gemini allowed.
# No Crawl-delay is set because Google does not support that directive.
User-agent: Googlebot
User-agent: Googlebot-Image
User-agent: Googlebot-Video
User-agent: Googlebot-News
User-agent: Storebot-Google
User-agent: Google-Extended
User-agent: Google-InspectionTool
Allow: /
Allow: /.well-known/
Allow: /llm/
Disallow: /counsellingapp/
Disallow: /admin/
Disallow: /staging/
Disallow: /temp/
Disallow: /private/
Disallow: /internal/
Disallow: /*?session*

# Microsoft Bing (main crawler, rate-limited)
User-agent: Bingbot
Allow: /
Allow: /.well-known/
Allow: /llm/
Disallow: /counsellingapp/
Disallow: /admin/
Disallow: /staging/
Disallow: /temp/
Disallow: /private/
Disallow: /internal/
Disallow: /*?session*
Crawl-delay: 1

# Yandex (main crawler, rate-limited)
User-agent: YandexBot
Allow: /
Allow: /.well-known/
Allow: /llm/
Disallow: /counsellingapp/
Disallow: /admin/
Disallow: /staging/
Disallow: /temp/
Disallow: /private/
Disallow: /internal/
Disallow: /*?session*
Crawl-delay: 2

# Other search crawlers, no rate limit requested:
# Bing preview / legacy MSN, Yahoo (Slurp), DuckDuckGo, Yandex Images,
# Baidu, Sogou, Naver (Yeti)
User-agent: BingPreview
User-agent: msnbot
User-agent: msnbot-media
User-agent: Slurp
User-agent: DuckDuckBot
User-agent: YandexImages
User-agent: Baiduspider
User-agent: Sogou
User-agent: Yeti
Allow: /
Allow: /.well-known/
Allow: /llm/
Disallow: /counsellingapp/
Disallow: /admin/
Disallow: /staging/
Disallow: /temp/
Disallow: /private/
Disallow: /internal/
Disallow: /*?session*

# ============================================================================
# SECTION 3: AI & LLM CRAWLERS (GEO Priority)
# ============================================================================
# RAYSolute EXPLICITLY WELCOMES AI crawlers for training and retrieval
# This supports our GEO (Generative Engine Optimization) strategy

# IMPORTANT: a crawler obeys only the most specific User-agent group that
# matches it and ignores "User-agent: *" (RFC 9309, section 2.2.1). The
# protected-path rules are therefore repeated in every group below; without
# them each named AI crawler would be allowed into /admin/, /staging/, etc.
# NOTE(review): the tracking-parameter and .php rules of the default group are
# not repeated here; add them if these crawlers should skip those URLs too.
#
# Crawlers are grouped by requested Crawl-delay so the rule set is written
# once per group. Crawl-delay is a non-standard extension; crawlers that do
# not support it ignore it.

# --- Rate-limited AI crawlers (Crawl-delay: 1) ---
# OpenAI (GPTBot: training + retrieval allowed; OAI-SearchBot), Anthropic
# (ClaudeBot), Perplexity, Apple (Applebot)
User-agent: GPTBot
User-agent: OAI-SearchBot
User-agent: ClaudeBot
User-agent: PerplexityBot
User-agent: Applebot
Allow: /
Allow: /.well-known/
Allow: /llm/
Allow: /llm/gossip.json
Allow: /llm/open-house.json
Allow: /toon.txt
Allow: /ai.txt
Allow: /llms.txt
Disallow: /counsellingapp/
Disallow: /admin/
Disallow: /staging/
Disallow: /temp/
Disallow: /private/
Disallow: /internal/
Disallow: /*?session*
Crawl-delay: 1

# --- Rate-limited AI crawlers (Crawl-delay: 2) ---
# Common Crawl (CCBot: training dataset inclusion allowed),
# ByteDance (Bytespider: Doubao/TikTok AI)
User-agent: CCBot
User-agent: Bytespider
Allow: /
Allow: /.well-known/
Allow: /llm/
Allow: /llm/gossip.json
Allow: /llm/open-house.json
Allow: /toon.txt
Allow: /ai.txt
Allow: /llms.txt
Disallow: /counsellingapp/
Disallow: /admin/
Disallow: /staging/
Disallow: /temp/
Disallow: /private/
Disallow: /internal/
Disallow: /*?session*
Crawl-delay: 2

# --- AI crawlers with no rate limit requested ---
# OpenAI (ChatGPT-User), Anthropic (Claude-Web, anthropic-ai), Meta AI
# (meta-externalagent, Meta-ExternalFetcher, FacebookBot,
# facebookexternalhit), Apple (Applebot-Extended: Apple Intelligence training
# allowed), Amazon, Cohere, AI2 (Allen Institute), You.com, Diffbot
# (knowledge graphs), Neeva (now Snowflake), Brave Search, Mojeek, Qwant,
# xAI (Grok), Mistral AI, DeepSeek
User-agent: ChatGPT-User
User-agent: Claude-Web
User-agent: anthropic-ai
User-agent: meta-externalagent
User-agent: Meta-ExternalFetcher
User-agent: FacebookBot
User-agent: facebookexternalhit
User-agent: Applebot-Extended
User-agent: Amazonbot
User-agent: cohere-ai
User-agent: AI2Bot
User-agent: YouBot
User-agent: Diffbot
User-agent: NeevaBot
User-agent: BraveBot
User-agent: MojeekBot
User-agent: Qwantify
User-agent: Grok
User-agent: xAI-Grok
User-agent: MistralBot
User-agent: DeepSeekBot
Allow: /
Allow: /.well-known/
Allow: /llm/
Allow: /llm/gossip.json
Allow: /llm/open-house.json
Allow: /toon.txt
Allow: /ai.txt
Allow: /llms.txt
Disallow: /counsellingapp/
Disallow: /admin/
Disallow: /staging/
Disallow: /temp/
Disallow: /private/
Disallow: /internal/
Disallow: /*?session*

# ============================================================================
# SECTION 4: SOCIAL MEDIA CRAWLERS
# ============================================================================

# Link-preview fetchers for social and messaging platforms.
# A crawler with its own group ignores "User-agent: *" (RFC 9309, section
# 2.2.1), so the protected paths are repeated here; a bare "Allow: /" would
# expose /admin/, /staging/, etc. to these agents.
# The tracking-parameter rules of the default group are deliberately NOT
# repeated: shared links frequently carry utm_/fbclid parameters, and blocking
# them would prevent previews from rendering for those URLs.
User-agent: Twitterbot
User-agent: LinkedInBot
User-agent: WhatsApp
User-agent: TelegramBot
User-agent: Slackbot
User-agent: Slackbot-LinkExpanding
User-agent: Discordbot
User-agent: redditbot
User-agent: Pinterest
User-agent: Pinterestbot
Allow: /
Disallow: /counsellingapp/
Disallow: /admin/
Disallow: /staging/
Disallow: /temp/
Disallow: /private/
Disallow: /internal/
Disallow: /*?session*

# ============================================================================
# SECTION 5: SEO & ANALYTICS TOOLS
# ============================================================================

# Third-party SEO / backlink crawlers.
# A crawler with its own group ignores "User-agent: *" (RFC 9309, section
# 2.2.1), so the protected paths are repeated in both groups below; a bare
# "Allow: /" would let these tools crawl and report /admin/, /staging/, etc.
# NOTE(review): the tracking-parameter and .php rules of the default group are
# not repeated here; add them if these tools should skip those URLs too.

# Heavy crawlers, rate-limited. Crawl-delay is a non-standard extension;
# crawlers that do not support it ignore it.
User-agent: AhrefsBot
User-agent: SemrushBot
User-agent: MJ12bot
Allow: /
Disallow: /counsellingapp/
Disallow: /admin/
Disallow: /staging/
Disallow: /temp/
Disallow: /private/
Disallow: /internal/
Disallow: /*?session*
Crawl-delay: 5

# Remaining SEO tools, no rate limit requested.
User-agent: DotBot
User-agent: BLEXBot
User-agent: SeznamBot
User-agent: ScreamingFrogSEOSpider
User-agent: rogerbot
User-agent: SiteAuditBot
Allow: /
Disallow: /counsellingapp/
Disallow: /admin/
Disallow: /staging/
Disallow: /temp/
Disallow: /private/
Disallow: /internal/
Disallow: /*?session*

# ============================================================================
# SECTION 6: BLOCKED BOTS (Malicious/Resource-Draining)
# ============================================================================

# Email harvesters, site rippers and offline downloaders: denied everything.
# All agents share one group; consecutive User-agent lines followed by a
# single rule set are equivalent to one identical group per agent.
# robots.txt is advisory only: it stops only clients that choose to honor it.
# NOTE(review): "Offline Explorer" contains a space, which is outside the
# RFC 9309 product-token grammar; confirm how target parsers read it.
User-agent: EmailCollector
User-agent: EmailSiphon
User-agent: EmailWolf
User-agent: WebBandit
User-agent: ExtractorPro
User-agent: CopyRightCheck
User-agent: Crescent
User-agent: Teleport
User-agent: TeleportPro
User-agent: WebStripper
User-agent: WebCopier
User-agent: WebZIP
User-agent: Wget
User-agent: HTTrack
User-agent: Zeus
User-agent: SiteSnagger
User-agent: WebLeacher
User-agent: ChinaClaw
User-agent: Harvest
User-agent: Offline Explorer
User-agent: WebReaper
Disallow: /

# ============================================================================
# SECTION 7: AI DISCOVERY ENDPOINTS
# ============================================================================
# These files are specifically designed for AI systems
# Explicitly allowed for all legitimate crawlers above
#
# Primary GEO Files:
# - /llms.txt — LLM guidance (llmstxt.org spec)
# - /ai.txt — AI permissions and guidelines
# - /toon.txt — Token-optimized structured data
# - /llm/gossip.json — Quick facts for AI citation
# - /llm/open-house.json — Complete site structure
# - /.well-known/assistant-gossip.json — Discovery pointer
# - /.well-known/security.txt — RFC 9116 security
#
# ============================================================================

# ============================================================================
# METADATA
# ============================================================================
# 
# IndexNow Endpoints:
# - https://api.indexnow.org/indexnow
# - https://www.bing.com/indexnow
# - https://yandex.com/indexnow
#
# Verification Tools:
# - Google Search Console: Robots.txt Tester
# - Bing Webmaster Tools: Robots.txt Analyzer
# - https://en.ryte.com/free-tools/robots-txt/
#
# Related Files:
# - https://www.raysolute.com/llms.txt
# - https://www.raysolute.com/ai.txt
# - https://www.raysolute.com/toon.txt
# - https://www.raysolute.com/.well-known/security.txt
# - https://www.raysolute.com/.well-known/directory.txt
#
# Contact for Bot Issues:
# - Email: aurobindo@raysolute.com
# - Subject: "Robots.txt Issue"
#
# ============================================================================
# END OF FILE
# ============================================================================