"""Language detection utility for ContextEngine.
Single implementation used across extraction, indexing, and compression.
Supports: zh-CN, ja, ko, ru, en (default).
"""
import re
def detect_language(text: str) -> str:
    """Detect the language of ``text`` from the Unicode scripts it contains.

    Checks run in a fixed priority order and the first match wins:

    1. Any Hiragana or Katakana character (U+3040-U+30FF) -> ``"ja"``.
    2. Any CJK Unified Ideograph (U+4E00-U+9FFF) -> ``"zh-CN"``.
    3. Hangul -- at least 2 characters that make up at least 10% of the
       non-whitespace characters -> ``"ko"``.
    4. Cyrillic (U+0400-U+04FF) -- same threshold -> ``"ru"``.
    5. Anything else, including empty or whitespace-only text -> ``"en"``.

    Args:
        text: Text to analyze (can combine multiple strings with join).

    Returns:
        Language code: "zh-CN", "ja", "ko", "ru", or "en" (default).
    """
    if not text.strip():
        return "en"

    # Presence checks: a single character is enough. Kana is tested before
    # ideographs, so text containing both is reported as Japanese.
    if re.search(r"[\u3040-\u30ff]", text):
        return "ja"
    if re.search(r"[\u4e00-\u9fff]", text):
        return "zh-CN"

    # Ratio checks. The denominator is only needed from here on; it is at
    # least 1 because strip() above left a non-whitespace character.
    non_space = len(re.findall(r"\S", text))

    # Hangul covers every Hangul block, not just precomposed syllables:
    #   U+1100-U+11FF  conjoining jamo (what NFD-decomposed Korean consists of)
    #   U+3130-U+318F  compatibility jamo (standalone letters such as U+314B)
    #   U+A960-U+A97F  jamo extended-A
    #   U+AC00-U+D7FF  syllables plus jamo extended-B
    hangul = len(
        re.findall(r"[\u1100-\u11ff\u3130-\u318f\ua960-\ua97f\uac00-\ud7ff]", text)
    )
    if hangul >= 2 and hangul / non_space >= 0.10:
        return "ko"

    cyrillic = len(re.findall(r"[\u0400-\u04ff]", text))
    if cyrillic >= 2 and cyrillic / non_space >= 0.10:
        return "ru"

    return "en"