"""
-------------------------------------------------------------------------
This file is part of the RAGSDK project.
Copyright (c) 2025 Huawei Technologies Co.,Ltd.
RAGSDK is licensed under Mulan PSL v2.
You can use this software according to the terms and conditions of the Mulan PSL v2.
You may obtain a copy of Mulan PSL v2 at:
http://license.coscl.org.cn/MulanPSL2
THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
See the Mulan PSL v2 for more details.
-------------------------------------------------------------------------
"""
"""
MXRAGCache 提供的对外API,用于初始化MXRAGCache的参数,在MXRAGCache运行之前进行调用
"""
import os
import re
import cachetools
from gptcache import Cache
from gptcache.manager.scalar_data import CacheBase
from gptcache.similarity_evaluation import ExactMatchEvaluation
from gptcache.config import Config
from gptcache.processor.pre import get_prompt
from loguru import logger
from mx_rag.cache import CacheConfig, SimilarityCacheConfig, EvictPolicy
from mx_rag.cache.cache_similarity.cache_similarity import CacheSimilarity
from mx_rag.cache.cache_storage.cache_vec_storage import CacheVecStorage
from mx_rag.cache.cache_emb.cache_emb import CacheEmb
from mx_rag.utils.common import validate_params, MB, GB
from mx_rag.utils.file_check import SecFileCheck, FileCheck, check_disk_free_space
def _cache_file_secure_check(file_path: str, file_max_size: int):
if not os.path.exists(file_path):
logger.warning("cache file not exist will create by flush")
return
SecFileCheck(file_path, file_max_size).check()
def _get_data_save_file(data_save_folder: str, cache_name: str, memory_only: bool = False):
"""
功能描述:
内部接口,根据用户配置创建缓存数据目录和文件
Args:
data_save_folder:str 缓存存储目录 由用户提供,需要符合安全标准
cache_name:str cache实例的键值
memory_only:bool 是否是memory_cache缓存 如果是则只需要创建data_map文件
Return:
vector_save_file:str similarity cache向量数据库缓存文件
sql_save_file:str similarity cache缓存数据缓存文件
data_save_file:str memory cache的缓存文件
"""
file_prefix = cache_name
FileCheck.dir_check(data_save_folder)
vector_save_file = ""
sql_save_file = ""
data_save_file = ""
if not memory_only:
vector_save_file = os.path.join(data_save_folder, f"{file_prefix}_vector_cache_file.index")
_cache_file_secure_check(vector_save_file, 20 * GB)
sql_save_file = os.path.join(data_save_folder, f"{file_prefix}_sql_cache_file.db")
_cache_file_secure_check(sql_save_file, 30 * GB)
else:
data_save_file = os.path.join(data_save_folder, f"{file_prefix}_data_map.txt")
_cache_file_secure_check(data_save_file, 100 * MB)
return vector_save_file, sql_save_file, data_save_file
def _maybe_create_cache_save_folder(config: CacheConfig):
FileCheck.check_input_path_valid(config.data_save_folder)
if config.lock:
with config.lock:
if not os.path.exists(config.data_save_folder):
os.makedirs(config.data_save_folder, 0o750)
else:
if not os.path.exists(config.data_save_folder):
os.makedirs(config.data_save_folder, 0o750)
if check_disk_free_space(os.path.dirname(config.data_save_folder), config.min_free_space):
raise Exception("Insufficient remaining space, please clear disk space")
def _get_gpt_cache_config(config: CacheConfig) -> Config:
config = Config(similarity_threshold=config.similarity_threshold,
auto_flush=config.auto_flush,
disable_report=config.disable_report
)
return config
def _init_mxrag_similar_cache(cache_obj: Cache, cache_name: str, config: SimilarityCacheConfig):
"""
功能描述:
内部接口,根据SimilarityCacheConfig 初始化指定cache_name的cache实例,为语义相关cache
Args:
cache_obj:Cache 缓存实例
cache_name:str cache实例的键值
config:SimilarityCacheConfig 语义相似缓存配置数据
Return:
None
"""
from gptcache.manager import get_data_manager
from gptcache.adapter.api import init_similar_cache
vector_save_file, sql_save_file, _ = _get_data_save_file(config.data_save_folder, cache_name)
vector_base = CacheVecStorage.create(**config.vector_config, vector_save_file=vector_save_file,
top_k=config.retrieval_top_k)
cache_base = CacheBase(config.cache_config, sql_url=f'{config.cache_config}:///{sql_save_file}')
similarity = CacheSimilarity.create(**config.similarity_config)
embedding = CacheEmb.create(**config.emb_config)
data_manager = get_data_manager(
cache_base=cache_base,
vector_base=vector_base,
max_size=config.cache_size,
eviction=config.eviction_policy.value,
clean_size=config.clean_size
)
gptcache_config = _get_gpt_cache_config(config)
init_similar_cache(
pre_func=get_prompt,
cache_obj=cache_obj,
data_manager=data_manager,
embedding=embedding,
evaluation=similarity,
config=gptcache_config
)
return vector_save_file, sql_save_file, vector_base
def _init_mxrag_memory_cache(cache_obj: Cache, cache_name: str, config: CacheConfig):
"""
功能描述:
内部接口,根据CacheConfig 初始化指定cache_name的cache实例,为memory_cache only
Args:
cache_obj:Cache 缓存实例
cache_name:str cache实例的键值
config:CacheConfig memory_cache 缓存配置数据
Return:
None
"""
from gptcache.manager.data_manager import MapDataManager
from gptcache.adapter.api import init_similar_cache
_, _, data_save_file = _get_data_save_file(config.data_save_folder, cache_name, True)
evict_policy_memory_map = {
EvictPolicy.LRU.value: cachetools.LRUCache,
EvictPolicy.LFU.value: cachetools.LFUCache,
EvictPolicy.FIFO.value: cachetools.FIFOCache,
EvictPolicy.RR.value: cachetools.RRCache
}
data_manager = MapDataManager(data_save_file,
config.cache_size,
evict_policy_memory_map.get(config.eviction_policy.value, cachetools.LRUCache))
gptcache_config = _get_gpt_cache_config(config)
init_similar_cache(
pre_func=get_prompt,
cache_obj=cache_obj,
data_manager=data_manager,
embedding=CacheEmb(skip_emb=True),
evaluation=ExactMatchEvaluation(),
config=gptcache_config
)
return data_save_file
@validate_params(
cache_name=dict(
validator=lambda x: isinstance(x, str) and 0 < len(x) < 64 and bool(re.fullmatch(r'[0-9a-zA-Z_]+', x)),
message="param must meets: Type is str, length range (0, 64), match '[0-9a-zA-Z_]+'"),
)
def _init_mxrag_cache(cache_obj: Cache, cache_name: str, config):
"""
功能描述:
内部接口,根据config 初始化指定cache_name的cache实例
Args:
cache_obj:Cache 缓存实例
cache_name:str cache实例的键值
config:CacheConfig/SimilarityCacheConfig 缓存配置数据
Return:
None
Raises:
ValueError: 当配置数据不在有效范围内时
"""
_maybe_create_cache_save_folder(config)
save_data_path = {}
if config.config_type == "similarity_cache_config":
save_data_path["vector_file"], save_data_path["sql_file"], save_data_path["vector_db"] =\
_init_mxrag_similar_cache(cache_obj, cache_name, config)
elif config.config_type == "memory_cache_config":
save_data_path["txt_file"] = _init_mxrag_memory_cache(cache_obj, cache_name, config)
else:
logger.error("config type not support. ")
return save_data_path