import json
import re
from collections import OrderedDict
from unittest.mock import patch, AsyncMock
import pytest
from openjiuwen_deepsearch.algorithm.source_trace.citation_checker_research import CitationCheckerResearch
from openjiuwen_deepsearch.common.exception import CustomIndexException
from openjiuwen_deepsearch.common.status_code import StatusCode
class MockMatch:
    """Minimal stand-in for an ``re.Match`` object, used by the tests.

    Only ``start()``, ``end()`` and the ``string`` attribute are provided.
    """

    def __init__(self, start, end, string="mocked string"):
        # Both offsets are only ever read back, so keep them together.
        self._bounds = (start, end)
        self.string = string

    def start(self):
        """Return the start offset supplied at construction."""
        return self._bounds[0]

    def end(self):
        """Return the end offset supplied at construction."""
        return self._bounds[1]
class TestResearchCitationChecker:
    """Unit tests exercising the main entry points of CitationCheckerResearch."""

    # Inline-citation regex shared by the single-citation and replacement tests.
    _TRACER_REGEX = r'\[source_tracer_result\](!)?\[(.*?)\](?:<(.*?)>|\((.*?)\))'
    # Paragraph containing exactly one source-tracer citation.
    _SAMPLE_PARA = "这是一个测试[source_tracer_result][示例](https://example.com)引用。"

    def setup_method(self):
        """Create a fresh checker on ``self.checker`` before every test."""
        self.checker = CitationCheckerResearch("mock_model")

    # ------------------------------------------------------------------
    # Private helpers (not collected by pytest)
    # ------------------------------------------------------------------
    def _sample_match(self):
        """Return the regex match for the citation in the sample paragraph."""
        return re.search(self._TRACER_REGEX, self._SAMPLE_PARA)

    @staticmethod
    def _dedupe(new_score, new_span):
        """Call ``remove_duplicate_citations`` for a second citation of one URL.

        The first citation (index 0, score 0.8, span 10-50) is already
        recorded as processed; the second (index 1) uses ``new_score`` and
        the ``(start, end)`` pair ``new_span``.

        Returns:
            Tuple ``(url, datas, seen, removed)`` where ``seen`` is the
            processed-URL mapping passed in and ``removed`` is the call's
            return value.
        """
        url = "https://example.com"
        datas = [
            {'score': 0.8, 'match': MockMatch(10, 50)},
            {'score': new_score, 'match': MockMatch(*new_span)},
        ]
        seen = {url: {'score': 0.8, 'data_index': 0}}
        instance = CitationCheckerResearch("mock_model")
        removed = instance.remove_duplicate_citations(url, datas, seen, 1)
        return url, datas, seen, removed

    async def _check(self, text, datas):
        """Run ``checker`` with the verifier's ``run`` stubbed to return ``datas``.

        Returns:
            Tuple ``(parsed, verify_stub)``: the JSON-decoded result and the
            ``AsyncMock`` that replaced ``citation_verifier.run``.
        """
        with patch.object(self.checker.citation_verifier, 'run', new_callable=AsyncMock) as verify_stub:
            verify_stub.return_value = datas
            raw = await self.checker.checker(text, datas)
        return json.loads(raw), verify_stub

    # ------------------------------------------------------------------
    # Construction and URL validation
    # ------------------------------------------------------------------
    def test_init(self):
        """A new checker exposes a non-None ``citation_verifier``."""
        instance = CitationCheckerResearch("mock_model")
        assert hasattr(instance, 'citation_verifier')
        assert instance.citation_verifier is not None

    def test_validate_url_match_exact_match(self):
        """An identical URL is returned unchanged and reported as valid."""
        target = "https://example.com"
        records = [{'url': target, 'valid': True}]
        instance = CitationCheckerResearch("mock_model")
        returned_url, valid_flag = instance.validate_url_match(target, records, 0)
        assert returned_url == target
        assert valid_flag is True

    @patch('openjiuwen_deepsearch.algorithm.source_trace.citation_checker_research.are_similar_urls')
    @patch('openjiuwen_deepsearch.algorithm.source_trace.citation_checker_research.LogManager')
    def test_validate_url_match_mismatch_with_logging(self, mock_log_manager, mock_are_similar_urls):
        """A non-similar URL is reported invalid while the cited URL is returned."""
        mock_are_similar_urls.return_value = False
        mock_log_manager.is_sensitive.return_value = False
        target = "https://example.com"
        records = [{'url': "https://different.com", 'valid': True}]
        instance = CitationCheckerResearch("mock_model")
        returned_url, valid_flag = instance.validate_url_match(target, records, 0)
        assert returned_url == target
        assert valid_flag is False

    # ------------------------------------------------------------------
    # Duplicate handling
    # ------------------------------------------------------------------
    def test_handle_duplicate_citations_keep_higher_score(self):
        """A higher-scoring duplicate replaces the recorded one; the old entry is invalidated."""
        url, datas, seen, removed = self._dedupe(0.9, (52, 92))
        assert seen[url]['score'] == 0.9
        assert seen[url]['data_index'] == 1
        assert datas[0]["valid"] is False
        assert datas[0]["invalid_reason"] == "score lower than another citation"
        assert 0 in removed

    def test_handle_duplicate_citations_keep_existing_score(self):
        """A lower-scoring duplicate is invalidated and the recorded entry is kept."""
        url, datas, seen, removed = self._dedupe(0.6, (52, 92))
        assert seen[url]['score'] == 0.8
        assert seen[url]['data_index'] == 0
        assert datas[1]["valid"] is False
        assert datas[1]["invalid_reason"] == "score lower than another citation"
        assert 1 in removed

    def test_handle_duplicate_citations_non_adjacent(self):
        """Duplicates far apart in the text both stay valid; only the record is updated."""
        url, datas, seen, removed = self._dedupe(0.9, (100, 140))
        assert seen[url]['score'] == 0.9
        assert seen[url]['data_index'] == 1
        assert datas[0].get("valid", True)
        assert datas[1].get("valid", True)
        assert len(removed) == 0

    # ------------------------------------------------------------------
    # Front-end payload
    # ------------------------------------------------------------------
    def test_save_citation_message_filter_invalid_and_images(self):
        """Invalid and image entries are excluded from the front-end payload."""
        web_entry = {'url': 'https://example.com', 'title': 'Example', 'valid': True}
        invalid_entry = {'url': 'https://invalid.com', 'title': 'Invalid', 'valid': False}
        image_entry = {'url': 'https://image.com', 'title': 'Image', 'valid': True, 'is_image': True}
        payload = CitationCheckerResearch.organize_citations_for_frontend(
            [web_entry, invalid_entry, image_entry])
        assert payload['code'] == 0
        assert len(payload['data']) == 1
        only = payload['data'][0]
        assert only['url'] == "https://example.com"
        assert only['from'] == 'web'

    def test_save_citation_message_local_source(self):
        """A non-HTTP path is labelled as coming from ``'local'``."""
        local_entry = {
            'url': 'local/path/file.pdf',
            'title': 'Local',
            'valid': True,
            'is_image': False,
        }
        payload = CitationCheckerResearch.organize_citations_for_frontend([local_entry])
        assert payload['code'] == 0
        assert len(payload['data']) == 1
        assert payload['data'][0]['from'] == 'local'

    def test_save_citation_message_reuses_stable_id_and_omits_offsets(self):
        """Existing ``id``/``reference_index`` are kept and no offset keys are emitted."""
        entry = {
            "url": "https://example.com",
            "title": "Example",
            "valid": True,
            "id": 7,
            "reference_index": 1,
        }
        payload = CitationCheckerResearch.organize_citations_for_frontend([entry])
        assert payload["code"] == 0
        first = payload["data"][0]
        assert first["id"] == 7
        assert first["reference_index"] == 1
        assert "citation_start_offset" not in first
        assert "citation_end_offset" not in first

    # ------------------------------------------------------------------
    # Single citation / single paragraph processing
    # ------------------------------------------------------------------
    @patch('openjiuwen_deepsearch.algorithm.source_trace.citation_checker_research.LogManager')
    def test_process_single_citation_with_logging(self, mock_log_manager):
        """A valid matching citation yields no deletions and is annotated in place."""
        mock_log_manager.is_sensitive.return_value = False
        records = [{'url': 'https://example.com', 'valid': True, 'score': 0.8}]
        removed = self.checker.validate_and_process_single_citation(
            self._sample_match(), records, {}, 0)
        assert removed == []
        assert records[0]['is_image'] is False
        assert 'match' in records[0]

    def test_process_single_citation_invalid_data(self):
        """An entry already marked invalid is scheduled for deletion."""
        records = [{'url': 'https://example.com', 'valid': False}]
        removed = self.checker.validate_and_process_single_citation(
            self._sample_match(), records, {}, 0)
        assert 0 in removed
        assert records[0]['is_image'] is False

    def test_process_single_citation_invalid_url(self):
        """A URL that differs from the cited one invalidates the entry."""
        records = [{'url': 'https://different.com', 'valid': True}]
        removed = self.checker.validate_and_process_single_citation(
            self._sample_match(), records, {}, 0)
        assert 0 in removed
        assert records[0]['valid'] is False

    def test_process_single_citation_index_out_of_range(self):
        """An empty datas list makes index 0 raise CustomIndexException."""
        found = self._sample_match()
        with pytest.raises(CustomIndexException):
            self.checker.validate_and_process_single_citation(found, [], {}, 0)

    def test_process_single_paragraph_length_mismatch(self):
        """CustomIndexException from the per-citation step propagates to the caller."""
        failure = CustomIndexException(
            StatusCode.PARAM_CHECK_ERROR_INDEX_OUT_OF_RANGE.code,
            "Index out of range",
        )
        with patch.object(self.checker, 'validate_and_process_single_citation',
                          side_effect=failure):
            with pytest.raises(CustomIndexException):
                self.checker.process_single_paragraph_citations(self._SAMPLE_PARA, [], 0)

    # ------------------------------------------------------------------
    # Pre-processing and inline replacement
    # ------------------------------------------------------------------
    @patch('openjiuwen_deepsearch.algorithm.source_trace.citation_checker_research.LogManager')
    def test_preprocess_text_and_datas_with_logging(self, mock_log_manager):
        """Pre-processing returns the article string and the datas unchanged."""
        mock_log_manager.is_sensitive.return_value = False
        records = [{'url': 'https://example.com', 'valid': True}]
        article, returned = self.checker.preprocess_text_and_citations(
            {'article': "这是一个测试文章"}, records)
        assert article == "这是一个测试文章"
        assert returned == records

    def test_replace_inline_citations_with_image(self):
        """An image citation stays inline and adds nothing to the reference list."""
        # This test deliberately uses its own pattern with a non-capturing "!" group.
        image_pattern = re.compile(
            r'\[source_tracer_result\](?:!)?\[(.*?)\](?:<(.*?)>|\((.*?)\))')
        source_text = "这是一个测试![source_tracer_result][图片](https://image.com)引用。"
        records = [{'url': 'https://image.com', 'valid': True}]
        rewritten, refs, _ = self.checker.replace_inline_citations(
            source_text, records, image_pattern)
        assert '![[' in rewritten
        assert 'https://image.com' in rewritten
        assert refs == OrderedDict()

    def test_replace_inline_citations_assigns_checked_citation_tokens(self):
        """A valid citation becomes a ``checked_citation`` token with id and reference index."""
        source_text = "前缀[source_tracer_result][测试标题](https://test.com)后缀"
        records = [
            {"url": "https://test.com", "title": "测试标题", "valid": True, "score": 0.9},
        ]
        rewritten, refs, returned = self.checker.replace_inline_citations(
            source_text, records, re.compile(self._TRACER_REGEX))
        assert rewritten == "前缀[checked_citation:0][[1]](https://test.com)后缀"
        assert list(refs.items()) == [("https://test.com", ("测试标题", 1))]
        first = returned[0]
        assert first["id"] == 0
        assert first["reference_index"] == 1
        assert "citation_start_offset" not in first
        assert "citation_end_offset" not in first

    # ------------------------------------------------------------------
    # transform_references
    # ------------------------------------------------------------------
    @patch('openjiuwen_deepsearch.algorithm.source_trace.citation_checker_research.LogManager')
    def test_transform_references_with_logging(self, mock_log_manager):
        """``transform_references`` returns a ``(str, list)`` pair."""
        mock_log_manager.is_sensitive.return_value = False
        article_out, records_out = self.checker.transform_references(
            {'article': "这是一个测试文章"},
            [{'url': 'https://example.com', 'valid': True}])
        assert isinstance(article_out, str)
        assert isinstance(records_out, list)

    def test_transform_references_normalizes_multiline_source_tracer_title(self):
        """A title containing a newline is still converted and listed on one line."""
        url = "https://milvus.io/blog/build-smarter-rag-routing-hybrid-retrieval.md"
        title = "Build Smarter RAG with Routing and Hybrid Retrieval\n - Milvus Blog"
        text = {"article": f"混合检索说明[source_tracer_result][{title}]({url})结束。"}
        records = [
            {
                "url": url,
                "title": title,
                "content": "Content 1",
                "valid": True,
                "score": 0.9,
            }
        ]
        article_out, records_out = self.checker.transform_references(text, records)
        assert "[source_tracer_result]" not in article_out
        assert "[checked_citation:0][[1]]" in article_out
        assert "[1]. [Build Smarter RAG with Routing and Hybrid Retrieval - Milvus Blog]" in article_out
        assert records_out[0]["id"] == 0
        assert records_out[0]["reference_index"] == 1

    # ------------------------------------------------------------------
    # Async ``checker`` entry point
    # ------------------------------------------------------------------
    @pytest.mark.asyncio
    async def test_checker_success(self):
        """``checker`` returns JSON with both top-level keys and calls the verifier once."""
        records = [{'url': 'https://example.com', 'valid': True}]
        parsed, verify_stub = await self._check({'article': "这是一个测试文章"}, records)
        assert 'checked_trace_source_report_content' in parsed
        assert 'citation_messages' in parsed
        assert parsed['citation_messages']['code'] == 0
        verify_stub.assert_called_once_with(records)

    @pytest.mark.asyncio
    @patch('openjiuwen_deepsearch.algorithm.source_trace.citation_checker_research.LogManager')
    async def test_checker_with_detailed_logging(self, mock_log_manager):
        """Same result shape when LogManager reports the content as non-sensitive."""
        mock_log_manager.is_sensitive.return_value = False
        records = [{'url': 'https://example.com', 'valid': True}]
        parsed, _ = await self._check({'article': "这是一个测试文章"}, records)
        assert 'checked_trace_source_report_content' in parsed
        assert 'citation_messages' in parsed
        assert parsed['citation_messages']['code'] == 0

    @pytest.mark.asyncio
    async def test_checker_with_verification_exception(self):
        """An exception raised by the verifier propagates out of ``checker``."""
        text = {'article': "这是一个测试文章"}
        records = [{'url': 'https://example.com', 'valid': True}]
        with patch.object(self.checker.citation_verifier, 'run', new_callable=AsyncMock) as verify_stub:
            verify_stub.side_effect = Exception("验证错误")
            with pytest.raises(Exception, match="验证错误"):
                await self.checker.checker(text, records)

    @pytest.mark.asyncio
    async def test_end_to_end_citation_processing(self):
        """Two valid citations are numbered inline and listed in the reference section."""
        text = {
            'article': "这是一个测试文章,包含[source_tracer_result][示例1](https://example1.com)和[source_tracer_result][示例2](https://example2.com)引用。"}
        records = [
            {'url': 'https://example1.com', 'title': '示例1', 'valid': True, 'score': 0.8},
            {'url': 'https://example2.com', 'title': '示例2', 'valid': True, 'score': 0.9},
        ]
        parsed, _ = await self._check(text, records)
        assert 'checked_trace_source_report_content' in parsed
        assert 'citation_messages' in parsed
        messages = parsed['citation_messages']
        assert messages['code'] == 0
        assert len(messages['data']) == 2
        report = parsed['checked_trace_source_report_content']
        assert '[[1]]' in report
        assert '[[2]]' in report
        assert '[1]. [示例1]' in report
        assert '[2]. [示例2]' in report
class TestCitationOffsetTracking:
    """Tests for the datas mapping after citations are replaced by stable tokens."""

    # Inline-citation regex used by both tests in this class.
    _TRACER_REGEX = r'\[source_tracer_result\](!)?\[(.*?)\](?:<(.*?)>|\((.*?)\))'

    @pytest.fixture
    def checker(self):
        """Provide a checker built with a placeholder model name."""
        return CitationCheckerResearch(llm_model="mock_model")

    def test_replace_inline_citations_records_stable_ids(self, checker):
        """Every valid entry gets ``id`` and ``reference_index`` and no offset keys."""
        source_text = "这是一段测试文本[source_tracer_result][标题A](https://a.com)以及更多内容[source_tracer_result][标题B](https://b.com)结束。"
        records = [
            {"url": "https://a.com", "title": "标题A", "valid": True, "score": 0.9},
            {"url": "https://b.com", "title": "标题B", "valid": True, "score": 0.8},
        ]
        rewritten, _, returned = checker.replace_inline_citations(
            source_text, records, re.compile(self._TRACER_REGEX))

        # Only entries still marked valid are expected to carry the new keys.
        for entry in returned:
            if not entry.get("valid", False):
                continue
            assert "id" in entry
            assert "reference_index" in entry
            assert "citation_start_offset" not in entry
            assert "citation_end_offset" not in entry

        expected = (
            "这是一段测试文本[checked_citation:0][[1]](https://a.com)"
            "以及更多内容[checked_citation:1][[2]](https://b.com)结束。"
        )
        assert rewritten == expected

    def test_offsets_are_correct_for_single_citation(self, checker):
        """A single citation gets a stable id and no offset keys are written."""
        source_text = "前缀文本[source_tracer_result][测试标题](https://test.com)后缀文本"
        records = [
            {"url": "https://test.com", "title": "测试标题", "valid": True, "score": 0.9},
        ]
        rewritten, _, returned = checker.replace_inline_citations(
            source_text, records, re.compile(self._TRACER_REGEX))
        assert len(returned) == 1
        assert rewritten == "前缀文本[checked_citation:0][[1]](https://test.com)后缀文本"
        only = returned[0]
        assert only["id"] == 0
        assert only["reference_index"] == 1
        assert "citation_start_offset" not in only
        assert "citation_end_offset" not in only