#include "pdf/pdfium/pdfium_text_fragment_finder.h"
#include <optional>
#include <string>
#include <vector>
#include "base/containers/span.h"
#include "base/functional/bind.h"
#include "base/memory/raw_ptr.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "components/shared_highlighting/core/common/text_fragment.h"
#include "pdf/pdfium/pdfium_engine.h"
#include "pdf/pdfium/pdfium_range.h"
namespace chrome_pdf {
namespace {
void AddTextFragmentPrefixResult(
std::vector<PDFiumRange>& text_fragment_prefixes,
PDFiumRange prefix_result) {
text_fragment_prefixes.emplace_back(std::move(prefix_result));
}
// Search callback used while looking for a fragment's suffix text.
//
// Stores `suffix_result` in `text_fragment_suffix` when no suffix has been
// stored yet and every character between the end of `before_suffix_range` and
// the start of `suffix_result` is Unicode whitespace (an empty gap also
// qualifies). Otherwise `text_fragment_suffix` is left untouched.
void AddTextFragmentSuffixResult(
    PDFiumEngine* engine,
    std::optional<PDFiumRange>& text_fragment_suffix,
    const PDFiumRange& before_suffix_range,
    PDFiumRange suffix_result) {
  // Only the first accepted match is kept.
  if (text_fragment_suffix.has_value()) {
    return;
  }

  // The gap is the run of characters separating the preceding range from this
  // candidate, on the page of the preceding range.
  const int gap_begin =
      before_suffix_range.char_index() + before_suffix_range.char_count();
  const int gap_length = suffix_result.char_index() - gap_begin;
  const PDFiumRange gap(engine->GetPage(before_suffix_range.page_index()),
                        gap_begin, gap_length);

  bool gap_is_whitespace_only = true;
  for (const auto& ch : gap.GetText()) {
    if (!base::IsUnicodeWhitespace(ch)) {
      gap_is_whitespace_only = false;
      break;
    }
  }

  if (gap_is_whitespace_only) {
    text_fragment_suffix = std::move(suffix_result);
  }
}
// Searches for `fragment`'s suffix text on the page of `end_range`, starting
// at the first character past `end_range`.
//
// Returns the first reported match that AddTextFragmentSuffixResult() accepts
// (only whitespace between `end_range` and the match), or std::nullopt when no
// match is accepted.
std::optional<PDFiumRange> FindTextFragmentSuffix(
    PDFiumEngine* engine,
    const shared_highlighting::TextFragment& fragment,
    const PDFiumRange& end_range) {
  std::optional<PDFiumRange> suffix_match;
  const int first_char_after_range =
      end_range.char_index() + end_range.char_count();

  // `suffix_match` and `end_range` are bound by reference; the result is read
  // right after the call, so the callback is expected to have run by then.
  // NOTE(review): the -1 argument presumably means "no upper bound" - confirm
  // against PDFiumEngine::SearchForFragment().
  engine->SearchForFragment(
      base::UTF8ToUTF16(fragment.suffix()), first_char_after_range, -1,
      end_range.page_index(),
      base::BindRepeating(&AddTextFragmentSuffixResult, engine,
                          std::ref(suffix_match), std::ref(end_range)));
  return suffix_match;
}
// Search callback used while looking for a fragment's start text.
//
// Appends `start_result` to `text_fragment_starts` when it passes every check
// that applies:
//   * If `prefix_range` is set, `start_result` must begin at or after the end
//     of the prefix, with nothing but Unicode whitespace in between.
//   * If `fragment` has no end text but does have a suffix, that suffix must
//     be found right after `start_result`; the match is written to
//     `text_fragment_suffix` (which is reset to std::nullopt on failure).
// A candidate that fails a check is dropped silently.
void AddTextFragmentStartResult(
    PDFiumEngine* engine,
    std::vector<PDFiumRange>& text_fragment_starts,
    std::optional<PDFiumRange>& text_fragment_suffix,
    const shared_highlighting::TextFragment& fragment,
    std::optional<const PDFiumRange> prefix_range,
    PDFiumRange start_result) {
  if (prefix_range) {
    const int prefix_end =
        prefix_range->char_index() + prefix_range->char_count();
    const int boundary_count = start_result.char_index() - prefix_end;

    // A negative gap means the candidate begins before the prefix ends, so it
    // cannot be the text that follows the prefix. Reject it here rather than
    // building a range with a negative character count and depending on how
    // PDFiumRange::GetText() treats such a range.
    if (boundary_count < 0) {
      return;
    }

    const auto boundary =
        PDFiumRange(engine->GetPage(start_result.page_index()), prefix_end,
                    boundary_count);
    for (const auto& c : boundary.GetText()) {
      if (!base::IsUnicodeWhitespace(c)) {
        return;
      }
    }
  }

  // Without end text the start match is also the end of the highlighted text,
  // so the suffix (if any) has to follow the start match directly.
  if (fragment.text_end().empty() && !fragment.suffix().empty()) {
    text_fragment_suffix =
        FindTextFragmentSuffix(engine, fragment, start_result);
    if (!text_fragment_suffix) {
      return;
    }
  }

  text_fragment_starts.emplace_back(std::move(start_result));
}
// Search callback used while looking for a fragment's end text.
//
// Stores `end_result` in `text_fragment_end` unless an end has already been
// stored. When `fragment` has a suffix, the suffix must be found right after
// `end_result`; the lookup result is written to `text_fragment_suffix`, and a
// failed lookup causes `end_result` to be dropped.
void AddTextFragmentEndResult(PDFiumEngine* engine,
                              std::optional<PDFiumRange>& text_fragment_end,
                              std::optional<PDFiumRange>& text_fragment_suffix,
                              const shared_highlighting::TextFragment& fragment,
                              PDFiumRange end_result) {
  // Only the first accepted end match is kept.
  if (text_fragment_end.has_value()) {
    return;
  }

  const bool requires_suffix = !fragment.suffix().empty();
  if (requires_suffix) {
    text_fragment_suffix = FindTextFragmentSuffix(engine, fragment, end_result);
  }

  const bool suffix_satisfied =
      !requires_suffix || text_fragment_suffix.has_value();
  if (suffix_satisfied) {
    text_fragment_end = std::move(end_result);
  }
}
}
// Stores `engine` in `engine_`. The search methods of this class query page
// counts and run text searches through that pointer.
PDFiumTextFragmentFinder::PDFiumTextFragmentFinder(PDFiumEngine* engine)
: engine_(engine) {}
// Defaulted destructor: no work beyond destroying the data members.
PDFiumTextFragmentFinder::~PDFiumTextFragmentFinder() = default;
// Runs a text-fragment search for every entry of `text_fragments` and returns
// the ranges to highlight, in the order the fragments were given.
//
// Each entry is parsed with TextFragment::FromEscapedString(). An entry that
// does not parse is skipped and contributes no highlight; a fragment whose
// text is not found likewise contributes nothing. Results of any earlier call
// are discarded first.
std::vector<PDFiumRange> PDFiumTextFragmentFinder::FindTextFragments(
    base::span<const std::string> text_fragments) {
  text_fragment_highlights_.clear();

  for (const std::string& fragment : text_fragments) {
    const auto text_fragment =
        shared_highlighting::TextFragment::FromEscapedString(fragment);
    // Parsing reports failure through an empty optional. Treat that as
    // "nothing to highlight" for this entry instead of aborting the process
    // on a malformed string.
    if (!text_fragment.has_value()) {
      continue;
    }
    StartTextFragmentSearch(text_fragment.value());
  }

  // Hand the accumulated ranges to the caller without copying them; the
  // member is cleared again at the top of the next call.
  return std::move(text_fragment_highlights_);
}
// Entry point for searching a single parsed `fragment`.
//
// Resets all per-fragment search state, then starts with the prefix search
// when the fragment has a prefix and with the start-text search otherwise.
// Does nothing further when the document has no pages.
void PDFiumTextFragmentFinder::StartTextFragmentSearch(
    const shared_highlighting::TextFragment& fragment) {
  // Forget everything left over from the previous fragment.
  text_fragment_suffix_.reset();
  text_fragment_end_.reset();
  text_fragment_starts_.clear();
  text_fragment_prefixes_.clear();
  last_unsearched_page_ = 0;

  if (engine_->GetNumberOfPages() == 0) {
    return;
  }

  if (fragment.prefix().empty()) {
    FindTextFragmentStart(fragment);
  } else {
    FindTextFragmentPrefix(fragment, 0);
  }
}
// Searches for `fragment`'s prefix text page by page, beginning at
// `page_to_start_search_from`.
//
// Stops at the first page that yields at least one prefix match, leaving
// those matches in `text_fragment_prefixes_`, and continues with the
// start-text search. `last_unsearched_page_` is kept one past the page being
// searched so a later retry can resume from there. If no page has a match,
// the function returns with `text_fragment_prefixes_` empty.
void PDFiumTextFragmentFinder::FindTextFragmentPrefix(
    const shared_highlighting::TextFragment& fragment,
    int page_to_start_search_from) {
  text_fragment_prefixes_.clear();
  const std::u16string prefix_text = base::UTF8ToUTF16(fragment.prefix());

  int page = page_to_start_search_from;
  while (page < engine_->GetNumberOfPages()) {
    last_unsearched_page_ = page + 1;

    // Search the whole page from character 0.
    // NOTE(review): the -1 argument presumably means "no upper bound" -
    // confirm against PDFiumEngine::SearchForFragment().
    engine_->SearchForFragment(
        prefix_text, 0, -1, page,
        base::BindRepeating(&AddTextFragmentPrefixResult,
                            std::ref(text_fragment_prefixes_)));

    if (!text_fragment_prefixes_.empty()) {
      FindTextFragmentStart(fragment);
      return;
    }
    ++page;
  }
}
// Searches for `fragment`'s start text and, when a candidate is accepted,
// continues with the end-text search.
//
// With prefix matches available, each prefix is tried in turn on its own
// page; AddTextFragmentStartResult() accepts only candidates that follow that
// prefix. If none of the prefixes yields a start, the prefix search is retried
// from `last_unsearched_page_` while pages remain.
//
// Without prefix matches, every page is searched from its first character and
// all accepted candidates are collected before moving on.
void PDFiumTextFragmentFinder::FindTextFragmentStart(
    const shared_highlighting::TextFragment& fragment) {
  text_fragment_starts_.clear();
  const std::u16string start_text = base::UTF8ToUTF16(fragment.text_start());

  if (!text_fragment_prefixes_.empty()) {
    for (const PDFiumRange& prefix_range : text_fragment_prefixes_) {
      engine_->SearchForFragment(
          start_text, prefix_range.char_index(), -1, prefix_range.page_index(),
          base::BindRepeating(&AddTextFragmentStartResult, engine_,
                              std::ref(text_fragment_starts_),
                              std::ref(text_fragment_suffix_), fragment,
                              prefix_range));
      if (!text_fragment_starts_.empty()) {
        // The end search may restart the prefix search and rewrite
        // `text_fragment_prefixes_`, so leave the loop right after it.
        FindTextFragmentEnd(fragment);
        return;
      }
    }

    // No prefix on this page was followed by the start text; move on to the
    // remaining pages, if any.
    const bool should_retry_prefix =
        text_fragment_starts_.empty() && !fragment.prefix().empty() &&
        last_unsearched_page_ < engine_->GetNumberOfPages();
    if (should_retry_prefix) {
      FindTextFragmentPrefix(fragment, last_unsearched_page_);
    }
    return;
  }

  // No prefix constraint: scan every page from character 0.
  int page = 0;
  while (page < engine_->GetNumberOfPages()) {
    engine_->SearchForFragment(
        start_text, 0, -1, page,
        base::BindRepeating(&AddTextFragmentStartResult, engine_,
                            std::ref(text_fragment_starts_),
                            std::ref(text_fragment_suffix_), fragment,
                            std::nullopt));
    ++page;
  }

  if (!text_fragment_starts_.empty()) {
    FindTextFragmentEnd(fragment);
  }
}
// Searches for `fragment`'s end text after each collected start candidate.
//
// When the fragment has no end text, the search is finished immediately.
// Otherwise each start in `text_fragment_starts_` is tried in order, searching
// its page from the first character past the start match. The first start
// for which AddTextFragmentEndResult() accepts an end becomes the only entry
// of `text_fragment_starts_` and the search is finished. If no start leads to
// an end and the fragment has a prefix, the prefix search is retried from
// `last_unsearched_page_` while pages remain.
void PDFiumTextFragmentFinder::FindTextFragmentEnd(
    const shared_highlighting::TextFragment& fragment) {
  const bool has_end_text = !fragment.text_end().empty();
  if (!has_end_text) {
    FinishTextFragmentSearch();
    return;
  }

  text_fragment_end_.reset();
  const std::u16string end_text = base::UTF8ToUTF16(fragment.text_end());

  for (const PDFiumRange& start_range : text_fragment_starts_) {
    const int first_char_after_start =
        start_range.char_index() + start_range.char_count();
    engine_->SearchForFragment(
        end_text, first_char_after_start, -1, start_range.page_index(),
        base::BindRepeating(&AddTextFragmentEndResult, engine_,
                            std::ref(text_fragment_end_),
                            std::ref(text_fragment_suffix_), fragment));
    if (!text_fragment_end_.has_value()) {
      continue;
    }

    // `start_range` refers into `text_fragment_starts_`, so copy it out before
    // the vector is rewritten to hold just this start.
    PDFiumRange matched_start = start_range;
    text_fragment_starts_.clear();
    text_fragment_starts_.push_back(std::move(matched_start));
    FinishTextFragmentSearch();
    return;
  }

  const bool should_retry_prefix =
      !text_fragment_end_.has_value() && !fragment.prefix().empty() &&
      last_unsearched_page_ < engine_->GetNumberOfPages();
  if (should_retry_prefix) {
    FindTextFragmentPrefix(fragment, last_unsearched_page_);
  }
}
void PDFiumTextFragmentFinder::FinishTextFragmentSearch() {
if (text_fragment_starts_.empty()) {
return;
}
PDFiumRange highlight = text_fragment_starts_[0];
if (text_fragment_end_) {
CHECK_GT(text_fragment_end_->char_index(), highlight.char_index());
base::CheckedNumeric<int> new_char_count = text_fragment_end_->char_index();
new_char_count -= highlight.char_index();
new_char_count += text_fragment_end_->char_count();
highlight.SetCharCount(new_char_count.ValueOrDie());
}
text_fragment_highlights_.emplace_back(std::move(highlight));
}
}