#include "pdf/pdfium/pdfium_range.h"
#include <string>
#include <utility>
#include "base/check_op.h"
#include "base/containers/span.h"
#include "base/debug/alias.h"
#include "base/numerics/checked_math.h"
#include "base/strings/string_util.h"
#include "pdf/accessibility_structs.h"
#include "pdf/pdfium/pdfium_api_string_buffer_adapter.h"
#include "pdf/pdfium/pdfium_api_wrappers.h"
#include "third_party/pdfium/public/fpdf_searchex.h"
#include "ui/gfx/geometry/point.h"
#include "ui/gfx/geometry/rect.h"
#include "ui/gfx/geometry/rect_f.h"
namespace chrome_pdf {
namespace {
// Normalizes an (index, count) pair so that `count` is non-negative.
//
// A negative `count` describes a range that extends backwards from `index`.
// In that case `count` is replaced by its magnitude and `index` is moved back
// by that many characters, giving the equivalent forward range. Pairs whose
// `count` is already non-negative are left untouched.
void AdjustForBackwardsRange(int& index, int& count) {
  if (count >= 0) {
    return;
  }

  count = -count;
  index -= count;
}
// Bounding-box summary for the part of one text run that falls inside a range.
// Instances are built positionally (see the emplace_back() call in
// PDFiumRange::GetRectsWithTightness()), so the member order matters.
struct PdfRectTextRunInfo {
// Union of the loose character boxes of the characters in the run.
PdfRect pdf_rect;
// Union of the tight character boxes of the same characters. Only filled in
// when the caller asks for PdfBoundsTightness::kTightVertical; otherwise it
// stays default-constructed.
PdfRect tight_pdf_rect;
// Number of characters that contributed to the rects above.
size_t char_count;
};
float GetVerticalOverlap(const PdfRect& rect1, const PdfRect& rect2) {
CHECK(!rect1.IsEmpty());
CHECK(!rect2.IsEmpty());
PdfRect union_rect = rect1;
union_rect.Union(rect2);
if (union_rect.height() == rect1.height() ||
union_rect.height() == rect2.height()) {
return 1.0f;
}
PdfRect intersect_rect = rect1;
intersect_rect.Intersect(rect2);
return intersect_rect.height() / union_rect.height();
}
bool ShouldMergeHorizontalRects(const PdfRectTextRunInfo& text_run1,
const PdfRectTextRunInfo& text_run2) {
static constexpr float kVerticalOverlapThreshold = 0.8f;
const PdfRect& rect1 = text_run1.pdf_rect;
const PdfRect& rect2 = text_run2.pdf_rect;
if (GetVerticalOverlap(rect1, rect2) < kVerticalOverlapThreshold) {
return false;
}
static constexpr float kHorizontalWidthFactor = 1.0f;
const float average_width1 =
kHorizontalWidthFactor * rect1.width() / text_run1.char_count;
const float average_width2 =
kHorizontalWidthFactor * rect2.width() / text_run2.char_count;
const float rect1_left = rect1.left() - average_width1;
const float rect1_right = rect1.right() + average_width1;
const float rect2_left = rect2.left() - average_width2;
const float rect2_right = rect2.right() + average_width2;
return rect1_left < rect2_right && rect1_right > rect2_left;
}
// Collapses consecutive text runs into as few rects as possible.
//
// Walks `text_runs` in order. A run is folded into the rect being accumulated
// when ShouldMergeHorizontalRects() accepts it together with the run right
// before it; otherwise the accumulated rect is emitted and a new one starts.
// With PdfBoundsTightness::kTightVertical, each run contributes its loose rect
// with the top and bottom edges replaced by those of its tight rect. The merge
// decision itself always uses the loose rects.
std::vector<PdfRect> MergeAdjacentRects(
    base::span<PdfRectTextRunInfo> text_runs,
    PDFiumRange::PdfBoundsTightness tightness) {
  const bool use_tight_vertical =
      tightness == PDFiumRange::PdfBoundsTightness::kTightVertical;

  std::vector<PdfRect> merged;
  PdfRect pending;
  const PdfRectTextRunInfo* prior_run = nullptr;
  for (const PdfRectTextRunInfo& run : text_runs) {
    PdfRect run_rect = run.pdf_rect;
    if (use_tight_vertical) {
      *run_rect.writable_bottom() = run.tight_pdf_rect.bottom();
      *run_rect.writable_top() = run.tight_pdf_rect.top();
    }

    if (!prior_run) {
      // First run: it seeds the accumulated rect.
      pending = run_rect;
    } else if (ShouldMergeHorizontalRects(*prior_run, run)) {
      pending.Union(run_rect);
    } else {
      merged.push_back(pending);
      pending = run_rect;
    }
    prior_run = &run;
  }

  // Flush whatever is still being accumulated, unless it is empty.
  if (!pending.IsEmpty()) {
    merged.push_back(pending);
  }
  return merged;
}
}
// Returns true if `c` is one of the characters that get stripped from
// extracted text: the zero-width space or the PDF soft hyphen marker.
bool IsIgnorableCharacter(char16_t c) {
  if (c == kZeroWidthSpace) {
    return true;
  }
  return c == kPDFSoftHyphenMarker;
}
// Builds a range that starts at character 0 of `page` and spans
// `page->GetCharCount()` characters.
PDFiumRange PDFiumRange::AllTextOnPage(PDFiumPage* page) {
  const int total_chars = page->GetCharCount();
  return PDFiumRange(page, /*char_index=*/0, total_chars);
}
// Builds a range covering `char_count` characters starting at `char_index`,
// but stored in backwards form: the stored index is moved to
// `char_index + char_count` and the stored count is negated.
// `char_count` must be non-negative; a count of 0 yields a plain empty range.
PDFiumRange PDFiumRange::CreateBackwards(PDFiumPage* page,
                                         int char_index,
                                         int char_count) {
  CHECK_GE(char_count, 0);

  // Construct in forward form first so the constructor's checks see a
  // forward range, then flip the direction in place.
  PDFiumRange range(page, char_index, char_count);
  if (char_count != 0) {
    range.char_index_ = char_index + char_count;
    range.char_count_ = -char_count;
  }
  return range;
}
// Creates a range on `page` starting at `char_index` and spanning
// `char_count` characters. A negative `char_count` denotes a backwards range.
// `page` must not be null.
PDFiumRange::PDFiumRange(PDFiumPage* page, int char_index, int char_count)
    : page_unload_preventer_(page),
      page_(page),
      char_index_(char_index),
      char_count_(char_count) {
  DCHECK(page_);

  // The text page is requested in every build configuration; only DCHECK
  // builds go on to use the handle.
  [[maybe_unused]] FPDF_TEXTPAGE text_page = page_->GetTextPage();
#if DCHECK_IS_ON()
  // Sanity check: the range cannot be longer than the page's text.
  int forward_index = char_index;
  int forward_count = char_count;
  AdjustForBackwardsRange(forward_index, forward_count);
  DCHECK_LE(forward_count, FPDFText_CountChars(text_page));
#endif
}
// Copy, move and destruction all use the compiler-generated member-wise
// behavior; they are defined out of line here rather than in the header.
PDFiumRange::PDFiumRange(const PDFiumRange&) = default;
PDFiumRange& PDFiumRange::operator=(const PDFiumRange&) = default;
PDFiumRange::PDFiumRange(PDFiumRange&&) noexcept = default;
PDFiumRange& PDFiumRange::operator=(PDFiumRange&&) noexcept = default;
PDFiumRange::~PDFiumRange() = default;
// Changes the number of characters in the range. A negative `char_count`
// denotes a backwards range. Does nothing if the count is unchanged;
// otherwise the cache key used by GetScreenRects() is reset.
void PDFiumRange::SetCharCount(int char_count) {
  if (char_count != char_count_) {
    char_count_ = char_count;

#if DCHECK_IS_ON()
    // Sanity check: the range cannot be longer than the page's text. Only the
    // normalized count matters here, so the index is a throwaway.
    int unused_index = 0;
    int forward_count = char_count;
    AdjustForBackwardsRange(unused_index, forward_count);
    DCHECK_LE(forward_count, FPDFText_CountChars(page_->GetTextPage()));
#endif

    // Reset the cache key so the next GetScreenRects() call recomputes.
    cached_screen_rects_point_ = gfx::Point();
    cached_screen_rects_zoom_ = 0;
  }
}
// Returns the range's rects converted to screen coordinates for the given
// `point`, `zoom` and `orientation`.
//
// The result is cached and reused while `point` and `zoom` match the previous
// call. The returned reference points at the cache, so it is overwritten by
// the next call that recomputes.
//
// NOTE(review): `orientation` is not part of the cache key, so a call that
// changes only the orientation gets the previously computed rects - confirm
// that callers never rely on that case.
const std::vector<gfx::Rect>& PDFiumRange::GetScreenRects(
    const gfx::Point& point,
    double zoom,
    PageOrientation orientation) const {
  const bool cache_is_current =
      point == cached_screen_rects_point_ && zoom == cached_screen_rects_zoom_;
  if (!cache_is_current) {
    cached_screen_rects_.clear();
    cached_screen_rects_point_ = point;
    cached_screen_rects_zoom_ = zoom;

    const std::vector<PdfRect> page_rects = GetRects();
    cached_screen_rects_.reserve(page_rects.size());
    for (const PdfRect& page_rect : page_rects) {
      cached_screen_rects_.push_back(
          page_->PageToScreen(point, zoom, page_rect, orientation));
    }
  }
  return cached_screen_rects_;
}
std::vector<PdfRect> PDFiumRange::GetRects() const {
return GetRectsWithTightness(PdfBoundsTightness::kLoose);
}
// Returns the rects covering the range's characters, in page coordinates.
//
// The range is processed one text run at a time. For every run, the loose
// character boxes of the characters inside the range are unioned (and, for
// PdfBoundsTightness::kTightVertical, the tight character boxes as well).
// The per-run rects are then combined by MergeAdjacentRects().
//
// Returns an empty vector for an empty range or when the page has no text
// page. CHECK-fails if the normalized start index is outside the page's text.
std::vector<PdfRect> PDFiumRange::GetRectsWithTightness(
    PdfBoundsTightness tightness) const {
  if (char_count_ == 0) {
    return {};
  }

  FPDF_TEXTPAGE text_page = page_->GetTextPage();
  if (!text_page) {
    return {};
  }

  // Stack copies of the raw members, passed to base::debug::Alias() -
  // presumably so they remain visible in crash dumps if a CHECK below fires.
  const int char_index_debug = char_index_;
  const int char_count_debug = char_count_;
  base::debug::Alias(&char_index_debug);
  base::debug::Alias(&char_count_debug);

  // Work on a forward (non-negative count) view of the range.
  int run_start = char_index_;
  int forward_count = char_count_;
  AdjustForBackwardsRange(run_start, forward_count);
  CHECK_GE(run_start, 0) << " start: " << char_index_
                         << " count: " << char_count_;
  CHECK_LT(run_start, FPDFText_CountChars(text_page))
      << " start: " << char_index_ << " count: " << char_count_;

  const bool want_tight_boxes =
      tightness == PdfBoundsTightness::kTightVertical;
  const int range_end = run_start + forward_count;

  std::vector<PdfRectTextRunInfo> text_runs;
  bool reached_end = false;
  do {
    std::optional<AccessibilityTextRunInfo> run_info =
        page_->GetTextRunInfoAt(run_start);
    CHECK(run_info.has_value());

    // One past the last character of this run, computed with overflow
    // checking.
    base::CheckedNumeric<uint32_t> checked_run_end = run_info->start_index;
    checked_run_end += run_info->len;
    int run_end;
    CHECK(checked_run_end.AssignIfValid(&run_end));

    // Clamp the final run to the end of the range.
    if (run_end >= range_end) {
      reached_end = true;
      run_end = range_end;
    }

    PdfRect loose_bounds;
    PdfRect tight_bounds;
    for (int char_pos = run_start; char_pos < run_end; ++char_pos) {
      PdfRect loose_box;
      const bool got_rect = FPDFText_GetLooseCharBox(
          text_page, char_pos, &FsRectFFromPdfRect(loose_box));
      CHECK(got_rect);
      loose_bounds.Union(loose_box);

      if (want_tight_boxes) {
        tight_bounds.Union(GetTextCharBox(text_page, char_pos).value());
      }
    }

    // Runs whose loose bounds are empty contribute nothing.
    if (!loose_bounds.IsEmpty()) {
      text_runs.emplace_back(loose_bounds, tight_bounds, run_end - run_start);
    }

    run_start = run_end;
  } while (!reached_end);

  return MergeAdjacentRects(text_runs, tightness);
}
std::u16string PDFiumRange::GetText() const {
if (char_count_ == 0) {
return std::u16string();
}
int index = char_index_;
int count = char_count_;
AdjustForBackwardsRange(index, count);
CHECK_GT(count, 0);
std::u16string result;
{
PDFiumAPIStringBufferAdapter<std::u16string> api_string_adapter(
&result, count + 1, false);
unsigned short* data =
reinterpret_cast<unsigned short*>(api_string_adapter.GetData());
int written = FPDFText_GetText(page_->GetTextPage(), index, count, data);
DCHECK_GE(written, 0);
api_string_adapter.Close(written);
}
const gfx::RectF page_bounds = page_->GetCroppedRect();
std::u16string in_bound_text;
in_bound_text.reserve(result.size());
CHECK_GE(static_cast<size_t>(count), result.size());
size_t trimmed_count = static_cast<size_t>(count) - result.size();
int index_offset = index;
while (trimmed_count) {
if (FPDFText_GetTextIndexFromCharIndex(page_->GetTextPage(),
index_offset) >= 0) {
break;
}
--trimmed_count;
++index_offset;
}
for (size_t i = 0; i < result.size(); ++i) {
if (page_->IsCharInPageBounds(index_offset + i, page_bounds)) {
in_bound_text += result[i];
}
}
result = std::move(in_bound_text);
std::erase_if(result, IsIgnorableCharacter);
return result;
}
}