#include "chrome/services/speech/speech_timestamp_estimator.h"
#include <algorithm>
namespace speech {
namespace {
// File-local shorthands for the nested type names declared on
// SpeechTimestampEstimator, so the helpers in this unnamed namespace can
// refer to them unqualified.
// NOTE(review): the code below uses `.value()`, `operator*` and `operator->`
// on the three timestamp/duration types and builds one from
// `base::Seconds(0)`, so they look like strongly-typed wrappers around a time
// delta -- confirm against the header.
using MediaTimestamp = SpeechTimestampEstimator::MediaTimestamp;
using SpeechTimestamp = SpeechTimestampEstimator::SpeechTimestamp;
using PlaybackDuration = SpeechTimestampEstimator::PlaybackDuration;
using MediaRanges = SpeechTimestampEstimator::MediaRanges;
using PlaybackChunk = SpeechTimestampEstimator::PlaybackChunk;
// Returns the span between two speech timestamps as a PlaybackDuration.
// CHECKs that `start` is strictly before `end`, so the result is never zero
// or negative.
PlaybackDuration CalculateDuration(SpeechTimestamp start, SpeechTimestamp end) {
  CHECK_LT(start, end);
  const auto elapsed = end.value() - start.value();
  return PlaybackDuration(elapsed);
}
// Returns `timestamp` moved forward by `duration`.
// CHECKs that `duration` is positive; the input timestamp is taken by value
// and is not modified.
MediaTimestamp IncreaseTimestamp(MediaTimestamp timestamp,
                                 PlaybackDuration duration) {
  CHECK(duration->is_positive());
  const auto advanced = timestamp.value() + duration.value();
  return MediaTimestamp(advanced);
}
}
// Construction and destruction need no custom logic in this file; both are
// the compiler-generated defaults, defined out of line here.
SpeechTimestampEstimator::SpeechTimestampEstimator() = default;
SpeechTimestampEstimator::~SpeechTimestampEstimator() = default;
// Creates a chunk anchored at `media_start` on the media timeline and at
// `current_speech_time` on the speech timeline.
SpeechTimestampEstimator::PlaybackChunk::PlaybackChunk(
MediaTimestamp media_start,
SpeechTimestamp current_speech_time)
// Only the two start timestamps are initialized here. `playback_duration` is
// left to whatever its declaration provides.
// NOTE(review): callers rely on a new chunk having SpeechEnd() equal to its
// speech start, so the declaration presumably defaults the duration to zero --
// confirm in the header.
: media_start(media_start), speech_start(current_speech_time) {}
// Removes `duration` from the beginning of this chunk.
// Both start timestamps move forward by `duration` and `playback_duration`
// shrinks by the same amount, so MediaEnd() and SpeechEnd() are unaffected.
// CHECKs that `duration` does not exceed the chunk's current duration.
void SpeechTimestampEstimator::PlaybackChunk::TrimStart(
    PlaybackDuration duration) {
  CHECK_LE(duration, playback_duration);

  const auto trimmed = duration.value();
  *playback_duration -= trimmed;
  *speech_start += trimmed;
  *media_start += trimmed;
}
// Extends this chunk by `duration`. The start timestamps are untouched, so
// only the end of the chunk moves.
void SpeechTimestampEstimator::PlaybackChunk::AddDuration(
    PlaybackDuration duration) {
  const auto extension = duration.value();
  *playback_duration += extension;
}
// Returns the media timestamp at which this chunk ends, i.e. its media start
// plus its playback duration.
MediaTimestamp SpeechTimestampEstimator::PlaybackChunk::MediaEnd() const {
  const auto end_time = media_start.value() + playback_duration.value();
  return MediaTimestamp(end_time);
}
// Returns the speech timestamp at which this chunk ends, i.e. its speech start
// plus its playback duration.
SpeechTimestamp SpeechTimestampEstimator::PlaybackChunk::SpeechEnd() const {
  const auto end_time = speech_start.value() + playback_duration.value();
  return SpeechTimestamp(end_time);
}
// Records the start of a new stretch of playback.
// Appends a fresh chunk that begins at `media_start_pts` on the media timeline
// and at the current speech time on the speech timeline, and discards any
// silence duration that had been accumulated so far.
void SpeechTimestampEstimator::AddPlaybackStart(
    MediaTimestamp media_start_pts) {
  // Pending silence is not applied to the new chunk: its media start is given
  // explicitly by the caller.
  running_silence_duration_.reset();

  playback_chunks_.emplace_back(media_start_pts, current_speech_time_);
}
// Advances the speech timeline by `duration` (CHECKed to be positive).
// If silence was accumulated since the last append, a new chunk is started
// first so the skipped media time is reflected. When no chunk exists, only the
// running speech time is advanced.
void SpeechTimestampEstimator::AppendDuration(PlaybackDuration duration) {
  CHECK(duration->is_positive());

  // Must happen before `current_speech_time_` moves: the new chunk has to
  // start at the speech time that precedes this duration.
  if (running_silence_duration_.has_value()) {
    AdjustLastMediaTimestampForSilence(current_speech_time_);
  }

  *current_speech_time_ += duration.value();

  if (!playback_chunks_.empty()) {
    playback_chunks_.back().AddDuration(duration);
  }
}
// Accumulates `duration` (CHECKed to be positive) into the running silence
// total. The call is a no-op while there are no playback chunks.
void SpeechTimestampEstimator::OnSilentMediaDropped(PlaybackDuration duration) {
  CHECK(duration->is_positive());

  if (playback_chunks_.empty()) {
    return;
  }

  if (running_silence_duration_.has_value()) {
    // Already tracking silence: extend the running total.
    *(running_silence_duration_.value()) += duration.value();
  } else {
    // First silence since the total was last cleared.
    running_silence_duration_ = duration;
  }
}
// Starts a new chunk that accounts for the accumulated silence.
// The new chunk's media start is the previous chunk's media end advanced by
// the running silence total; its speech start is `current_speech_time`. The
// running silence total is cleared afterwards.
// CHECKs that at least one chunk exists and that silence has been accumulated.
void SpeechTimestampEstimator::AdjustLastMediaTimestampForSilence(
    SpeechTimestamp current_speech_time) {
  CHECK(!playback_chunks_.empty());
  CHECK(running_silence_duration_.has_value());

  // Read both inputs before the container is modified.
  const MediaTimestamp previous_media_end = playback_chunks_.back().MediaEnd();
  const auto skipped_silence = running_silence_duration_.value();
  const MediaTimestamp first_audible_timestamp =
      IncreaseTimestamp(previous_media_end, skipped_silence);

  playback_chunks_.emplace_back(first_audible_timestamp, current_speech_time);
  running_silence_duration_.reset();
}
// Discards everything in `chunks` that lies before `end_timestamp` on the
// speech timeline.
// Chunks whose SpeechEnd() is earlier than `end_timestamp` are removed
// entirely; the chunk left at the front is trimmed if it starts before
// `end_timestamp`. `chunks` is never left empty.
// CHECKed preconditions: `chunks` is non-empty, `end_timestamp` does not
// exceed `current_speech_time_`, and the last chunk ends exactly at
// `current_speech_time_`.
void SpeechTimestampEstimator::PopFrontUntil(
    base::circular_deque<PlaybackChunk>& chunks,
    SpeechTimestamp end_timestamp) {
  CHECK(!chunks.empty());
  CHECK_LE(end_timestamp, current_speech_time_);
  CHECK_EQ(chunks.back().SpeechEnd(), current_speech_time_);

  // The preconditions above guarantee the last chunk's SpeechEnd() is not
  // less than `end_timestamp`, so this loop stops before the deque empties.
  while (chunks.front().SpeechEnd() < end_timestamp) {
    chunks.pop_front();
  }
  CHECK(!chunks.empty());

  // The surviving front chunk may straddle `end_timestamp`; cut off the part
  // that precedes it.
  PlaybackChunk& head = chunks.front();
  if (head.speech_start < end_timestamp) {
    head.TrimStart(CalculateDuration(head.speech_start, end_timestamp));
  }

  CHECK(!chunks.empty());
}
// Removes the part of `chunks` that lies before `end_timestamp` on the speech
// timeline and returns it, in order.
// Chunks whose SpeechEnd() is earlier than `end_timestamp` are moved into the
// result (zero-duration chunks are dropped instead). If the chunk left at the
// front starts before `end_timestamp`, the result also receives a copy of it
// shortened to end at `end_timestamp`, and the original is trimmed by the same
// amount. `chunks` is never left empty.
// CHECKed preconditions: `chunks` is non-empty, `end_timestamp` does not
// exceed `current_speech_time_`, and the last chunk ends exactly at
// `current_speech_time_`.
std::vector<SpeechTimestampEstimator::PlaybackChunk>
SpeechTimestampEstimator::TakeFrontUntil(
    base::circular_deque<PlaybackChunk>& chunks,
    SpeechTimestamp end_timestamp) {
  CHECK(!chunks.empty());
  CHECK_LE(end_timestamp, current_speech_time_);
  CHECK_EQ(chunks.back().SpeechEnd(), current_speech_time_);

  std::vector<PlaybackChunk> results;

  // The preconditions above guarantee the last chunk's SpeechEnd() is not
  // less than `end_timestamp`, so this loop stops before the deque empties.
  while (chunks.front().SpeechEnd() < end_timestamp) {
    PlaybackChunk& finished = chunks.front();
    const bool is_empty_chunk = finished.playback_duration->is_zero();
    if (!is_empty_chunk) {
      results.push_back(std::move(finished));
    }
    chunks.pop_front();
  }
  CHECK(!chunks.empty());

  // Split a front chunk that straddles `end_timestamp`: the leading part goes
  // to the caller, the remainder stays in `chunks`.
  PlaybackChunk& head = chunks.front();
  if (head.speech_start < end_timestamp) {
    const PlaybackDuration taken =
        CalculateDuration(head.speech_start, end_timestamp);

    PlaybackChunk leading_part = head;
    leading_part.playback_duration = taken;
    results.push_back(std::move(leading_part));

    head.TrimStart(taken);
  }

  CHECK(!chunks.empty());
  return results;
}
// Returns the media ranges that correspond to the speech interval
// [`start`, `end`) and removes that interval, and everything before it, from
// `playback_chunks_`.
// Both bounds are clamped to [0, `current_speech_time_`]. An empty result is
// returned when there are no chunks, when `start` is not before `end`, or when
// the clamped bounds coincide.
MediaRanges SpeechTimestampEstimator::TakeTimestampsInRange(
    SpeechTimestamp start,
    SpeechTimestamp end) {
  if (playback_chunks_.empty() || start >= end) {
    return MediaRanges();
  }

  constexpr auto kSpeechTimeZero = SpeechTimestamp(base::Seconds(0));
  const SpeechTimestamp clamped_start =
      std::clamp(start, kSpeechTimeZero, current_speech_time_);
  const SpeechTimestamp clamped_end =
      std::clamp(end, kSpeechTimeZero, current_speech_time_);

  // Clamping can collapse the interval, e.g. when both bounds lie beyond the
  // current speech time.
  if (clamped_start == clamped_end) {
    return MediaRanges();
  }

  // Drop what precedes the interval, then extract the interval itself.
  PopFrontUntil(playback_chunks_, clamped_start);
  const auto taken = TakeFrontUntil(playback_chunks_, clamped_end);

  CHECK(!playback_chunks_.empty());
  return ConvertToMediaRanges(taken);
}
// Returns the media ranges that correspond to the speech interval
// [`start`, `end`) without modifying `playback_chunks_`: the extraction runs
// on a copy of the chunk list.
// Both bounds are clamped to [0, `current_speech_time_`]. An empty result is
// returned when there are no chunks, when `start` is not before `end`, or when
// the clamped bounds coincide.
MediaRanges SpeechTimestampEstimator::PeekTimestampsInRange(
    SpeechTimestamp start,
    SpeechTimestamp end) {
  if (playback_chunks_.empty() || start >= end) {
    return MediaRanges();
  }

  constexpr auto kSpeechTimeZero = SpeechTimestamp(base::Seconds(0));
  const SpeechTimestamp clamped_start =
      std::clamp(start, kSpeechTimeZero, current_speech_time_);
  const SpeechTimestamp clamped_end =
      std::clamp(end, kSpeechTimeZero, current_speech_time_);

  // Clamping can collapse the interval, e.g. when both bounds lie beyond the
  // current speech time.
  if (clamped_start == clamped_end) {
    return MediaRanges();
  }

  // Work on a scratch copy so the stored chunks stay intact.
  auto scratch_chunks = playback_chunks_;
  PopFrontUntil(scratch_chunks, clamped_start);
  const auto peeked = TakeFrontUntil(scratch_chunks, clamped_end);

  return ConvertToMediaRanges(peeked);
}
// Maps each playback chunk to a media timestamp range running from the chunk's
// media start to its media end. The output has one entry per input chunk, in
// the same order.
MediaRanges SpeechTimestampEstimator::ConvertToMediaRanges(
    const std::vector<PlaybackChunk>& playbacks) {
  MediaRanges results;
  results.reserve(playbacks.size());

  for (const PlaybackChunk& chunk : playbacks) {
    results.push_back(
        media::MediaTimestampRange{.start = chunk.media_start.value(),
                                   .end = chunk.MediaEnd().value()});
  }

  return results;
}
}