910e62b5创建于 1月15日历史提交
// Copyright 2018 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "content/browser/speech/tts_controller_impl.h"

#include <stddef.h>

#include <algorithm>
#include <string>
#include <vector>

#include "base/containers/queue.h"
#include "base/functional/bind.h"
#include "base/json/json_reader.h"
#include "base/metrics/histogram_macros.h"
#include "base/metrics/user_metrics.h"
#include "base/observer_list.h"
#include "base/strings/string_util.h"
#include "base/task/single_thread_task_runner.h"
#include "base/values.h"
#include "build/build_config.h"
#include "content/browser/speech/tts_utterance_impl.h"
#include "content/public/browser/content_browser_client.h"
#include "content/public/browser/tts_utterance.h"
#include "content/public/browser/visibility.h"
#include "content/public/browser/web_contents.h"
#include "content/public/common/content_client.h"
#include "services/data_decoder/public/cpp/safe_xml_parser.h"
#include "services/data_decoder/public/mojom/xml_parser.mojom.h"
#include "third_party/blink/public/mojom/speech/speech_synthesis.mojom.h"
#include "ui/base/l10n/l10n_util.h"

#if BUILDFLAG(IS_CHROMEOS)
#include "content/public/browser/tts_controller_delegate.h"
#endif

namespace content {
namespace {
// A value to be used to indicate that there is no char index available.
const int kInvalidCharIndex = -1;

// A value to be used to indicate that there is no length available.
const int kInvalidLength = -1;

#if BUILDFLAG(IS_CHROMEOS)
bool VoiceIdMatches(
    const std::optional<TtsControllerDelegate::PreferredVoiceId>& id,
    const content::VoiceData& voice) {
  if (!id.has_value() || voice.name.empty() ||
      (voice.engine_id.empty() && !voice.native))
    return false;
  if (voice.native)
    return id->name == voice.name && id->id.empty();
  return id->name == voice.name && id->id == voice.engine_id;
}
#endif  // BUILDFLAG(IS_CHROMEOS)

TtsUtteranceImpl* AsUtteranceImpl(TtsUtterance* utterance) {
  return static_cast<TtsUtteranceImpl*>(utterance);
}

bool IsUtteranceSpokenByRemoteEngine(TtsUtterance* utterance) {
  if (utterance && !utterance->GetEngineId().empty()) {
    TtsUtteranceImpl* utterance_impl = AsUtteranceImpl(utterance);
    return utterance_impl->spoken_by_remote_engine();
  }
  return false;
}

}  // namespace

//
// VoiceData
//

VoiceData::VoiceData() : remote(false), native(false) {}

VoiceData::VoiceData(const VoiceData& other) = default;

VoiceData::~VoiceData() {}

//
// TtsController
//

TtsController* TtsController::GetInstance() {
  return TtsControllerImpl::GetInstance();
}

void TtsController::SkipAddNetworkChangeObserverForTests(bool enabled) {
  return TtsControllerImpl::SkipAddNetworkChangeObserverForTests(enabled);
}

// IMPORTANT!
// These values are written to logs.  Do not renumber or delete
// existing items; add new entries to the end of the list.
// LINT.IfChange(UMATextToSpeechEvent)
enum class UMATextToSpeechEvent {
  START = 0,
  END = 1,
  WORD = 2,
  SENTENCE = 3,
  MARKER = 4,
  INTERRUPTED = 5,
  CANCELLED = 6,
  SPEECH_ERROR = 7,
  PAUSE = 8,
  RESUME = 9,

  // This must always be the last enum. It's okay for its value to
  // increase, but none of the other enum values may change.
  COUNT
};
// LINT.ThenChange(/tools/metrics/histograms/metadata/accessibility/enums.xml:TextToSpeechEvent)

//
// TtsControllerImpl
//

// static
bool TtsControllerImpl::skip_add_network_change_observer_for_tests_ = false;

// static
TtsControllerImpl* TtsControllerImpl::GetInstance() {
  return base::Singleton<TtsControllerImpl>::get();
}

// static
void TtsControllerImpl::SkipAddNetworkChangeObserverForTests(bool enabled) {
  TtsControllerImpl::skip_add_network_change_observer_for_tests_ = enabled;
}

void TtsControllerImpl::SetStopSpeakingWhenHidden(bool value) {
  stop_speaking_when_hidden_ = value;
}

TtsControllerImpl::TtsControllerImpl() {
  if (!skip_add_network_change_observer_for_tests_) {
    net::NetworkChangeNotifier::AddNetworkChangeObserver(this);
  }
  OnNetworkChanged(net::NetworkChangeNotifier::GetConnectionType());
}

TtsControllerImpl::~TtsControllerImpl() {
  if (current_utterance_) {
    current_utterance_->Finish();
    SetCurrentUtterance(nullptr);
  }

  // Clear any queued utterances too.
  ClearUtteranceQueue(false);  // Don't sent events.

  net::NetworkChangeNotifier::RemoveNetworkChangeObserver(this);
}

void TtsControllerImpl::SpeakOrEnqueue(
    std::unique_ptr<TtsUtterance> utterance) {
  if (!ShouldSpeakUtterance(utterance.get())) {
    utterance->Finish();
    return;
  }

  // If the TTS platform or tts engine delegate is still loading or
  // initializing, queue or flush the utterance. The utterances can be sent to
  // platform specific implementation or to the engine implementation. Every
  // utterances are postponed until the platform specific implementation and
  // built in tts engine are loaded to avoid races where the utterance gets
  // dropped unexpectedly.
  if (TtsPlatformLoading() ||
      (engine_delegate_ && !engine_delegate_->IsBuiltInTtsEngineInitialized(
                               utterance->GetBrowserContext()))) {
    GetTtsPlatform()->LoadBuiltInTtsEngine(utterance->GetBrowserContext());

    if (utterance->GetShouldClearQueue())
      ClearUtteranceQueue(true);

    utterance_list_.emplace_back(std::move(utterance));
    return;
  }

  // If we're paused and we get an utterance that can't be queued,
  // flush the queue but stay in the paused state.
  if (paused_ && utterance->GetShouldClearQueue()) {
    Stop();
    utterance_list_.emplace_back(std::move(utterance));
    paused_ = true;
    return;
  }

  if (paused_ || (IsSpeaking() && !utterance->GetShouldClearQueue())) {
    utterance_list_.emplace_back(std::move(utterance));
  } else {
    Stop();
    SpeakNow(std::move(utterance));
  }
}

void TtsControllerImpl::Stop() {
  Stop(GURL());
}

void TtsControllerImpl::Stop(const GURL& source_url) {
  StopAndClearQueue(source_url);
}

void TtsControllerImpl::StopAndClearQueue(const GURL& source_url) {
  if (StopCurrentUtteranceIfMatches(source_url))
    ClearUtteranceQueue(true);
}

bool TtsControllerImpl::StopCurrentUtteranceIfMatches(const GURL& source_url) {
  base::RecordAction(base::UserMetricsAction("TextToSpeech.Stop"));

  paused_ = false;

  if (!source_url.is_empty() && current_utterance_ &&
      current_utterance_->GetSrcUrl().DeprecatedGetOriginAsURL() !=
          source_url.DeprecatedGetOriginAsURL())
    return false;

  StopCurrentUtterance();
  return true;
}

void TtsControllerImpl::StopCurrentUtterance() {
  bool spoken_by_remote_engine =
      IsUtteranceSpokenByRemoteEngine(current_utterance_.get());
  if (engine_delegate_ && current_utterance_ &&
      !current_utterance_->GetEngineId().empty() && !spoken_by_remote_engine) {
    engine_delegate_->Stop(current_utterance_.get());
  } else if (TtsPlatformReady()) {
    GetTtsPlatform()->ClearError();
    GetTtsPlatform()->StopSpeaking();
  }

  if (current_utterance_) {
    current_utterance_->OnTtsEvent(TTS_EVENT_INTERRUPTED, kInvalidCharIndex,
                                   kInvalidLength, std::string());
  }

  FinishCurrentUtterance();
}

void TtsControllerImpl::Pause() {
  base::RecordAction(base::UserMetricsAction("TextToSpeech.Pause"));

  if (paused_)
    return;

  paused_ = true;
  bool spoken_by_remote_engine =
      IsUtteranceSpokenByRemoteEngine(current_utterance_.get());
  if (engine_delegate_ && current_utterance_ &&
      !current_utterance_->GetEngineId().empty() && !spoken_by_remote_engine) {
    engine_delegate_->Pause(current_utterance_.get());
  } else if (current_utterance_) {
    DCHECK(TtsPlatformReady());
    GetTtsPlatform()->ClearError();
    GetTtsPlatform()->Pause();
  }
}

void TtsControllerImpl::Resume() {
  base::RecordAction(base::UserMetricsAction("TextToSpeech.Resume"));

  if (!paused_)
    return;

  paused_ = false;
  bool spoken_by_remote_engine =
      IsUtteranceSpokenByRemoteEngine(current_utterance_.get());
  if (engine_delegate_ && current_utterance_ &&
      !current_utterance_->GetEngineId().empty() && !spoken_by_remote_engine) {
    engine_delegate_->Resume(current_utterance_.get());
  } else if (current_utterance_) {
    DCHECK(TtsPlatformReady());
    GetTtsPlatform()->ClearError();
    GetTtsPlatform()->Resume();
  } else {
    SpeakNextUtterance();
  }
}

void TtsControllerImpl::UninstallLanguageRequest(
    content::BrowserContext* browser_context,
    const std::string& lang,
    const std::string& client_id,
    int source,
    bool uninstall_immediately) {
  if (!engine_delegate_) {
    return;
  }

  engine_delegate_->UninstallLanguageRequest(browser_context, lang, client_id,
                                             source, uninstall_immediately);
}

void TtsControllerImpl::InstallLanguageRequest(BrowserContext* browser_context,
                                               const std::string& lang,
                                               const std::string& client_id,
                                               int source) {
  if (!engine_delegate_) {
    return;
  }

  engine_delegate_->InstallLanguageRequest(browser_context, lang, client_id,
                                           source);
}

void TtsControllerImpl::LanguageStatusRequest(BrowserContext* browser_context,
                                              const std::string& lang,
                                              const std::string& client_id,
                                              int source) {
  if (!engine_delegate_) {
    return;
  }

  engine_delegate_->LanguageStatusRequest(browser_context, lang, client_id,
                                          source);
}

void TtsControllerImpl::OnTtsEvent(int utterance_id,
                                   TtsEventType event_type,
                                   int char_index,
                                   int length,
                                   const std::string& error_message) {
  // We may sometimes receive completion callbacks "late", after we've
  // already finished the utterance (for example because another utterance
  // interrupted or we got a call to Stop). This is normal and we can
  // safely just ignore these events.
  if (!current_utterance_ || utterance_id != current_utterance_->GetId()) {
    return;
  }

  UMATextToSpeechEvent metric;
  switch (event_type) {
    case TTS_EVENT_START:
      metric = UMATextToSpeechEvent::START;
      break;
    case TTS_EVENT_END:
      metric = UMATextToSpeechEvent::END;
      break;
    case TTS_EVENT_WORD:
      metric = UMATextToSpeechEvent::WORD;
      break;
    case TTS_EVENT_SENTENCE:
      metric = UMATextToSpeechEvent::SENTENCE;
      break;
    case TTS_EVENT_MARKER:
      metric = UMATextToSpeechEvent::MARKER;
      break;
    case TTS_EVENT_INTERRUPTED:
      metric = UMATextToSpeechEvent::INTERRUPTED;
      break;
    case TTS_EVENT_CANCELLED:
      metric = UMATextToSpeechEvent::CANCELLED;
      break;
    case TTS_EVENT_ERROR:
      metric = UMATextToSpeechEvent::SPEECH_ERROR;
      break;
    case TTS_EVENT_PAUSE:
      metric = UMATextToSpeechEvent::PAUSE;
      break;
    case TTS_EVENT_RESUME:
      metric = UMATextToSpeechEvent::RESUME;
      break;
    default:
      NOTREACHED();
  }
  UMA_HISTOGRAM_ENUMERATION("TextToSpeech.Event", metric,
                            UMATextToSpeechEvent::COUNT);

  current_utterance_->OnTtsEvent(event_type, char_index, length, error_message);
  if (current_utterance_->IsFinished()) {
    FinishCurrentUtterance();
    SpeakNextUtterance();
  }
}

void TtsControllerImpl::OnTtsUtteranceBecameInvalid(int utterance_id) {
#if BUILDFLAG(IS_CHROMEOS)
  // This handles the case that the utterance originated from the standalone
  // browser becomes invalid, we need to stop
  RemoveUtteranceAndStopIfNeeded(utterance_id);
#else
  NOTREACHED();
#endif
}

void TtsControllerImpl::GetVoices(BrowserContext* browser_context,
                                  const GURL& source_url,
                                  std::vector<VoiceData>* out_voices) {
  // Initialize GetTtsPlatform first, so that engine_delegate_ can be set
  // if necessary.
  TtsPlatform* tts_platform = GetTtsPlatform();

  DCHECK(tts_platform);
  // Ensure we have all built-in voices loaded. This is a no-op if already
  // loaded.
  tts_platform->LoadBuiltInTtsEngine(browser_context);
  if (TtsPlatformReady())
    tts_platform->GetVoices(out_voices);

  if (browser_context && engine_delegate_ &&
      engine_delegate_->IsBuiltInTtsEngineInitialized(browser_context)) {
    engine_delegate_->GetVoices(browser_context, source_url, out_voices);
  }

  tts_platform->FinalizeVoiceOrdering(*out_voices);

  if (!allow_remote_voices_) {
    auto it =
        std::remove_if(out_voices->begin(), out_voices->end(),
                       [](const VoiceData& voice) { return voice.remote; });
    out_voices->resize(it - out_voices->begin());
  }
}

bool TtsControllerImpl::IsSpeaking() {
  return current_utterance_ != nullptr ||
         (TtsPlatformReady() && GetTtsPlatform()->IsSpeaking());
}

void TtsControllerImpl::UpdateLanguageStatus(
    BrowserContext* browser_context,
    const std::string& lang,
    LanguageInstallStatus install_status,
    const std::string& error) {
  if (update_language_status_delegates_.empty()) {
    return;
  }

  for (auto& delegate : update_language_status_delegates_) {
    delegate.OnUpdateLanguageStatus(browser_context, lang, install_status,
                                    error);
  }
}

void TtsControllerImpl::AddUpdateLanguageStatusDelegate(
    UpdateLanguageStatusDelegate* delegate) {
  update_language_status_delegates_.AddObserver(delegate);
}

void TtsControllerImpl::RemoveUpdateLanguageStatusDelegate(
    UpdateLanguageStatusDelegate* delegate) {
  update_language_status_delegates_.RemoveObserver(delegate);
}

void TtsControllerImpl::VoicesChanged() {
  if (voices_changed_delegates_.empty() || TtsPlatformLoading())
    return;

  // Existence of platform tts indicates explicit requests to tts. Since
  // |VoicesChanged| can occur implicitly, only send if needed.
  for (auto& delegate : voices_changed_delegates_)
    delegate.OnVoicesChanged();

  if (!current_utterance_ && !utterance_list_.empty())
    SpeakNextUtterance();
}

void TtsControllerImpl::AddVoicesChangedDelegate(
    VoicesChangedDelegate* delegate) {
  voices_changed_delegates_.AddObserver(delegate);
}

void TtsControllerImpl::RemoveVoicesChangedDelegate(
    VoicesChangedDelegate* delegate) {
  voices_changed_delegates_.RemoveObserver(delegate);
}

void TtsControllerImpl::RemoveUtteranceEventDelegate(
    UtteranceEventDelegate* delegate) {
  // First clear any pending utterances with this delegate.
  std::list<std::unique_ptr<TtsUtterance>> old_list;
  utterance_list_.swap(old_list);
  while (!old_list.empty()) {
    std::unique_ptr<TtsUtterance> utterance = std::move(old_list.front());
    old_list.pop_front();
    if (utterance->GetEventDelegate() != delegate)
      utterance_list_.emplace_back(std::move(utterance));
  }

  if (current_utterance_ &&
      current_utterance_->GetEventDelegate() == delegate) {
    current_utterance_->SetEventDelegate(nullptr);
    if (engine_delegate_ && !current_utterance_->GetEngineId().empty()) {
      engine_delegate_->Stop(current_utterance_.get());
    } else {
      DCHECK(TtsPlatformReady());
      GetTtsPlatform()->ClearError();
      GetTtsPlatform()->StopSpeaking();
    }

    FinishCurrentUtterance();
    SpeakNextUtterance();
  }
}

void TtsControllerImpl::SetTtsEngineDelegate(TtsEngineDelegate* delegate) {
  engine_delegate_ = delegate;
}

TtsEngineDelegate* TtsControllerImpl::GetTtsEngineDelegate() {
  return engine_delegate_;
}

void TtsControllerImpl::RefreshVoices() {
  GetTtsPlatform()->RefreshVoices();
}

void TtsControllerImpl::Shutdown() {
  if (tts_platform_)
    tts_platform_->Shutdown();
}

void TtsControllerImpl::OnBrowserContextDestroyed(
    BrowserContext* browser_context) {
  bool did_clear_utterances = false;

  // First clear the BrowserContext from any utterances.
  for (std::unique_ptr<TtsUtterance>& utterance : utterance_list_) {
    if (utterance->GetBrowserContext() == browser_context) {
      utterance->ClearBrowserContext();
      did_clear_utterances = true;
    }
  }

  if (current_utterance_ &&
      current_utterance_->GetBrowserContext() == browser_context) {
    current_utterance_->ClearBrowserContext();
    did_clear_utterances = true;
  }

  // If we cleared the BrowserContext from any utterances, stop speech
  // just to be safe. Do this using PostTask because calling Stop might
  // try to send notifications and that can trigger code paths that try
  // to access the BrowserContext that's being deleted. Note that it's
  // safe to use base::Unretained because this is a singleton.
  if (did_clear_utterances) {
    base::SingleThreadTaskRunner::GetCurrentDefault()->PostTask(
        FROM_HERE, base::BindOnce(&TtsControllerImpl::StopAndClearQueue,
                                  base::Unretained(this), GURL()));
  }
}

void TtsControllerImpl::SetTtsPlatform(TtsPlatform* tts_platform) {
  tts_platform_ = tts_platform;
}

int TtsControllerImpl::QueueSize() {
  return static_cast<int>(utterance_list_.size());
}

TtsPlatform* TtsControllerImpl::GetTtsPlatform() {
  if (!tts_platform_)
    tts_platform_ = TtsPlatform::GetInstance();
  DCHECK(tts_platform_);
  return tts_platform_;
}

bool TtsControllerImpl::TtsPlatformReady() {
  TtsPlatform* tts_platform = GetTtsPlatform();
  return tts_platform->PlatformImplSupported() &&
         tts_platform->PlatformImplInitialized();
}

bool TtsControllerImpl::TtsPlatformLoading() {
  // If the platform implementation is supported, it is considered to be in
  // loading state until the platform is inititialized. Typically, that means
  // the libraries are loaded and the voices are being loaded.
  TtsPlatform* tts_platform = GetTtsPlatform();
  return tts_platform->PlatformImplSupported() &&
         !tts_platform->PlatformImplInitialized();
}

void TtsControllerImpl::SpeakNow(std::unique_ptr<TtsUtterance> utterance) {
  // Get all available voices and try to find a matching voice.
  std::vector<VoiceData> voices;
  GetVoices(utterance->GetBrowserContext(), utterance->GetSrcUrl(), &voices);

  // Get the best matching voice. If nothing matches, just set "native"
  // to true because that might trigger deferred loading of native voices.
  // TODO(katie): Move most of the GetMatchingVoice logic into content/ and
  // use the TTS controller delegate to get chrome-specific info as needed.
  int index = GetMatchingVoice(utterance.get(), voices);
  VoiceData voice;
  if (index >= 0) {
    voice = voices[index];
  } else {
    voice.native = true;
    voice.engine_id = utterance->GetEngineId();
    voice.name = utterance->GetVoiceName();
    voice.lang = utterance->GetLang();
  }

  UpdateUtteranceDefaults(utterance.get());

  GetTtsPlatform()->WillSpeakUtteranceWithVoice(utterance.get(), voice);

  base::RecordAction(base::UserMetricsAction("TextToSpeech.Speak"));
  UMA_HISTOGRAM_COUNTS_100000("TextToSpeech.Utterance.Rate",
                              utterance->GetContinuousParameters().rate);
  UMA_HISTOGRAM_COUNTS_100000("TextToSpeech.Utterance.TextLength",
                              utterance->GetText().size());
  UMA_HISTOGRAM_BOOLEAN("TextToSpeech.Utterance.FromExtensionAPI",
                        !utterance->GetSrcUrl().is_empty());
  UMA_HISTOGRAM_BOOLEAN("TextToSpeech.Utterance.HasVoiceName",
                        !utterance->GetVoiceName().empty());
  UMA_HISTOGRAM_BOOLEAN("TextToSpeech.Utterance.Native", voice.native);

  if (!voice.native) {
#if !BUILDFLAG(IS_ANDROID)
    DCHECK(!voice.engine_id.empty());
    SetCurrentUtterance(std::move(utterance));
    current_utterance_->SetEngineId(voice.engine_id);
    if (engine_delegate_) {
      engine_delegate_->Speak(current_utterance_.get(), voice);
    }

    bool sends_end_event =
        voice.events.find(TTS_EVENT_END) != voice.events.end();
    if (!sends_end_event) {
      current_utterance_->Finish();
      SetCurrentUtterance(nullptr);
      SpeakNextUtterance();
    }
#endif  // !BUILDFLAG(IS_ANDROID)
  } else {
    // It's possible for certain platforms to send start events immediately
    // during |speak|.
    SetCurrentUtterance(std::move(utterance));
    if (TtsPlatformReady()) {
      GetTtsPlatform()->ClearError();
      GetTtsPlatform()->Speak(
          current_utterance_->GetId(), current_utterance_->GetText(),
          current_utterance_->GetLang(), voice,
          current_utterance_->GetContinuousParameters(),
          base::BindOnce(&TtsControllerImpl::OnSpeakFinished,
                         base::Unretained(this), current_utterance_->GetId()));
    } else {
      // The TTS platform is not supported.
      OnSpeakFinished(current_utterance_->GetId(), false);
    }
  }
}

void TtsControllerImpl::OnSpeakFinished(int utterance_id, bool success) {
  if (success)
    return;

  // Since OnSpeakFinished could run asynchronously, it is possible that the
  // current utterance has changed. Ignore any such spurious callbacks.
  if (!current_utterance_ || current_utterance_->GetId() != utterance_id)
    return;

  // If the native voice wasn't able to process this speech, see if the browser
  // has built-in TTS that crashed and needs re-loading or the utterance came
  // from a profile that no longer exists e.g. login.
  // The controller only ends up here if we had at some point completely
  // initialized native tts and tts engine delegate (see SpeakOrEnqueue), so
  // drop the utterance from re-processing.
  GetTtsPlatform()->LoadBuiltInTtsEngine(
      current_utterance_->GetBrowserContext());

  current_utterance_->OnTtsEvent(TTS_EVENT_ERROR, kInvalidCharIndex,
                                 kInvalidLength, GetTtsPlatform()->GetError());
  SetCurrentUtterance(nullptr);
}

void TtsControllerImpl::ClearUtteranceQueue(bool send_events) {
  while (!utterance_list_.empty()) {
    std::unique_ptr<TtsUtterance> utterance =
        std::move(utterance_list_.front());
    utterance_list_.pop_front();
    if (send_events) {
      utterance->OnTtsEvent(TTS_EVENT_CANCELLED, kInvalidCharIndex,
                            kInvalidLength, std::string());
    } else {
      utterance->Finish();
    }
  }
}

void TtsControllerImpl::FinishCurrentUtterance() {
  if (!current_utterance_)
    return;

  if (!current_utterance_->IsFinished()) {
    current_utterance_->OnTtsEvent(TTS_EVENT_INTERRUPTED, kInvalidCharIndex,
                                   kInvalidLength, std::string());
  }

  SetCurrentUtterance(nullptr);
}

void TtsControllerImpl::SpeakNextUtterance() {
  if (paused_)
    return;

  // Start speaking the next utterance in the queue.  Keep trying in case
  // one fails but there are still more in the queue to try.
  TtsUtterance* previous_utterance = nullptr;
  while (!utterance_list_.empty() && !current_utterance_) {
    std::unique_ptr<TtsUtterance> utterance =
        std::move(utterance_list_.front());
    utterance_list_.pop_front();
    DCHECK(previous_utterance != utterance.get());

    if (ShouldSpeakUtterance(utterance.get()))
      SpeakNow(std::move(utterance));
    else
      utterance->Finish();

    previous_utterance = utterance.get();
  }
}

void TtsControllerImpl::UpdateUtteranceDefaults(TtsUtterance* utterance) {
  double rate = utterance->GetContinuousParameters().rate;
  double pitch = utterance->GetContinuousParameters().pitch;
  double volume = utterance->GetContinuousParameters().volume;
#if BUILDFLAG(IS_CHROMEOS)
  if (GetTtsControllerDelegate())
    GetTtsControllerDelegate()->UpdateUtteranceDefaultsFromPrefs(
        utterance, &rate, &pitch, &volume);
#else
  // Update pitch, rate and volume to defaults if not explicity set on
  // this utterance.
  if (rate == blink::mojom::kSpeechSynthesisDoublePrefNotSet)
    rate = blink::mojom::kSpeechSynthesisDefaultRate;
  if (pitch == blink::mojom::kSpeechSynthesisDoublePrefNotSet)
    pitch = blink::mojom::kSpeechSynthesisDefaultPitch;
  if (volume == blink::mojom::kSpeechSynthesisDoublePrefNotSet)
    volume = blink::mojom::kSpeechSynthesisDefaultVolume;
#endif  // BUILDFLAG(IS_CHROMEOS)
  utterance->SetContinuousParameters(rate, pitch, volume);
}

void TtsControllerImpl::StripSSML(
    const std::string& utterance,
    base::OnceCallback<void(const std::string&)> on_ssml_parsed) {
  // Skip parsing and return if not xml.
  if (utterance.find("<?xml") == std::string::npos) {
    std::move(on_ssml_parsed).Run(utterance);
    return;
  }

  // Parse using safe, out-of-process Xml Parser.
  data_decoder::DataDecoder::ParseXmlIsolated(
      utterance,
      data_decoder::mojom::XmlParser::WhitespaceBehavior::kPreserveSignificant,
      base::BindOnce(&TtsControllerImpl::StripSSMLHelper, utterance,
                     std::move(on_ssml_parsed)));
}

// Called when ParseXml finishes.
// Uses parsed xml to build parsed utterance text.
void TtsControllerImpl::StripSSMLHelper(
    const std::string& utterance,
    base::OnceCallback<void(const std::string&)> on_ssml_parsed,
    data_decoder::DataDecoder::ValueOrError result) {
  // Error checks.
  // If invalid xml, return original utterance text.
  if (!result.has_value()) {
    std::move(on_ssml_parsed).Run(utterance);
    return;
  }

  std::string root_tag_name;
  data_decoder::GetXmlElementTagName(*result, &root_tag_name);
  // Root element must be <speak>.
  if (root_tag_name.compare("speak") != 0) {
    std::move(on_ssml_parsed).Run(utterance);
    return;
  }

  std::string parsed_text;
  // Change from unique_ptr to base::Value* so recursion will work.
  PopulateParsedText(&parsed_text, &*result);

  // Run with parsed_text.
  std::move(on_ssml_parsed).Run(parsed_text);
}

void TtsControllerImpl::PopulateParsedText(std::string* parsed_text,
                                           const base::Value* element) {
  DCHECK(parsed_text);
  if (!element || !element->is_dict()) {
    return;
  }
  // Add element's text if present.
  // Note: We don't use data_decoder::GetXmlElementText because it gets the text
  // of element's first child, not text of current element.
  const std::string* text_value =
      element->GetDict().FindString(data_decoder::mojom::XmlParser::kTextKey);
  if (text_value)
    *parsed_text += *text_value;

  const base::Value::List* children =
      data_decoder::GetXmlElementChildren(*element);
  if (!children) {
    return;
  }

  for (const auto& entry : *children) {
    // We need to iterate over all children because some text elements are
    // nested within other types of elements, such as <emphasis> tags.
    PopulateParsedText(parsed_text, &entry);
  }
}

int TtsControllerImpl::GetMatchingVoice(TtsUtterance* utterance,
                                        const std::vector<VoiceData>& voices) {
  const std::string app_lang =
      GetContentClient()->browser()->GetApplicationLocale();
  // Start with a best score of -1, that way even if none of the criteria
  // match, something will be returned if there are any voices.
  int best_score = -1;
  int best_score_index = -1;
#if BUILDFLAG(IS_CHROMEOS)
  TtsControllerDelegate* delegate = GetTtsControllerDelegate();
  std::unique_ptr<TtsControllerDelegate::PreferredVoiceIds> preferred_ids =
      delegate ? delegate->GetPreferredVoiceIdsForUtterance(utterance)
               : nullptr;
#endif  // BUILDFLAG(IS_CHROMEOS)
  for (size_t i = 0; i < voices.size(); ++i) {
    const content::VoiceData& voice = voices[i];
    int score = 0;

    // If the extension ID is specified, check for an exact match.
    if (!utterance->GetEngineId().empty() &&
        utterance->GetEngineId() != voice.engine_id)
      continue;

    // If the voice name is specified, check for an exact match.
    if (!utterance->GetVoiceName().empty() &&
        voice.name != utterance->GetVoiceName())
      continue;

    // Prefer the utterance language.
    if (!voice.lang.empty() && !utterance->GetLang().empty()) {
      std::string voice_language =
          base::ToLowerASCII(l10n_util::GetLanguage(voice.lang));
      std::string voice_country =
          base::ToLowerASCII(l10n_util::GetCountry(voice.lang));
      std::string utterance_language =
          base::ToLowerASCII(l10n_util::GetLanguage(utterance->GetLang()));
      std::string utterance_country =
          base::ToLowerASCII(l10n_util::GetCountry(utterance->GetLang()));

      // An exact locale match is worth more than a partial match.
      // Convert locales to lowercase to handle cases like "en-us" vs. "en-US".
      // Cases where language and country match should score the same as an
      // exact match.
      if (voice_language == utterance_language &&
          (voice_country == utterance_country ||
           (utterance_country.empty() && voice_language == voice_country) ||
           (voice_country.empty() &&
            utterance_language == utterance_country))) {
        score += 128;
      } else if (voice_language == utterance_language) {
        score += 64;
      }
    }

    // Next, prefer required event types.
    if (!utterance->GetRequiredEventTypes().empty()) {
      bool has_all_required_event_types = true;
      for (TtsEventType event_type : utterance->GetRequiredEventTypes()) {
        if (voice.events.find(event_type) == voice.events.end()) {
          has_all_required_event_types = false;
          break;
        }
      }
      if (has_all_required_event_types)
        score += 32;
    }

#if BUILDFLAG(IS_CHROMEOS)
    if (preferred_ids) {
      // First prefer the user's preference voice for the utterance language,
      // if the utterance language is specified.
      if (!utterance->GetLang().empty() &&
          VoiceIdMatches(preferred_ids->lang_voice_id, voice)) {
        score += 16;
      }

      // Then prefer the user's preference voice for the system language.
      // This is a lower priority match than the utterance voice.
      if (VoiceIdMatches(preferred_ids->locale_voice_id, voice))
        score += 8;

      // Finally, prefer the user's preference voice for any language. This will
      // pick the default voice if there is no better match for the current
      // system language and utterance language.
      if (VoiceIdMatches(preferred_ids->any_locale_voice_id, voice))
        score += 4;
    }
#endif  // BUILDFLAG(IS_CHROMEOS)

    // Finally, prefer system language.
    if (!voice.lang.empty()) {
      if (voice.lang == app_lang) {
        score += 2;
      } else if (base::EqualsCaseInsensitiveASCII(
                     l10n_util::GetLanguage(voice.lang),
                     l10n_util::GetLanguage(app_lang))) {
        score += 1;
      }
    }

    if (score > best_score) {
      best_score = score;
      best_score_index = i;
    }
  }

  return best_score_index;
}

void TtsControllerImpl::SetCurrentUtterance(
    std::unique_ptr<TtsUtterance> utterance) {
  current_utterance_ = std::move(utterance);
  Observe(current_utterance_
              ? AsUtteranceImpl(current_utterance_.get())->GetWebContents()
              : nullptr);
}

void TtsControllerImpl::StopCurrentUtteranceAndRemoveUtterancesMatching(
    WebContents* wc) {
  DCHECK(wc);
  // Removes any utterances that match the WebContents from the current
  // utterance (which our inherited WebContentsObserver starts observing every
  // time the utterance changes).
  //
  // This is called when the WebContents for the current utterance is destroyed
  // or hidden. In the case where it's destroyed, this is done to avoid
  // attempting to start a utterance that is very likely to be destroyed right
  // away, and there are also subtle timing issues if we didn't do this (if a
  // queued utterance has already received WebContentsDestroyed(), and we start
  // it, we won't get the corresponding WebContentsDestroyed()).
  auto eraser = [wc](const std::unique_ptr<TtsUtterance>& utterance) {
    TtsUtteranceImpl* utterance_impl = AsUtteranceImpl(utterance.get());
    if (utterance_impl->GetWebContents() == wc) {
      utterance_impl->Finish();
      return true;
    }
    return false;
  };
  utterance_list_.erase(
      std::remove_if(utterance_list_.begin(), utterance_list_.end(), eraser),
      utterance_list_.end());
  const bool stopped = StopCurrentUtteranceIfMatches(GURL());
  DCHECK(stopped);
  SpeakNextUtterance();
}

void TtsControllerImpl::RemoveUtteranceAndStopIfNeeded(int utterance_id) {
  for (std::list<std::unique_ptr<TtsUtterance>>::iterator it =
           utterance_list_.begin();
       it != utterance_list_.end(); ++it) {
    if ((*it)->GetId() == utterance_id) {
      TtsUtteranceImpl* utterance_impl = AsUtteranceImpl((*it).get());
      utterance_impl->Finish();
      utterance_list_.erase(it);
      break;
    }
  }

  const bool stopped = StopCurrentUtteranceIfMatches(utterance_id);
  if (stopped)
    SpeakNextUtterance();
}

bool TtsControllerImpl::StopCurrentUtteranceIfMatches(int utterance_id) {
  paused_ = false;

  if (current_utterance_->GetId() != utterance_id)
    return false;

  StopCurrentUtterance();
  return true;
}

bool TtsControllerImpl::ShouldSpeakUtterance(TtsUtterance* utterance) {
  TtsUtteranceImpl* utterance_impl = AsUtteranceImpl(utterance);
  if (!utterance_impl->was_created_with_web_contents() ||
      utterance_impl->ShouldAlwaysBeSpoken()) {
    return true;
  }

  // If the WebContents that created the utterance has been destroyed, don't
  // speak it.
  if (!utterance_impl->GetWebContents())
    return false;

  // Allow speaking if either the WebContents is visible, or the WebContents
  // isn't required to be visible before speaking.
  return !stop_speaking_when_hidden_ ||
         utterance_impl->GetWebContents()->GetVisibility() !=
             Visibility::HIDDEN;
}

//
// WebContentsObserver
//

void TtsControllerImpl::WebContentsDestroyed() {
  StopCurrentUtteranceAndRemoveUtterancesMatching(web_contents());
}

void TtsControllerImpl::OnVisibilityChanged(Visibility visibility) {
  if (visibility == Visibility::HIDDEN && stop_speaking_when_hidden_)
    StopCurrentUtteranceAndRemoveUtterancesMatching(web_contents());
}

void TtsControllerImpl::OnNetworkChanged(
    net::NetworkChangeNotifier::ConnectionType type) {
  switch (type) {
      // Non-cellular connections.
    case net::NetworkChangeNotifier::ConnectionType::CONNECTION_UNKNOWN:
    case net::NetworkChangeNotifier::ConnectionType::CONNECTION_ETHERNET:
    case net::NetworkChangeNotifier::ConnectionType::CONNECTION_WIFI:
    case net::NetworkChangeNotifier::ConnectionType::CONNECTION_BLUETOOTH:
      allow_remote_voices_ = true;
      break;

      // Cellular connections.
    case net::NetworkChangeNotifier::ConnectionType::CONNECTION_2G:
    case net::NetworkChangeNotifier::ConnectionType::CONNECTION_3G:
    case net::NetworkChangeNotifier::ConnectionType::CONNECTION_4G:
    case net::NetworkChangeNotifier::ConnectionType::CONNECTION_NONE:
    case net::NetworkChangeNotifier::ConnectionType::CONNECTION_5G:
      allow_remote_voices_ = false;
  }
}

#if BUILDFLAG(IS_CHROMEOS)
TtsControllerDelegate* TtsControllerImpl::GetTtsControllerDelegate() {
  if (delegate_)
    return delegate_;
  if (GetContentClient() && GetContentClient()->browser()) {
    delegate_ = GetContentClient()->browser()->GetTtsControllerDelegate();
    return delegate_;
  }
  return nullptr;
}

void TtsControllerImpl::SetTtsControllerDelegateForTesting(
    TtsControllerDelegate* delegate) {
  delegate_ = delegate;
}
#endif  // BUILDFLAG(IS_CHROMEOS)

}  // namespace content