chromium_src/chrome/renderer/accessibility/read_anything/read_aloud_app_model.h · OpenHarmony-TPC/chromium_src - AtomGit

// Copyright 2024 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef CHROME_RENDERER_ACCESSIBILITY_READ_ANYTHING_READ_ALOUD_APP_MODEL_H_
#define CHROME_RENDERER_ACCESSIBILITY_READ_ANYTHING_READ_ALOUD_APP_MODEL_H_

#include "base/metrics/single_sample_metrics.h"
#include "base/values.h"
#include "chrome/common/read_anything/read_anything.mojom.h"
#include "chrome/renderer/accessibility/phrase_segmentation/dependency_parser_model.h"
#include "chrome/renderer/accessibility/read_anything/read_aloud_traversal_utils.h"
#include "ui/accessibility/ax_node_position.h"

class ReadAnythingReadAloudAppModelTest;

// A class that holds state related to Read Aloud for the
// ReadAnythingAppController for the Read Anything WebUI app.
class ReadAloudAppModel {
 public:
  // Enum for logging when speech is stopped and why.
  // These values are persisted to logs. Entries should not be renumbered and
  // numeric values should never be reused.
  //
  // LINT.IfChange(ReadAloudStopSource)
  enum class ReadAloudStopSource {
    kButton = 0,
    kKeyboardShortcut = 1,
    kCloseReadingMode = 2,
    kCloseTabOrWindow = 3,
    kReloadPage = 4,
    kChangePage = 5,
    kEngineInterrupt = 6,
    kEngineError = 7,
    kFinishContent = 8,
    kLockChromeosDevice = 9,
    kUnexpectedUpdateContent = 10,

    kMinValue = kButton,
    kMaxValue = kUnexpectedUpdateContent,
  };
  // LINT.ThenChange(/tools/metrics/histograms/metadata/accessibility/enums.xml:ReadAnythingSpeechStopSource)

  static constexpr char kSpeechStopSourceHistogramName[] =
      "Accessibility.ReadAnything.SpeechStopSource";
  static constexpr char kAudioStartTimeFailureHistogramName[] =
      "Accessibility.ReadAnything.AudioStartTime.Failure";
  static constexpr char kAudioStartTimeSuccessHistogramName[] =
      "Accessibility.ReadAnything.AudioStartTime.Success";

  ReadAloudAppModel();
  ~ReadAloudAppModel();
  ReadAloudAppModel(const ReadAloudAppModel& other) = delete;
  ReadAloudAppModel& operator=(const ReadAloudAppModel&) = delete;

  bool speech_tree_initialized() { return speech_tree_initialized_; }
  bool speech_playing() { return speech_playing_; }
  void SetSpeechPlaying(bool is_playing);
  bool audio_currently_playing() { return audio_currently_playing_; }
  void SetAudioCurrentlyPlaying(bool is_playing);
  double speech_rate() const { return speech_rate_; }
  void set_speech_rate(double rate) { speech_rate_ = rate; }
  const base::Value::List& languages_enabled_in_pref() const {
    return languages_enabled_in_pref_;
  }
  void SetLanguageEnabled(const std::string& lang, bool enabled);
  const base::Value::Dict& voices() const { return voices_; }
  void SetVoice(const std::string& voice, const std::string& lang) {
    voices_.Set(lang, voice);
  }
  int highlight_granularity() const { return highlight_granularity_; }
  void set_highlight_granularity(int granularity) {
    highlight_granularity_ = granularity;
  }
  const std::string& default_language_code() const {
    return default_language_code_;
  }
  void set_default_language_code(const std::string& code) {
    default_language_code_ = code;
  }

  bool IsHighlightOn();
  void OnSettingsRestoredFromPrefs(
      double speech_rate,
      base::Value::List* languages_enabled_in_pref,
      base::Value::Dict* voices,
      read_anything::mojom::HighlightGranularity granularity);

  // Returns the next valid AXNodePosition.
  ui::AXNodePosition::AXPositionInstance
  GetNextValidPositionFromCurrentPosition(
      const a11y::ReadAloudCurrentGranularity& current_granularity,
      bool is_pdf,
      bool is_docs,
      const std::set<ui::AXNodeID>* current_nodes);

  // Inits the AXPosition with a starting node.
  // TODO(crbug.com/40927698): We should be able to use AXPosition in a way
  // where this isn't needed.
  void InitAXPositionWithNode(ui::AXNode* ax_node,
                              const ui::AXTreeID& active_tree_id);

  void ResetGranularityIndex();

  // Returns a list of ReadAloudCurrentGranularitys representing the next nodes
  // that should be spoken and highlighted with Read Aloud.
  // This defaults to returning the first granularity until
  // MovePositionTo<Next,Previous>Granularity() moves the position.
  // If the the current processed_granularity_index_ has not been calculated
  // yet, GetNextNodes() is called which updates the AXPosition.
  a11y::ReadAloudCurrentGranularity GetCurrentText(
      bool is_pdf,
      bool is_docs,
      const std::set<ui::AXNodeID>* current_nodes);

  // Asynchronously preprocess the text on the current page that will be
  // used for Read Aloud.
  void PreprocessTextForSpeech(bool is_pdf,
                               bool is_docs,
                               const std::set<ui::AXNodeID>* current_nodes);

  // Get the dependency parsing model for this renderer process.
  DependencyParserModel& GetDependencyParserModel();

  // Increments the processed_granularity_index_, updating ReadAloud's state of
  // the current granularity to refer to the next granularity. The current
  // behavior allows the client to increment past the end of the page's content.
  void MovePositionToNextGranularity();

  // Decrements the processed_granularity_index_,updating ReadAloud's state of
  // the current granularity to refer to the previous granularity. Cannot be
  // decremented less than 0.
  void MovePositionToPreviousGranularity();

  // Returns the Read Aloud starting text index for a node. For example,
  // if the entire text of the node should be read by Read Aloud at a particular
  // moment, this will return 0. Returns -1 if the node isn't in the current
  // segment.
  int GetCurrentTextStartIndex(const ui::AXNodeID& node_id);

  // Returns the Read Aloud ending text index for a node. For example,
  // if the entire text of the node should be read by Read Aloud at a particular
  // moment, this will return the length of the node's text. Returns -1 if the
  // node isn't in the current segment.
  int GetCurrentTextEndIndex(const ui::AXNodeID& node_id);

  void ResetReadAloudState();

  // Returns a list of segments representing the next nodes and ranges
  // that should be spoken and highlighted with Read Aloud. The text ranges
  // consist of start and end offsets within each node. This defaults to
  // returning the first granularity until
  // MovePositionTo<Next,Previous>Granularity() moves the position.
  // If the the current processed_granularity_index_ has not been calculated
  // yet, GetNextNodes() is called which updates the AXPosition.
  std::vector<ReadAloudTextSegment> GetCurrentTextSegments(
      bool is_pdf,
      bool is_docs,
      const std::set<ui::AXNodeID>* current_nodes);

  // Given a text index for the current granularity, return the nodes and the
  // corresponding text ranges for that part of the text. The text ranges
  // consist of start and end offsets within each node. If the `phrases`
  // argument is `true`, the text ranges for the containing phrase are returned,
  // otherwise the text ranges for the word are returned.
  //
  // For example, if a current granularity segment has text:
  // "Hello darkness, my old friend."
  // Composed of nodes:
  // Node: {id: 113, text: "Hello dark"}
  // Node: {id: 207, text: "ness, my old friend."}
  // Then GetHighlightForCurrentSegmentIndex for index=6 will return the
  // following nodes, which correspond to the word "darkness, ":
  //    [{"113", 6, 10}, {"207", 0, 6}]
  // For index=17, which corresponds to the word "my ", will return:
  //    [{"207", 6, 9}].
  std::vector<ReadAloudTextSegment> GetHighlightForCurrentSegmentIndex(
      int index,
      bool phrases) const;

  // Updates the session count for the given metric name using
  // SingleSampleMetric. These are then logged once on destruction.
  void IncrementMetric(const std::string& metric_name);

  void LogSpeechStop(ReadAloudStopSource source);

 private:
  friend ReadAnythingReadAloudAppModelTest;

  bool IsTsTextSegmentationEnabled() const;

  void LogAudioDelay(bool success);

  // Helper method for GetCurrentText.
  a11y::ReadAloudCurrentGranularity GetNextNodes(
      bool is_pdf,
      bool is_docs,
      const std::set<ui::AXNodeID>* current_nodes);

  // Returns true if the node was previously spoken or we expect to speak it
  // to be spoken once the current run of #GetCurrentText which called
  // #NodeBeenOrWillBeSpoken finishes executing. Because AXPosition
  // sometimes returns leaf nodes, we sometimes need to use the parent of a
  // node returned by AXPosition instead of the node itself. Because of this,
  // we need to double-check that the node has not been used or currently
  // in use.
  // Example:
  // parent node: id=5
  //    child node: id=6
  //    child node: id =7
  // node: id = 10
  // Where AXPosition will return nodes in order of 6, 7, 10, but Reading Mode
  // process them as 5, 10. Without checking for previously spoken nodes,
  // id 5 will be spoken twice.
  bool NodeBeenOrWillBeSpoken(
      const a11y::ReadAloudCurrentGranularity& current_granularity,
      const ui::AXNodeID& id) const;

  bool IsValidAXPosition(
      const ui::AXNodePosition::AXPositionInstance& position,
      const a11y::ReadAloudCurrentGranularity& current_granularity,
      bool is_pdf,
      bool is_docs,
      const std::set<ui::AXNodeID>* current_nodes) const;

  void AddTextToCurrentGranularity(
      ui::AXNode* anchor_node,
      int start_index,
      int end_index,
      a11y::ReadAloudCurrentGranularity& current_granularity,
      bool is_pdf,
      bool is_docs);

  // Returns if we should end text traversal from the current position, due
  // to reaching the end of content or reaching a point, such as a paragraph,
  // where a segment should be split.
  bool ShouldEndTextTraversal(
      a11y::ReadAloudCurrentGranularity current_granularity);

  // Helper method for GetNextNodes.
  // During text traversal for Read Aloud, adds text to the current Read Aloud
  // segment from the start of the current node.
  // for example, if:
  //   node 1: This is sentence 1.
  //   node 2: This is sentence 2.
  //   ax_position_ points to node 2,
  //   AddTextFromStartOfNode will add the text in node 2 to the current
  //   segment
  // Returns a TraversalState enum used to indicate if traversal should end,
  // continue to the next node, or continue within the same node.
  a11y::TraversalState AddTextFromStartOfNode(
      bool is_pdf,
      bool is_docs,
      a11y::ReadAloudCurrentGranularity& current_granularity);

  // Helper method for GetNextNodes.
  // During text traversal for Read Aloud, adds text to the current Read Aloud
  // segment from the middle of the current node.
  // for example, if:
  //   node 1: This is sentence 1.
  //   node 2: Hello! This is sentence 2.
  //   ax_position_ points to node 2 and current_text_index_ is 7.
  //   AddTextFromMiddleOfNode will add the text in node 2 starting from the
  //   current_text_index_ to the current speech segment
  // Returns a TraversalState enum used to indicate if traversal should end,
  // continue to the next node, or continue within the same node.
  a11y::TraversalState AddTextFromMiddleOfNode(
      bool is_pdf,
      bool is_docs,
      a11y::ReadAloudCurrentGranularity& current_granularity);

  bool PositionEndsWithOpeningPunctuation(
      bool is_superscript,
      int combined_sentence_index,
      const std::u16string& combined_text,
      a11y::ReadAloudCurrentGranularity current_granularity);

  // Helper for GetNextNodes.
  // Moves the current AXPosition to the next valid position.
  void MoveToNextAXPosition(
      a11y::ReadAloudCurrentGranularity& current_granularity,
      bool is_pdf,
      bool is_docs,
      const std::set<ui::AXNodeID>* current_nodes);

  // Helper for GetNextNodes.
  // Returns true if the node at the current AXPosition has no more text
  // remaining.
  // e.g. If the current node's text is "You need to not care. You need to not
  //      stare." and Read Aloud has read out loud both sentences, this will
  //      return true. However, if Read Aloud has only read out the first
  //      sentence, this will return false because "You need to not stare."
  //      still needs to be read.
  bool NoValidTextRemainingInCurrentNode(bool is_pdf, bool is_docs) const;

  // Asynchronously segment the given granularity into phrases. Once the phrases
  // are calculated, `UpdatePhraseBoundaries` will be called.
  void CalculatePhrases(a11y::ReadAloudCurrentGranularity& granularity);

  // Once the phrase segmentation has completed for a given sentence, update the
  // granularity with the phrase boundaries, and calculate phrases for the next
  // sentence.
  // TODO(crbug.com/384820795): Investigate if a UID or hash
  // can be used to avoid passing around the tokens.
  void UpdatePhraseBoundaries(std::vector<std::string> tokens,
                              std::vector<size_t> heads);

  // Initiate phrase calculation from the first sentence.
  void StartPhraseCalculation();

  // Whether Read Aloud speech was initiated. Audio may or may not have actually
  // started output.
  bool speech_playing_ = false;
  // Whether audio for Read aloud is actually playing.
  bool audio_currently_playing_ = false;

  // The current speech rate for reading aloud.
  double speech_rate_ = 1.0;

  // The languages that the user has enabled for reading aloud.
  base::Value::List languages_enabled_in_pref_;

  // The user's preferred voices. Maps from a language to the last chosen
  // voice for that language.
  base::Value::Dict voices_;

  // The current granularity being used for the reading highlight.
  int highlight_granularity_ =
      (int)read_anything::mojom::HighlightGranularity::kDefaultValue;

  // The default language code, used as a fallback in case the page language
  // is invalid. It's not guaranteed that default_language_code_ will always
  // be valid, but as it is tied to the browser language, it is likely more
  // stable.
  std::string default_language_code_ = "en";

  // Metrics for logging. Any metric that we want to track 0-counts of should
  // be initialized here.
  const int min_sample = 0;
  const int max_sample = 1000;
  const uint32_t buckets = 50;
  std::map<std::string, int64_t> metric_to_count_map_ = {
      {"Accessibility.ReadAnything.ReadAloudNextButtonSessionCount", 0},
      {"Accessibility.ReadAnything.ReadAloudPauseSessionCount", 0},
      {"Accessibility.ReadAnything.ReadAloudPlaySessionCount", 0},
      {"Accessibility.ReadAnything.ReadAloudPreviousButtonSessionCount", 0},
  };
  std::map<std::string, std::unique_ptr<base::SingleSampleMetric>>
      metric_to_single_sample_;

  // The time when the speech becomes active.
  base::TimeTicks speech_active_time_ms_;

  // Traversal state

  ui::AXNodePosition::AXPositionInstance ax_position_;

  // If ax_position_ has been initialized. Since preprocessing nodes
  // can result in the AXPosition being set to the null position, reading mode
  // can't rely on AXPosition->IsNullPosition() to check whether or not the
  // speech tree has been initialized.
  bool speech_tree_initialized_ = false;

  // Our current index within processed_granularities_on_current_page_.
  size_t processed_granularity_index_ = 0;

  // The current text index within the given node.
  int current_text_index_ = 0;

  // Whether a phrase calculation for a sentence is currently underway. (We
  // do not initiate a second calculation before the first has completed.)
  bool is_calculating_phrases = false;

  // Which sentence (index into `processed_granularities_on_current_page`) is
  // currently being processed for phrases. -1 if none.
  int current_phrase_calculation_index_ = -1;

  // TODO(crbug.com/40927698): Clear this when granularity changes.
  // TODO(crbug.com/40927698): Use this to assist in navigating forwards /
  // backwards.
  // Previously processed granularities on the current page.
  std::vector<a11y::ReadAloudCurrentGranularity>
      processed_granularities_on_current_page_;

  ui::AXTreeID active_tree_id_ = ui::AXTreeIDUnknown();

  base::WeakPtrFactory<ReadAloudAppModel> weak_ptr_factory_{this};
};

#endif  // CHROME_RENDERER_ACCESSIBILITY_READ_ANYTHING_READ_ALOUD_APP_MODEL_H_