// Copyright 2018 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef UI_ACCESSIBILITY_AX_LANGUAGE_DETECTION_H_
#define UI_ACCESSIBILITY_AX_LANGUAGE_DETECTION_H_

#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

#include "base/memory/raw_ptr.h"
#include "third_party/cld_3/src/src/nnet_language_identifier.h"
#include "ui/accessibility/ax_enums.mojom-forward.h"
#include "ui/accessibility/ax_export.h"
#include "ui/accessibility/ax_tree_observer.h"

namespace ui {

class AXNode;
class AXTree;

// This module implements language detection enabling Chrome to automatically
// detect the language for runs of text within the page.
//
// Node-level language detection runs once per page after the load complete
// event. This involves two passes:
//   *Detect* walks the tree from the given root using cld3 to detect up to 3
//            potential languages per node. A ranked list is created enumerating
//            all potential languages on a page.
//   *Label* re-walks the tree, assigning a language to each node considering
//           the potential languages from the detect phase, page level
//           statistics, and the assigned languages of ancestor nodes.
//
// Optionally an embedder may run *sub-node* language detection which attempts
// to assign languages for runs of text within a node, potentially down to the
// individual character level. This is useful in cases where a single paragraph
// involves switching between multiple languages, and where the speech engine
// doesn't automatically switch voices to handle different character sets.
// Due to the potentially small lengths of text runs involved this tends to be
// lower in accuracy, and works best when a node is composed of multiple
// languages with easily distinguishable scripts.

// AXLanguageInfo represents the local language detection data for all text
// within an AXNode. Stored on AXNode.
//
// It holds both the raw detection output (`detected_languages`) and the
// language that was finally assigned to the node during the label phase
// (`language`).
struct AX_EXPORT AXLanguageInfo {
  AXLanguageInfo();
  ~AXLanguageInfo();

  // The final language assigned to this node during the 'label' step. It is
  // the result of merging:
  //  a) the detected language for this node,
  //  b) the declared lang attribute on this node,
  //  c) the (recursive) language of the parent (detected or declared).
  //
  // This will be the empty string if no language was assigned during the label
  // phase.
  //
  // IETF BCP 47 language code (RFC 5646).
  // Examples:
  //  'de'
  //  'de-DE'
  //  'en'
  //  'en-US'
  //  'es-ES'
  //
  // This should not be read directly by clients of AXNode; instead clients
  // should call AXNode::GetLanguage().
  // TODO(chrishall): consider renaming this to `assigned_language`.
  std::string language;

  // Detected languages for this node, sorted as returned by
  // FindTopNMostFreqLangs (decreasing order of probability) and filtered to
  // remove any unreliable results.
  std::vector<std::string> detected_languages;
};

// Each AXLanguageSpan contains a language, a probability, and start and end
// indices. The indices specify the substring that contains the associated
// language. The string which the indices are relative to is not included in
// this structure.
// Also, the indices are relative to a UTF-8 string.
// See documentation on GetLanguageAnnotationForStringAttribute for details
// on how to associate this object with a string.
//
// The scalar members carry default member initializers so that a
// default-initialized span has well-defined (zero) values instead of
// indeterminate ones. The struct is still an aggregate, so brace
// initialization in declaration order keeps working.
struct AX_EXPORT AXLanguageSpan {
  // Index into the UTF-8 string at which this span starts.
  int start_index = 0;

  // Index into the UTF-8 string at which this span ends.
  // NOTE(review): whether this bound is inclusive or exclusive is not visible
  // from this header - confirm against the code that produces spans.
  int end_index = 0;

  // Language associated with the substring described by the indices.
  std::string language;

  // Probability associated with `language` for this span.
  float probability = 0.0f;
};

// One AXLanguageInfoStats instance is stored on each AXTree. It accumulates
// statistics about the languages detected across all of the AXNodes in that
// tree.
//
// The tree-level statistics serve two purposes:
//  * They act as an extra signal when labelling individual nodes, raising our
//    confidence before a detected language is assigned.
//  * They feed the reports sent about the language detection feature, which
//    are used for tuning.
//
// During the Label step a detected language is only assigned to a node when
// that language is among the most frequent languages on the page.
//
// Example: suppose a node has detected_languages (ordered by the probability
// cld_3 assigned) of da-DK, en-AU, fr-FR, while the page-wide statistics say
// the page is mostly en-AU and ja-JP. Danish is then more likely to be a
// mis-recognition than an accurate result, so en-AU is assigned rather than
// da-DK.
class AX_EXPORT AXLanguageInfoStats {
 public:
  AXLanguageInfoStats();

  // Every AXLanguageInfoStats belongs to one specific AXTree. Copying would be
  // safe but has no logical meaning, so it is disallowed.
  AXLanguageInfoStats(const AXLanguageInfoStats&) = delete;
  AXLanguageInfoStats& operator=(const AXLanguageInfoStats&) = delete;

  ~AXLanguageInfoStats();

  // Updates the statistics with the provided detected languages.
  void Add(const std::vector<std::string>& languages);

  // Returns the score recorded for `lang`.
  int GetScore(const std::string& lang) const;

  // Returns whether `lang` is within the top results.
  bool CheckLanguageWithinTop(const std::string& lang);

  // Records statistics describing how a node was labelled. The inputs are the
  // language the node was labelled with, the language the author assigned,
  // and whether the label was our highest confidence detection result.
  void RecordLabelStatistics(const std::string& labelled_lang,
                             const std::string& author_lang,
                             bool labelled_with_first_result);

  // Updates metrics to note that language detection was attempted for a node.
  void RecordDetectionAttempt();

  // Reports metrics to UMA.
  // Covers the statistics gathered since the last run; intended to run once
  // per detect & label iteration. On success the statistics are reset.
  void ReportMetrics();

 private:
  // Grants access to a fixture that is only used in testing.
  friend class AXLanguageDetectionTestFixture;

  // Marks the cached top results as stale.
  void InvalidateTopResults();

  // Computes the top results and stores them in the cache.
  void GenerateTopResults();

  // Companion to `disable_metric_clearing_`; see the note on that member.
  void ClearMetrics();

  // Number of occurrences recorded for each language.
  std::unordered_map<std::string, int> lang_counts_;

  // Cache of the most recently calculated top language results: pairs of
  // (score, language), sorted by descending score.
  std::vector<std::pair<int, std::string>> top_results_;

  // True while the statistics have not been mutated since the top results were
  // last calculated. Setting this to false forces a recalculation the next
  // time the results are fetched.
  bool top_results_valid_;

  // TODO(chrishall): Do we want this for testing? or is it better to only test
  //  the generated metrics by inspecting the histogram?
  // Used for testing metrics only: disables clearing of metrics.
  bool disable_metric_clearing_;

  // *** Statistics recorded for metric reporting. ***
  // Each of the following covers a single iteration of language detection and
  // is reset after every successful call of ReportMetrics.

  // How many nodes detection was attempted on.
  int count_detection_attempted_;

  // How many nodes produced detection results.
  int count_detection_results_;

  // How many nodes were assigned a label.
  int count_labelled_;

  // How many nodes were assigned a label that was also the highest confidence
  // detected language.
  int count_labelled_with_top_result_;

  // How many times a label disagreed with the node's author-provided language
  // annotation.
  //
  // Given
  //  <div lang='en'><span>...</span><span>...</span></div>
  // where both spans are detected and labelled as 'fr', this counts as `2`
  // overrides.
  int count_overridden_;

  // The top detected language of every node, collected as a set so that the
  // number of unique detected languages metric (LangsPerPage) can be produced.
  std::unordered_set<std::string> unique_top_lang_detected_;
};

// AXLanguageDetectionObserver is registered as a change observer on an AXTree
// and will run language detection after each update to the tree.
//
// We have kept this observer separate from the AXLanguageDetectionManager as we
// are aiming to launch language detection in two phases and wanted to try to
// keep the code paths somewhat separate.
//
// TODO(chrishall): After both features have launched we could consider merging
// AXLanguageDetectionObserver into AXLanguageDetectionManager.
//
// TODO(chrishall): Investigate the cost of using AXTreeObserver, given that it
// has many empty virtual methods which are called for every AXTree change and
// we are only currently interested in OnAtomicUpdateFinished.
class AX_EXPORT AXLanguageDetectionObserver : public ui::AXTreeObserver {
 public:
  // Observer constructor will register itself with the provided AXTree.
  explicit AXLanguageDetectionObserver(AXTree* tree);

  // Observer destructor will remove itself as an observer from the AXTree.
  ~AXLanguageDetectionObserver() override;

  // AXLanguageDetectionObserver contains a pointer so copying is non-trivial.
  AXLanguageDetectionObserver(const AXLanguageDetectionObserver&) = delete;
  AXLanguageDetectionObserver& operator=(const AXLanguageDetectionObserver&) =
      delete;

 private:
  // AXTreeObserver override, and the only observer callback this class
  // overrides. Per the class comment, this is the hook from which language
  // detection is run after a tree update.
  void OnAtomicUpdateFinished(ui::AXTree* tree,
                              bool root_changed,
                              const std::vector<Change>& changes) override;

  // Non-owning pointer to AXTree, used to de-register observer on destruction.
  // The pointer itself is const, so it cannot be reseated after construction.
  const raw_ptr<AXTree> tree_;
};

// AXLanguageDetectionManager manages all of the context needed for language
// detection within an AXTree: the cld3 language identifiers, the tree-level
// statistics, and (optionally) the observer used for dynamic content.
class AX_EXPORT AXLanguageDetectionManager {
 public:
  // Construct an AXLanguageDetectionManager for the specified tree.
  explicit AXLanguageDetectionManager(AXTree* tree);
  ~AXLanguageDetectionManager();

  // AXLanguageDetectionManager contains pointers so copying is non-trivial.
  AXLanguageDetectionManager(const AXLanguageDetectionManager&) = delete;
  AXLanguageDetectionManager& operator=(const AXLanguageDetectionManager&) =
      delete;

  // Detect languages for each node in the tree managed by this manager.
  // This is the first pass in detection and labelling.
  // This only detects the language, it does not label it; for that see
  // LabelLanguages.
  void DetectLanguages();

  // Label languages for each node in the tree managed by this manager.
  // This is the second pass in detection and labelling.
  // This will label the language, but relies on the earlier detection phase
  // having already completed.
  void LabelLanguages();

  // Sub-node language detection for a given string attribute.
  // For example, if a node has name: "My name is Fred", then calling
  // GetLanguageAnnotationForStringAttribute(*node, ax::mojom::StringAttribute::
  // kName) would return language detection information about "My name is Fred".
  //
  // The result is a list of AXLanguageSpan; each span's indices are relative
  // to a UTF-8 string (see AXLanguageSpan).
  std::vector<AXLanguageSpan> GetLanguageAnnotationForStringAttribute(
      const AXNode& node,
      ax::mojom::StringAttribute attr);

  // Construct and register a dynamic content change observer for this manager.
  void RegisterLanguageDetectionObserver();

 private:
  // The observer is granted access to the private per-subtree and per-node
  // helpers declared below.
  friend class AXLanguageDetectionObserver;

  // Allow access from a fixture only used in testing.
  friend class AXLanguageDetectionTestFixture;

  // Helper methods to test if language detection features are enabled.
  static bool IsStaticLanguageDetectionEnabled();
  static bool IsDynamicLanguageDetectionEnabled();

  // Perform detection for subtree rooted at subtree_root.
  void DetectLanguagesForSubtree(AXNode* subtree_root);
  // Perform detection for node. Will not descend into children.
  void DetectLanguagesForNode(AXNode* node);
  // Perform labelling for subtree rooted at subtree_root.
  void LabelLanguagesForSubtree(AXNode* subtree_root);
  // Perform labelling for node. Will not descend into children.
  void LabelLanguagesForNode(AXNode* node);

  // This language identifier is constructed with a default minimum byte length
  // of chrome_lang_id::NNetLanguageIdentifier::kMinNumBytesToConsider and is
  // used for detecting page-level languages.
  chrome_lang_id::NNetLanguageIdentifier language_identifier_;

  // This language identifier is constructed with a minimum byte length of
  // kShortTextIdentifierMinByteLength so it can be used for detecting languages
  // of shorter text (e.g. one character).
  chrome_lang_id::NNetLanguageIdentifier short_text_language_identifier_;

  // The observer to support dynamic content language detection.
  // Presumably null until RegisterLanguageDetectionObserver() is called -
  // confirm against the implementation.
  std::unique_ptr<AXLanguageDetectionObserver> language_detection_observer_;

  // Non-owning back pointer to the tree which owns this manager.
  raw_ptr<AXTree> tree_;

  // Tree-level statistics on detected languages (see AXLanguageInfoStats).
  AXLanguageInfoStats lang_info_stats_;
};

}  // namespace ui

#endif  // UI_ACCESSIBILITY_AX_LANGUAGE_DETECTION_H_