Ddeepin-ci-robotfeat: init package
acf493e8创建于 2023年11月13日历史提交
/* -*- Mode: C++ -*- */
// Copyright 2010 University of Helsinki
//
//  Licensed under the Apache License, Version 2.0 (the "License");
//  you may not use this file except in compliance with the License.
//  You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
//  Unless required by applicable law or agreed to in writing, software
//  distributed under the License is distributed on an "AS IS" BASIS,
//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//  See the License for the specific language governing permissions and
//  limitations under the License.

#ifndef HFST_OSPELL_OSPELL_H_
#define HFST_OSPELL_OSPELL_H_ 1

#include "hfstol-stdafx.h"
#include <string>
#include <deque>
#include <queue>
#include <list>
#include <stdexcept>
#include <limits>
#include <ctime>
#include "hfst-ol.h"

namespace hfst_ospell {

//! Internal class for transition processing.

struct TreeNode;
struct CacheContainer;
typedef std::pair<std::string, std::string> StringPair;
typedef std::pair<std::string, Weight> StringWeightPair;
typedef std::pair<std::vector<std::string>, Weight> SymbolsWeightPair;
typedef std::vector<StringWeightPair> StringWeightVector;
typedef std::pair<std::pair<std::string, std::string>, Weight>
                                                        StringPairWeightPair;
typedef std::vector<TreeNode> TreeNodeVector;
typedef std::map<std::string, Weight> StringWeightMap;

//! Contains low-level processing stuff.
struct STransition{
    TransitionTableIndex index; //!< index to transition
    SymbolNumber symbol; //!< symbol of transition
    Weight weight; //!< weight of transition

    //!
    //! create transition without weight
    STransition(TransitionTableIndex i,
                SymbolNumber s):
        index(i),
        symbol(s),
        weight(0.0)
        {}

    //! create transition with weight
    STransition(TransitionTableIndex i,
                SymbolNumber s,
                Weight w):
        index(i),
        symbol(s),
        weight(w)
        {}

};
//! @brief comparison for establishing order for priority queue for suggestions.

//! The suggestions that are stored in a priority queue are arranged in
//! ascending order of the weight component, following the basic penalty
//! weight logic of tropical semiring that is present in most weighted
//! finite-state spell-checking automata.
class StringWeightComparison
/* results are reversed by default because greater weights represent
   worse results - to reverse the reversal, give a true argument*/

{
    bool reverse;
public:
    //!
    //! construct a result comparator with ascending or descending weight order
    StringWeightComparison(bool reverse_result=false):
        reverse(reverse_result)
        {}
    
    //!
    //! compare two string weight pairs for weights
    bool operator() (const StringWeightPair& lhs, const StringWeightPair& rhs) const
    { // return true when we want rhs to appear before lhs
        if (reverse) {
            return (lhs.second < rhs.second);
        }
        else {
            return (lhs.second > rhs.second);
        }
    }
};


//! The suggestions that are stored in a priority queue are arranged in
//! as in StringWeightComparison.
//
//! Follows weight value logic.
//! @see StringWeightComparison.
class SymbolsWeightComparison
/* results are reversed by default because greater weights represent
   worse results - to reverse the reversal, give a true argument*/

{
    bool reverse;
public:
    //!
    //! construct a result comparator with ascending or descending weight order
    SymbolsWeightComparison(bool reverse_result=false):
        reverse(reverse_result)
        {}

    //!
    //! compare two symbols weight pairs for weights
    bool operator() (const SymbolsWeightPair& lhs, const SymbolsWeightPair& rhs) const
    { // return true when we want rhs to appear before lhs
        if (reverse) {
            return (lhs.second < rhs.second);
        }
        else {
            return (lhs.second > rhs.second);
        }
    }
};

//! @brief comparison for complex analysis queues
//
//! Follows weight value logic.
//! @see StringWeightComparison.
class StringPairWeightComparison
{
    bool reverse;
public:
    //!
    //! create result comparator with ascending or descending weight order
    StringPairWeightComparison(bool reverse_result=false):
        reverse(reverse_result)
        {}
    
    //!
    //! compare two analysis corrections for weights
    bool operator() (const StringPairWeightPair& lhs, const StringPairWeightPair& rhs) const
    { // return true when we want rhs to appear before lhs
        if (reverse) {
            return (lhs.second < rhs.second);
        }
        else {
            return (lhs.second > rhs.second);
        }
    }
};

typedef std::priority_queue<StringWeightPair,
                            std::vector<StringWeightPair>,
                            StringWeightComparison> CorrectionQueue;
typedef std::priority_queue<StringWeightPair,
                            std::vector<StringWeightPair>,
                            StringWeightComparison> AnalysisQueue;
typedef std::priority_queue<StringWeightPair,
                            std::vector<StringWeightPair>,
                            StringWeightComparison> HyphenationQueue;
typedef std::priority_queue<StringPairWeightPair,
                            std::vector<StringPairWeightPair>,
                            StringPairWeightComparison> AnalysisCorrectionQueue;
typedef std::priority_queue<SymbolsWeightPair,
                            std::vector<SymbolsWeightPair>,
                            SymbolsWeightComparison> AnalysisSymbolsQueue;

struct WeightQueue: public std::list<Weight>
{
    void push(Weight w); // add a new weight
    void pop(void); // delete the biggest weight
    Weight get_lowest(void) const;
    Weight get_highest(void) const;
};

//! Internal class for Transducer processing.

//! Contains low-level processing stuff.
class Transducer
{
protected:
    TransducerHeader header; //!< header data
    TransducerAlphabet alphabet; //!< alphabet data
    KeyTable * keys; //!< key symbol mappings
    Encoder encoder; //!< encoder to convert the strings

    static const TransitionTableIndex START_INDEX = 0; //!< position of first
  
public:
    //! 
    //! read transducer from file @a f
    Transducer(FILE * f);
    //!
    //! read transducer from raw dara @a data
    Transducer(char * raw);
    IndexTable indices; //!< index table
    TransitionTable transitions; //!< transition table
    //!
    //! Deprecated functions for single-tranducer lookup
    //! Speller::analyse() is recommended
    bool initialize_input_vector(SymbolVector & input_vector,
                                 Encoder * encoder,
                                 char * line);
    AnalysisQueue lookup(char * line);
    //!
    //! whether it's final transition in this transducer
    bool final_transition(TransitionTableIndex i);
    //!
    //! whether it's final index 
    bool final_index(TransitionTableIndex i);
    //!
    //! get transducers symbol table mapping
    KeyTable * get_key_table(void);
    //!
    //! find key for string or create it
    SymbolNumber find_next_key(char ** p);
    //!
    //! get encoder for mapping sttrings and symbols
    Encoder * get_encoder(void);
    //!
    //! get size of a state
    unsigned int get_state_size(void);
    //!
    //! get position of the ? symbols
    SymbolNumber get_unknown(void) const;
    SymbolNumber get_identity(void) const;
    //!
    //! get alphabet of automaton
    TransducerAlphabet * get_alphabet(void);
    //!
    //! get flag stuff of automaton
    OperationMap * get_operations(void);
    //!
    //! follow epsilon transitions from index
    STransition take_epsilons(const TransitionTableIndex i) const;
    //!
    //! follow epsilon transitions and falsg form index
    STransition take_epsilons_and_flags(const TransitionTableIndex i);
    //! 
    //! follow real transitions from index
    STransition take_non_epsilons(const TransitionTableIndex i,
                                  const SymbolNumber symbol) const;
    //!
    //! get next index
    TransitionTableIndex next(const TransitionTableIndex i,
                              const SymbolNumber symbol) const;
    //!
    //! get next epsilon inedx
    TransitionTableIndex next_e(const TransitionTableIndex i) const;
    //! 
    //! whether state has any transitions with @a symbol
    bool has_transitions(const TransitionTableIndex i,
                         const SymbolNumber symbol) const;
    //!
    //! whether state has epsilon s or flag s
    bool has_epsilons_or_flags(const TransitionTableIndex i);
    //!
    //! whether state has non-epsilons or non-flags
    bool has_non_epsilons_or_flags(const TransitionTableIndex i);
    //! 
    //! whether it's final
    bool is_final(const TransitionTableIndex i);
    //! 
    //! get final weight
    Weight final_weight(const TransitionTableIndex i) const;
    //!
    //! whether it's a flag
    bool is_flag(const SymbolNumber symbol);
    //!
    //! whether it's weighedc
    bool is_weighted(void);

};

//! Internal class for alphabet processing.

//! Contains low-level processing stuff.
struct TreeNode
{
//    SymbolVector input_string; //<! the current input vector
    SymbolVector string; //!< the current output vector
    unsigned int input_state; //!< its input state
    TransitionTableIndex mutator_state; //!< state in error model
    TransitionTableIndex lexicon_state; //!< state in language model
    FlagDiacriticState flag_state; //!< state of flags
    Weight weight; //!< weight

    //!
    //! construct a node in trie from all that stuff
    TreeNode(SymbolVector prev_string,
             unsigned int i,
             TransitionTableIndex mutator,
             TransitionTableIndex lexicon,
             FlagDiacriticState state,
             Weight w):
        string(prev_string),
        input_state(i),
        mutator_state(mutator),
        lexicon_state(lexicon),
        flag_state(state),
        weight(w)
        { }

    //! 
    //! construct empty node with a starting state for flags
    TreeNode(FlagDiacriticState start_state): // starting state node
    string(SymbolVector()),
    input_state(0),
    mutator_state(0),
    lexicon_state(0),
    flag_state(start_state),
    weight(0.0)
        { }

    //!
    //! check if tree node is compatible with flag diacritc
    bool try_compatible_with(FlagDiacriticOperation op);

    //!
    //! traverse some node in lexicon
    TreeNode update_lexicon(SymbolNumber next_symbol,
                            TransitionTableIndex next_lexicon,
                            Weight weight);

    //!
    //! traverse some node in error model
    TreeNode update_mutator(TransitionTableIndex next_mutator,
                            Weight weight);

    //!
    //! The update functions return updated copies of this state
     TreeNode update(SymbolNumber output_symbol,
                     unsigned int next_input,
                     TransitionTableIndex next_mutator,
                     TransitionTableIndex next_lexicon,
                     Weight weight);

    TreeNode update(SymbolNumber output_symbol,
                    TransitionTableIndex next_mutator,
                    TransitionTableIndex next_lexicon,
                    Weight weight);


};

typedef std::vector<TreeNode> TreeNodeQueue;

int nByte_utf8(unsigned char c);

//! Exception when speller cannot map characters of error model to language
//! model.

//! May get raised if error model automaton has output characters that are not
//! present in language model.
class AlphabetTranslationException: public std::runtime_error
{ // "what" should hold the first untranslatable symbol
public:
    
    //!
    //! create alpabet exception with symbol as explanation
    AlphabetTranslationException(const std::string what):
        std::runtime_error(what)
        { }
};

//! @brief Basic spell-checking automata pair unit.

//! Speller consists of two automata, one for language modeling and one for
//! error modeling. The speller object has low-level access to the automata
//! and convenience functions for checking, analysing and correction.
//! @see ZHfstOspeller for high level access.
class Speller
{
public:
    Transducer * mutator; //!< error model
    Transducer * lexicon; //!< languag model
    SymbolVector input; //!< current input
    TreeNodeQueue queue; //!< current traversal fifo stack
    TreeNode next_node;  //!< current next node
    Weight limit; //!< current limit for weights
    Weight best_suggestion; //!< best suggestion so far
    WeightQueue nbest_queue; //!< queue to keep track of current n best results
    SymbolVector alphabet_translator; //!< alphabets in automata
    OperationMap * operations; //!< flags in it
    //!< A cache for the result of first symbols
    std::vector<CacheContainer> cache;
    //!< what kind of limiting behaviour we have
    enum LimitingBehaviour { None, MaxWeight, Nbest, Beam, MaxWeightNbest,
                             MaxWeightBeam, NbestBeam, MaxWeightNbestBeam } limiting;
    //! what mode we're in
    enum Mode { Check, Correct, Lookup } mode;

    //! the maximum amount of time to take
    double max_time;
    //! when we started work
    clock_t start_clock;
    // A counter to avoid checking the clock too often
    unsigned long call_counter;
    // A flag to set for when time has been overstepped
    bool limit_reached;
    
    //!
    //! Create a speller object form error model and language automata.
    Speller(Transducer * mutator_ptr, Transducer * lexicon_ptr);
    //!
    //! size of states
    SymbolNumber get_state_size(void);
    //!
    //! initialise string conversions
    void build_alphabet_translator(void);
    void add_symbol_to_alphabet_translator(SymbolNumber to_sym);
    //!
    //! initialize input string
    bool init_input(char * line);
    //!
    //! travers epsilons in language model
    void lexicon_epsilons(void);
    bool has_lexicon_epsilons(void) const
        {
            return lexicon->has_epsilons_or_flags(next_node.lexicon_state + 1);
        }
    //!
    //! traverse epsilons in error modle
    void mutator_epsilons(void);
    bool has_mutator_epsilons(void) const
        {
            return mutator->has_transitions(next_node.mutator_state + 1, 0);
        }
    //!
    //! traverse along input
    void consume_input();
    //! helper functions for traversal
    void queue_mutator_arcs(SymbolNumber input);
    void lexicon_consume(void);
    void queue_lexicon_arcs(SymbolNumber input,
                            unsigned int mutator_state,
                            Weight mutator_weight = 0.0,
                            int input_increment = 0);
    //! @brief Check if the given string is accepted by the speller
    //
    //! foo
    bool check(char * line);
    //! @brief suggest corrections for given string @a line.
    //
    //! The number of corrections given and stored at any given time
    //! is limited by @a nbest if ≥ 0. 
    CorrectionQueue correct(char * line, int nbest = 0,
                            Weight maxweight = -1.0,
                            Weight beam = -1.0,
                            float time_cutoff = 0.0);

    bool is_under_weight_limit(Weight w) const;
    void set_limiting_behaviour(int nbest, Weight maxweight, Weight beam);
    void adjust_weight_limits(int nbest, Weight beam);
    
    //! @brief analyse given string @a line.
    //
    //! If language model is two-tape, give a list of analyses for string.
    //! If not, this should return queue of one result @a line if the
    //! string is in language model and 0 results if it isn't.
    AnalysisQueue analyse(char * line, int nbest = 0);

    //! @brief analyse given string @a line.
    //
    //! Like analyse, but keep symbols separate, instead of concatenating to
    //! strings.
    AnalysisSymbolsQueue analyseSymbols(char * line, int nbest = 0);


    void build_cache(SymbolNumber first_sym);
    //! @brief Construct a cache entry for @a first_sym..

};

struct CacheContainer
{
    // All the nodes that ultimately result from searching at input depth 1
    TreeNodeVector nodes;
    // The results are for length max one inputs only
    StringWeightVector results_len_0;
    StringWeightVector results_len_1;
    bool empty;

    CacheContainer(void): empty(true) {}
    
    void clear(void)
        {
            nodes.clear();
            results_len_0.clear();
            results_len_1.clear();
        }
    
};

std::string stringify(KeyTable * key_table,
                      SymbolVector & symbol_vector);

std::vector<std::string> symbolify(KeyTable * key_table,
                                   SymbolVector & symbol_vector);

} // namespace hfst_ospell

// Some platforms lack strndup
char* hfst_strndup(const char* s, size_t n);
    
#endif // HFST_OSPELL_OSPELL_H_