/*
* Copyright (c) 2021 Huawei Device Co., Ltd.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef ECMASCRIPT_REGEXP_PARSER_H
#define ECMASCRIPT_REGEXP_PARSER_H
#include <cstdarg>
#include <cstdio>
#include <cstdint>
#include "ecmascript/js_thread.h"
#include "ecmascript/ecma_macros.h"
#include "ecmascript/mem/chunk.h"
#include "ecmascript/mem/c_containers.h"
#include "ecmascript/mem/c_string.h"
#include "ecmascript/mem/dyn_chunk.h"
#include "ecmascript/regexp/regexp_opcode.h"
#include "unicode/stringpiece.h"
#include "unicode/uchar.h"
#include "unicode/utf16.h"
#include "unicode/utf8.h"
#include "unicode/utypes.h"
#include "unicode/udata.h"
#include "unicode/uniset.h"
namespace panda::ecmascript {
// Parser state and interface for regular-expression source text.
//
// The object keeps a byte cursor (base_/pc_/end_) over the pattern handed to Init(), the
// current character c0_, the flag bits, and two DynChunk buffers (buffer_, groupNames_)
// that are constructed from the Chunk passed to the constructor. Most parsing routines are
// only declared here; their definitions live outside this header.
//
// Several identifiers contain spelling slips (NUM_CAPTURE__OFFSET, CAPTURE_CONUT_ADVANCE,
// IsExactPropertyValueAlis, MatchSepcialUnicodeProperty, ParserIntervalQuantifier, IsStick).
// They are part of the interface used by code outside this header and are kept unchanged.
class RegExpParser {
public:
    // Bit masks tested against flags_ by the Is*() accessors below.
    static constexpr auto FLAG_GLOBAL = (1U << 0U);
    static constexpr auto FLAG_IGNORECASE = (1U << 1U);
    static constexpr auto FLAG_MULTILINE = (1U << 2U);
    static constexpr auto FLAG_DOTALL = (1U << 3U);
    static constexpr auto FLAG_UTF16 = (1U << 4U);
    static constexpr auto FLAG_STICKY = (1U << 5U);
    static constexpr auto FLAG_HASINDICES = (1U << 6U);
    // Equals the number of FLAG_* masks defined above.
    static constexpr uint32_t FLAG_NUM = 7;
    // Sentinel stored in c0_ whenever the cursor is outside the input (see Advance()/Prev()).
    // constexpr (like every other constant here) so no out-of-line definition is required.
    static constexpr uint32_t KEY_EOF = UINT32_MAX;

    // The constants below are not referenced inside this header. Their meaning is only
    // suggested by their names (offsets, numeric bases, advance widths) -- TODO confirm
    // against the implementation before relying on any of them.
    static constexpr int CLASS_RANGE_BASE = 0x40000000;
    static constexpr uint32_t NUM_CAPTURE__OFFSET = 4;
    static constexpr uint32_t NUM_STACK_OFFSET = 8;
    static constexpr uint32_t OCTAL_VALUE = 8;
    static constexpr uint32_t OCTAL_VALUE_RANGE = 32;
    static constexpr uint32_t HEX_VALUE = 16;
    static constexpr uint32_t DECIMAL_DIGITS_ADVANCE = 10;
    static constexpr uint32_t FLAGS_OFFSET = 12;
    static constexpr uint32_t PREFILTER_OFFSET = 16;
    static constexpr uint32_t OP_START_OFFSET = 20;
    static constexpr uint32_t UNICODE_HEX_VALUE = 4;
    static constexpr uint32_t UNICODE_HEX_ADVANCE = 2;
    static constexpr uint32_t CAPTURE_CONUT_ADVANCE = 3;
    static constexpr uint32_t UTF8_CHAR_LEN_MAX = 6;
    static constexpr size_t SPARSE_HEAD_OFFSET = 3;
    static constexpr size_t SPARSE_OFF_OFFSET = 2;
    static constexpr size_t SPARSE_MAX_OFFSET = 6;

    // Defined out of line.
    static int Canonicalize(int c, bool isUnicode);

    // Stores the thread pointer and constructs both DynChunk buffers from `chunk`.
    // All remaining members take the in-class defaults declared at the bottom of the class.
    explicit RegExpParser(JSThread *thread, Chunk *chunk)
        : thread_(thread),
          buffer_(chunk),
          groupNames_(chunk)
    {
    }

    // Resets the cursor and status fields via Clear().
    ~RegExpParser()
    {
        Clear();
    }

    // Project macros; by name they disable copying and moving of the parser.
    NO_COPY_SEMANTIC(RegExpParser);
    NO_MOVE_SEMANTIC(RegExpParser);

    // Points the cursor at `source` and records `flags`.
    // base_ and pc_ are set to the first byte, end_ to the LAST byte (source + length - 1).
    // Only these four fields are written; c0_, the counters and the error state are untouched.
    // NOTE(review): with length == 0 (or a null source) end_ is computed as one byte before
    // `source`. Advance() then reports KEY_EOF because pc_ > end_, but the pointer arithmetic
    // itself is outside the buffer -- confirm callers never pass an empty/null pattern.
    inline void Init(char *source, size_t length, uint32_t flags)
    {
        pc_ = reinterpret_cast<uint8_t *>(source);
        base_ = pc_;
        end_ = reinterpret_cast<uint8_t *>(source) + length - 1;
        flags_ = flags;
    }

    // Parsing entry point and per-construct helpers. All are defined out of line, so their
    // exact contracts are not visible from this header.
    void Parse();
    void ParseDisjunction(bool isBackward);
    void ParseAlternative(bool isBackward);
    bool ParseAssertionCapture(int *captureIndex, bool isBackward);
    void ParseQuantifier(size_t atomBcStart, int captureStart, int captureEnd);
    int ParseDecimalDigits();
    int ParseAtomEscape(bool isBackward);
    int ParseCharacterEscape();
    bool ParseGroupSpecifier(const uint8_t **pp, CString &name);
    int ParseCaptureCount(const char *groupName);
    bool ParseClassRanges(RangeSet *result);
    void ParseNonemptyClassRangesNoDash(DynChunk *buffer);
    uint32_t ParseClassAtom(RangeSet *atom);
    int ParseClassEscape(RangeSet *atom);
    void ParseError(const char *errorMessage);
    bool ParseUnicodePropertyValueCharacters(CString &categoryName, CString &valueName);
    int FindGroupName(const CString &name);
    uint32_t ParseOctalLiteral();
    bool ParseHexEscape(int length, uint32_t *value);
    bool ParseUnlimitedLengthHexNumber(uint32_t maxValue, uint32_t *value);
    bool ParseUnicodeEscape(uint32_t *value);
    bool ParserIntervalQuantifier(int *pmin, int *pmax);
    bool HasNamedCaptures();
    int ParseEscape(const uint8_t **pp, int isUtf16);
    int RecountCaptures();
    int IsIdentFirst(uint32_t c);
    bool NeedIntersection(uint32_t c);
    void DoParserStackOverflowCheck(const char *errorMessage);

    // Unicode property helpers (UProperty comes from ICU). Defined out of line.
    bool MatchUnicodeProperty(UProperty property, const char *propertyName, RangeSet *atom, bool negate);
    bool IsExactPropertyValueAlis(const char *valueName, UProperty property, int32_t propertyValue);
    bool ParseUnicodePropertyClassRange(CString &propertyName, CString &valueName, RangeSet *atom, bool negate);
    bool GetUnicodePropertyName(CString &propertyName);
    bool GetUnicodePropertyValueName(CString &valueName);
    bool IsExactPropertyAlias(const char *propertyName, UProperty property);
    bool MatchSepcialUnicodeProperty(CString &name, bool negate, RangeSet *atom);
    bool IsSupportedBinaryProperty(UProperty property);
    bool IsBinaryPropertyOfStrings(UProperty property);

    // Returns a copy of newGroupNames_.
    inline CVector<CString> GetGroupNames() const
    {
        return newGroupNames_;
    }

    // Returns the size_ field of the groupNames_ DynChunk. Note this reads a different
    // member than GetGroupNames(), which returns the newGroupNames_ vector.
    inline size_t GetGroupNamesSize() const
    {
        return groupNames_.size_;
    }

    // True once SetIsError() has been called (and Clear() has not run since).
    inline bool IsError() const
    {
        return isError_;
    }

    // Raw pointer to the storage held by buffer_.
    inline uint8_t *GetOriginBuffer() const
    {
        return buffer_.buf_;
    }

    // The size_ field of buffer_.
    inline size_t GetOriginBufferSize() const
    {
        return buffer_.size_;
    }

    // Returns the contents of errorMsg_ when the error flag is set, otherwise an empty string.
    inline CString GetErrorMsg() const
    {
        if (isError_) {
            return CString(errorMsg_);
        }
        return CString("");
    }

    // Flag accessors: each one tests a single FLAG_* bit of flags_.
    inline bool IsGlobal() const
    {
        return (flags_ & FLAG_GLOBAL) != 0;
    }

    inline bool IsIgnoreCase() const
    {
        return (flags_ & FLAG_IGNORECASE) != 0;
    }

    inline bool IsMultiline() const
    {
        return (flags_ & FLAG_MULTILINE) != 0;
    }

    inline bool IsDotAll() const
    {
        return (flags_ & FLAG_DOTALL) != 0;
    }

    inline bool IsUtf16() const
    {
        return (flags_ & FLAG_UTF16) != 0;
    }

    inline bool IsStick() const
    {
        return (flags_ & FLAG_STICKY) != 0;
    }

    // True for ASCII letters, ASCII digits and '_'; false for every other char value.
    inline bool IsUnicodePropertyValueCharacter(char c) const
    {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || (c == '_');
    }

    // Returns the other-case counterpart of `c`: first u_tolower(c); if that leaves the value
    // unchanged, u_toupper(c). If the mapped value is an ASCII letter while `c` itself is not
    // an ASCII letter, the mapping is discarded and `c` is returned unchanged.
    inline static int GetcurrentCharNext(int c)
    {
        int cur = c;
        c = u_tolower(static_cast<UChar32>(c));
        if (c == cur) {
            c = u_toupper(static_cast<UChar32>(c));
        }
        if (((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) &&
            !((cur >= 'A' && cur <= 'Z') || (cur >= 'a' && cur <= 'z'))) {
            c = cur;
        }
        return c;
    }

    // Builds a set from the two ASCII letter ranges, calls result->Inter(cr1, cr) and then
    // inserts cr1 into *result.
    // NOTE(review): presumably Inter() writes the intersection of *result and `cr` into cr1,
    // and the `+ 1` upper bounds match RangeSet::Insert's range convention -- confirm against
    // RangeSet, which is declared outside this header.
    inline static void ProcessIntersection(RangeSet *result)
    {
        RangeSet cr;
        RangeSet cr1;
        constexpr uint32_t MINLOWERCHAR = 'a';
        constexpr uint32_t MAXLOWERCHAR = 'z' + 1;
        constexpr uint32_t MINUPPERCHAR = 'A';
        constexpr uint32_t MAXUPPERCHAR = 'Z' + 1;
        cr.Insert(MINLOWERCHAR, MAXLOWERCHAR);
        cr.Insert(MINUPPERCHAR, MAXUPPERCHAR);
        result->Inter(cr1, cr);
        result->Insert(cr1);
    }

private:
    // RegExpExecutor is granted access to the private members below.
    friend class RegExpExecutor;

    // Capacity of errorMsg_.
    static constexpr int TMP_BUF_SIZE = 128;

    // Drops the cursor (base_/pc_/end_), resets c0_ to KEY_EOF and clears isError_/isEmpty_.
    // flags_, the counters, errorMsg_ and the buffers are left as they are.
    void Clear()
    {
        base_ = nullptr;
        pc_ = nullptr;
        end_ = nullptr;
        c0_ = KEY_EOF;
        isError_ = false;
        isEmpty_ = false;
    }

    // Loads the byte at pc_ into c0_ and moves pc_ forward by one; the stack-overflow check
    // runs before each byte is consumed. Once pc_ is past end_, c0_ becomes KEY_EOF and pc_
    // is not moved.
    void Advance()
    {
        if (pc_ <= end_) {
            DoParserStackOverflowCheck("Advance stack overflow!");
            c0_ = *pc_++;
        } else {
            c0_ = KEY_EOF;
        }
    }

    // Skips `offset - 1` bytes and then performs a normal Advance(). The skip itself is not
    // bounds-checked; if it moves pc_ past end_, the following Advance() yields KEY_EOF.
    void Advance(int offset)
    {
        pc_ += offset - 1;
        Advance();
    }

    // Loads the byte at pc_ into c0_ and then moves pc_ BACK by one; yields KEY_EOF once pc_
    // is below base_.
    // NOTE(review): after Advance() has consumed the last byte, pc_ == end_ + 1, so a Prev()
    // at that point reads the byte just past end_ -- confirm that byte is always readable
    // (e.g. a terminator) or that callers never invoke Prev() in that state.
    void Prev()
    {
        if (pc_ >= base_) {
            c0_ = *pc_--;
        } else {
            c0_ = KEY_EOF;
        }
    }

    // Raises the error flag reported by IsError()/GetErrorMsg().
    void SetIsError()
    {
        isError_ = true;
    }

    // printf-style variadic helper; defined out of line.
    void PrintF(const char *fmt, ...);

    // Member order is unchanged. Constant defaults are given here so the constructor only
    // has to set the members that depend on its arguments.
    JSThread *thread_;
    uint8_t *base_ = nullptr;   // first byte of the input (set by Init)
    uint8_t *pc_ = nullptr;     // cursor
    uint8_t *end_ = nullptr;    // last byte of the input (inclusive, set by Init)
    uint32_t flags_ = 0;        // FLAG_* bits
    uint32_t c0_ = KEY_EOF;     // current character, or KEY_EOF
    int captureCount_ = 0;
    int stackCount_ = 0;
    bool isError_ = false;
    bool isEmpty_ = false;
    char errorMsg_[TMP_BUF_SIZE] = {0};
    int hasNamedCaptures_ = -1;     // presumably -1 means "not computed yet" -- TODO confirm
    int totalCaptureCount_ = -1;    // presumably -1 means "not computed yet" -- TODO confirm
    DynChunk buffer_;
    DynChunk groupNames_;
    CVector<CString> newGroupNames_;
};
}
#endif