<?php
/*
 * This file is part of the league/commonmark package.
 *
 * (c) Colin O'Dell <colinodell@gmail.com>
 *
 * Original code based on the CommonMark JS reference parser (https://bitly.com/commonmark-js)
 *  - (c) John MacFarlane
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */
namespace League\CommonMark\Util;
use League\CommonMark\Block\Element\HtmlBlock;
/**
 * Provides regular expressions and utilities for parsing Markdown
 */
final class RegexHelper
{
// Identifiers accepted by the deprecated getPartialRegex() method. Each one
// selects the PARTIAL_* pattern constant that shares its name (for example,
// TAGNAME selects PARTIAL_TAGNAME and LINK_TITLE selects PARTIAL_LINK_TITLE).
const ESCAPABLE = 0;
const ESCAPED_CHAR = 1;
const IN_DOUBLE_QUOTES = 2;
const IN_SINGLE_QUOTES = 3;
const IN_PARENS = 4;
const REG_CHAR = 5;
const IN_PARENS_NOSP = 6;
const TAGNAME = 7;
const BLOCKTAGNAME = 8;
const ATTRIBUTENAME = 9;
const UNQUOTEDVALUE = 10;
const SINGLEQUOTEDVALUE = 11;
const DOUBLEQUOTEDVALUE = 12;
const ATTRIBUTEVALUE = 13;
const ATTRIBUTEVALUESPEC = 14;
const ATTRIBUTE = 15;
const OPENTAG = 16;
const CLOSETAG = 17;
const OPENBLOCKTAG = 18;
const CLOSEBLOCKTAG = 19;
const HTMLCOMMENT = 20;
const PROCESSINGINSTRUCTION = 21;
const DECLARATION = 22;
const CDATA = 23;
const HTMLTAG = 24;
const HTMLBLOCKOPEN = 25;
const LINK_TITLE = 26;
// Partial patterns: regex fragments without delimiters or flags. Wrap them in
// delimiters (e.g. '/' . self::PARTIAL_X . '/') before handing them to preg_*.

// A named, decimal or hexadecimal character reference such as &amp; &#35; &#x1F;
const PARTIAL_ENTITY = '&(?:#x[a-f0-9]{1,8}|#[0-9]{1,8}|[a-z][a-z0-9]{1,31});';

// Character class of the ASCII punctuation characters that may be backslash-escaped.
const PARTIAL_ESCAPABLE = '[!"#$%&\'()*+,.\/:;<=>?@[\\\\\]^_`{|}~-]';
// A literal backslash followed by one escapable character.
const PARTIAL_ESCAPED_CHAR = '\\\\' . self::PARTIAL_ESCAPABLE;

// Delimited runs ("...", '...', (...)) that may contain escaped characters but no NUL.
const PARTIAL_IN_DOUBLE_QUOTES = '"(' . self::PARTIAL_ESCAPED_CHAR . '|[^"\x00])*"';
const PARTIAL_IN_SINGLE_QUOTES = '\'(' . self::PARTIAL_ESCAPED_CHAR . '|[^\'\x00])*\'';
const PARTIAL_IN_PARENS = '\\((' . self::PARTIAL_ESCAPED_CHAR . '|[^)\x00])*\\)';

// Any character except a backslash, a parenthesis, or a byte in the range 0x00-0x20.
const PARTIAL_REG_CHAR = '[^\\\\()\x00-\x20]';
// A parenthesised run made only of regular characters, escaped characters and backslashes.
const PARTIAL_IN_PARENS_NOSP = '\((' . self::PARTIAL_REG_CHAR . '|' . self::PARTIAL_ESCAPED_CHAR . '|\\\\)*\)';

const PARTIAL_TAGNAME = '[A-Za-z][A-Za-z0-9-]*';
// Block-level tag names. Headings are written as h[1-6] so that this list agrees
// with the type-6 pattern built in getHtmlBlockOpenRegex(); it previously listed
// only h1, which made the *BLOCKTAG / HTMLBLOCKOPEN partials miss h2-h6.
const PARTIAL_BLOCKTAGNAME = '(?:address|article|aside|base|basefont|blockquote|body|caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|fieldset|figcaption|figure|footer|form|frame|frameset|h[1-6]|head|header|hr|html|iframe|legend|li|link|main|menu|menuitem|meta|nav|noframes|ol|optgroup|option|p|param|section|source|summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)';

// HTML attribute pieces: name, the three value forms, "= value", and a full attribute.
const PARTIAL_ATTRIBUTENAME = '[a-zA-Z_:][a-zA-Z0-9:._-]*';
const PARTIAL_UNQUOTEDVALUE = '[^"\'=<>`\x00-\x20]+';
const PARTIAL_SINGLEQUOTEDVALUE = '\'[^\']*\'';
const PARTIAL_DOUBLEQUOTEDVALUE = '"[^"]*"';
const PARTIAL_ATTRIBUTEVALUE = '(?:' . self::PARTIAL_UNQUOTEDVALUE . '|' . self::PARTIAL_SINGLEQUOTEDVALUE . '|' . self::PARTIAL_DOUBLEQUOTEDVALUE . ')';
const PARTIAL_ATTRIBUTEVALUESPEC = '(?:' . '\s*=' . '\s*' . self::PARTIAL_ATTRIBUTEVALUE . ')';
const PARTIAL_ATTRIBUTE = '(?:' . '\s+' . self::PARTIAL_ATTRIBUTENAME . self::PARTIAL_ATTRIBUTEVALUESPEC . '?)';

// Opening and closing tags, for any tag name and for block-level tag names only.
const PARTIAL_OPENTAG = '<' . self::PARTIAL_TAGNAME . self::PARTIAL_ATTRIBUTE . '*' . '\s*\/?>';
const PARTIAL_CLOSETAG = '<\/' . self::PARTIAL_TAGNAME . '\s*[>]';
const PARTIAL_OPENBLOCKTAG = '<' . self::PARTIAL_BLOCKTAGNAME . self::PARTIAL_ATTRIBUTE . '*' . '\s*\/?>';
const PARTIAL_CLOSEBLOCKTAG = '<\/' . self::PARTIAL_BLOCKTAGNAME . '\s*[>]';

// Other raw HTML constructs. PARTIAL_HTMLCOMMENT is a bare top-level alternation,
// so group it before prefixing or suffixing anything to it.
const PARTIAL_HTMLCOMMENT = '<!---->|<!--(?:-?[^>-])(?:-?[^-])*-->';
const PARTIAL_PROCESSINGINSTRUCTION = '[<][?].*?[?][>]';
const PARTIAL_DECLARATION = '<![A-Z]+' . '\s+[^>]*>';
const PARTIAL_CDATA = '<!\[CDATA\[[\s\S]*?]\]>';

// Any single inline HTML construct from the partials above.
const PARTIAL_HTMLTAG = '(?:' . self::PARTIAL_OPENTAG . '|' . self::PARTIAL_CLOSETAG . '|' . self::PARTIAL_HTMLCOMMENT . '|' .
    self::PARTIAL_PROCESSINGINSTRUCTION . '|' . self::PARTIAL_DECLARATION . '|' . self::PARTIAL_CDATA . ')';

// "<" followed by a block tag name, "/" plus a block tag name, or "?" / "!".
const PARTIAL_HTMLBLOCKOPEN = '<(?:' . self::PARTIAL_BLOCKTAGNAME . '(?:[\s\/>]|$)' . '|' .
    '\/' . self::PARTIAL_BLOCKTAGNAME . '(?:[\s>]|$)' . '|' . '[?!])';

// A link title in double quotes, single quotes or parentheses.
// Unlike the other partials, this one already carries its own leading "^" anchor.
const PARTIAL_LINK_TITLE = '^(?:"(' . self::PARTIAL_ESCAPED_CHAR . '|[^"\x00])*"' .
    '|' . '\'(' . self::PARTIAL_ESCAPED_CHAR . '|[^\'\x00])*\'' .
    '|' . '\((' . self::PARTIAL_ESCAPED_CHAR . '|[^()\x00])*\))';
// Despite the REGEX_ prefix, these two are plain aliases of the partial
// patterns: they carry no delimiters and must be wrapped before use.
const REGEX_ESCAPABLE = self::PARTIAL_ESCAPABLE;
const REGEX_ENTITY = self::PARTIAL_ENTITY;
// The remaining REGEX_* constants are complete patterns (delimiters and flags included).
// A punctuation character (ASCII or the listed Unicode ranges/categories) at the start of the subject.
const REGEX_PUNCTUATION = '/^[\x{2000}-\x{206F}\x{2E00}-\x{2E7F}\p{Pc}\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Po}\p{Ps}\\\\\'!"#\$%&\(\)\*\+,\-\.\\/:;<=>\?@\[\]\^_`\{\|\}~]/u';
// NOTE(review): the alternation is not grouped, so "^" binds only to "javascript:";
// "vbscript:", "file:" and "data:" match anywhere in the subject. Confirm this is intended.
const REGEX_UNSAFE_PROTOCOL = '/^javascript:|vbscript:|file:|data:/i';
// data: URLs for png/gif/jpeg/webp images; isLinkPotentiallyUnsafe() treats these as safe.
const REGEX_SAFE_DATA_PROTOCOL = '/^data:image\/(?:png|gif|jpeg|webp)/i';
// Any character that is not a space, tab, form feed, vertical tab, CR or LF.
const REGEX_NON_SPACE = '/[^ \t\f\v\r\n]/';
// A single ASCII whitespace character at the start of the subject.
const REGEX_WHITESPACE_CHAR = '/^[ \t\n\x0b\x0c\x0d]/';
// A run of one or more ASCII whitespace characters anywhere in the subject.
const REGEX_WHITESPACE = '/[ \t\n\x0b\x0c\x0d]+/';
// NOTE(review): as above, "^" applies only to \pZ; the \s alternative matches at
// any position. Equivalent to an anchored test only for single-character subjects.
const REGEX_UNICODE_WHITESPACE_CHAR = '/^\pZ|\s/u';
// A whole line made of three or more "*", "_" or "-" (one kind only), each
// optionally followed by spaces or tabs.
const REGEX_THEMATIC_BREAK = '/^(?:(?:\*[ \t]*){3,}|(?:_[ \t]*){3,}|(?:-[ \t]*){3,})[ \t]*$/';
// A <...> link destination at the start of the subject: no space, tab, newline,
// NUL, "<", ">" or bare backslash inside, but backslash-escaped characters are allowed.
const REGEX_LINK_DESTINATION_BRACES = '/^(?:<(?:[^ <>\\t\\n\\\\\\x00]|\\\\.)*>)/';
/**
 * @var RegexHelper|null
 *
 * @deprecated Instance methods will be removed in 0.18 or 1.0 (whichever comes first)
 */
protected static $instance; // shared instance; stays null until getInstance() creates it
/**
 * Returns the shared RegexHelper instance, creating it on first use
 *
 * @return RegexHelper
 *
 * @deprecated Instances are no longer needed and will be removed in 0.18 or 1.0
 */
public static function getInstance()
{
    // Every call raises the (silenced) deprecation notice, not just the first.
    @trigger_error('RegexHelper no longer uses the singleton pattern. Directly grab the REGEX_ or PARTIAL_ constant you need instead.', E_USER_DEPRECATED);

    // Reuse the instance created by an earlier call, if any.
    if (null !== self::$instance) {
        return self::$instance;
    }

    // First call: create the instance, remember it and hand it back.
    return self::$instance = new self();
}
/**
 * Checks the given character against the PARTIAL_ESCAPABLE character class
 *
 * @param string|null $character
 *
 * @return bool
 */
public static function isEscapable($character)
{
    // null is never escapable; the && short-circuits so preg_match() is not
    // called with null. The pattern is unanchored, so a longer string counts
    // as escapable when any character in it belongs to the class.
    return $character !== null
        && preg_match('/' . self::PARTIAL_ESCAPABLE . '/', $character) === 1;
}
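/**
 * Returns the partial regex selected by one of this class's integer identifiers
 *
 * The result must be wrapped with /.../ before use
 *
 * @param int $const
 *
 * @return string|null The partial pattern, or null if the identifier is not recognised
 *
 * @deprecated Just grab the constant directly
 */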
public function getPartialRegex($const)
{
    @trigger_error('RegexHelper no longer supports the getPartialRegex() function. Directly grab the PARTIAL_ constant you need instead.', E_USER_DEPRECATED);

    // Identifier => partial pattern, listed in ascending identifier order.
    $partials = [
        self::ESCAPABLE             => self::PARTIAL_ESCAPABLE,
        self::ESCAPED_CHAR          => self::PARTIAL_ESCAPED_CHAR,
        self::IN_DOUBLE_QUOTES      => self::PARTIAL_IN_DOUBLE_QUOTES,
        self::IN_SINGLE_QUOTES      => self::PARTIAL_IN_SINGLE_QUOTES,
        self::IN_PARENS             => self::PARTIAL_IN_PARENS,
        self::REG_CHAR              => self::PARTIAL_REG_CHAR,
        self::IN_PARENS_NOSP        => self::PARTIAL_IN_PARENS_NOSP,
        self::TAGNAME               => self::PARTIAL_TAGNAME,
        self::BLOCKTAGNAME          => self::PARTIAL_BLOCKTAGNAME,
        self::ATTRIBUTENAME         => self::PARTIAL_ATTRIBUTENAME,
        self::UNQUOTEDVALUE         => self::PARTIAL_UNQUOTEDVALUE,
        self::SINGLEQUOTEDVALUE     => self::PARTIAL_SINGLEQUOTEDVALUE,
        self::DOUBLEQUOTEDVALUE     => self::PARTIAL_DOUBLEQUOTEDVALUE,
        self::ATTRIBUTEVALUE        => self::PARTIAL_ATTRIBUTEVALUE,
        self::ATTRIBUTEVALUESPEC    => self::PARTIAL_ATTRIBUTEVALUESPEC,
        self::ATTRIBUTE             => self::PARTIAL_ATTRIBUTE,
        self::OPENTAG               => self::PARTIAL_OPENTAG,
        self::CLOSETAG              => self::PARTIAL_CLOSETAG,
        self::OPENBLOCKTAG          => self::PARTIAL_OPENBLOCKTAG,
        self::CLOSEBLOCKTAG         => self::PARTIAL_CLOSEBLOCKTAG,
        self::HTMLCOMMENT           => self::PARTIAL_HTMLCOMMENT,
        self::PROCESSINGINSTRUCTION => self::PARTIAL_PROCESSINGINSTRUCTION,
        self::DECLARATION           => self::PARTIAL_DECLARATION,
        self::CDATA                 => self::PARTIAL_CDATA,
        self::HTMLTAG               => self::PARTIAL_HTMLTAG,
        self::HTMLBLOCKOPEN         => self::PARTIAL_HTMLBLOCKOPEN,
        self::LINK_TITLE            => self::PARTIAL_LINK_TITLE,
    ];

    // Scan in order with a loose (==) comparison on purpose: that is how a
    // switch statement matches its cases, so non-integer arguments resolve
    // exactly as they would in an equivalent switch.
    foreach ($partials as $identifier => $partial) {
        if ($const == $identifier) {
            return $partial;
        }
    }

    // Unrecognised identifier.
    return null;
}
/**
 * Returns PARTIAL_HTMLTAG as a complete, start-anchored, case-insensitive regex
 *
 * @return string
 *
 * @deprecated Use PARTIAL_HTMLTAG and wrap it yourself instead
 */
public function getHtmlTagRegex()
{
    // Raise the (silenced) deprecation notice.
    $notice = 'RegexHelper::getHtmlTagRegex() has been deprecated. Use the RegexHelper::PARTIAL_HTMLTAG constant instead.';
    @trigger_error($notice, E_USER_DEPRECATED);

    // Anchor the partial at the start of the subject and make it case-insensitive.
    $pattern = '/^' . self::PARTIAL_HTMLTAG . '/i';

    return $pattern;
}
/**
 * Returns PARTIAL_LINK_TITLE wrapped in regex delimiters
 *
 * @return string
 *
 * @deprecated Use PARTIAL_LINK_TITLE and wrap it yourself instead
 */
public function getLinkTitleRegex()
{
    // Raise the (silenced) deprecation notice.
    $notice = 'RegexHelper::getLinkTitleRegex() has been deprecated. Use the RegexHelper::PARTIAL_LINK_TITLE constant instead.';
    @trigger_error($notice, E_USER_DEPRECATED);

    // Only delimiters are added; the partial already begins with its own "^" anchor.
    $pattern = '/' . self::PARTIAL_LINK_TITLE . '/';

    return $pattern;
}
/**
 * Returns the REGEX_LINK_DESTINATION_BRACES pattern
 *
 * @return string
 *
 * @deprecated Use REGEX_LINK_DESTINATION_BRACES instead
 */
public function getLinkDestinationBracesRegex()
{
    // Raise the (silenced) deprecation notice.
    $notice = 'RegexHelper::getLinkDestinationBracesRegex() has been deprecated. Use the RegexHelper::REGEX_LINK_DESTINATION_BRACES constant instead.';
    @trigger_error($notice, E_USER_DEPRECATED);

    // The constant is already a complete pattern, so it is returned untouched.
    $pattern = self::REGEX_LINK_DESTINATION_BRACES;

    return $pattern;
}
/**
 * Returns the REGEX_THEMATIC_BREAK pattern
 *
 * @return string
 *
 * @deprecated Use the REGEX_THEMATIC_BREAK constant directly
 */
public function getThematicBreakRegex()
{
    // Raise the (silenced) deprecation notice.
    $notice = 'RegexHelper::getThematicBreakRegex() has been deprecated. Use the RegexHelper::REGEX_THEMATIC_BREAK constant instead.';
    @trigger_error($notice, E_USER_DEPRECATED);

    // The constant is already a complete pattern, so it is returned untouched.
    $pattern = self::REGEX_THEMATIC_BREAK;

    return $pattern;
}
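/**
 * Attempt to match a regex in $string, starting at character offset $offset
 *
 * @param string $regex
 * @param string $string
 * @param int    $offset Offset in characters (not bytes) at which to start searching
 *
 * @return int|null Character index of the match within $string, or null if there is no match
 */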
public static function matchAt($regex, $string, $offset = 0)
{
    // Search only the part of the string after the first $offset characters.
    $remainder = mb_substr($string, $offset, null, 'utf-8');

    $found = [];
    if (preg_match($regex, $remainder, $found, PREG_OFFSET_CAPTURE)) {
        // preg_match() reports where the match starts in bytes. Cut the
        // remainder at that byte position and count the characters before it.
        $bytePosition = $found[0][1];
        $leadingText = mb_strcut($remainder, 0, $bytePosition, 'utf-8');

        // Translate back into a character index within the original string.
        return $offset + mb_strlen($leadingText, 'utf-8');
    }

    // No match (or preg_match() failed).
    return null;
}
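/**
 * Functional wrapper around preg_match_all
 *
 * @param string $pattern
 * @param string $subject
 * @param int    $offset Offset in bytes at which to start searching
 *
 * @return array|null The matches in PREG_PATTERN_ORDER layout (each group reduced to a single
 *                    string when the pattern matched exactly once), or null if nothing matched
 */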
public static function matchAll($pattern, $subject, $offset = 0)
{
    // Note that substr() makes $offset a byte offset here.
    $haystack = substr($subject, $offset);
    preg_match_all($pattern, $haystack, $groups, PREG_PATTERN_ORDER);

    // In PREG_PATTERN_ORDER the first entry holds every full-pattern match.
    $wholeMatches = reset($groups);
    if (empty($wholeMatches)) {
        return null;
    }

    // Several matches: hand back the per-group lists unchanged.
    if (count($wholeMatches) !== 1) {
        return $groups;
    }

    // Exactly one match: collapse each group's one-element list to its value.
    // array_map() over a single array keeps the group keys intact.
    return array_map(function ($group) {
        return reset($group);
    }, $groups);
}
/**
 * Replace backslash escapes and HTML entities with literal characters
 *
 * @param string $string
 *
 * @return string
 */
public static function unescape($string)
{
    // Step 1: drop the backslash in front of every escapable character.
    $withoutBackslashes = preg_replace('/\\\\(' . self::PARTIAL_ESCAPABLE . ')/', '$1', $string);

    // Step 2: pass each entity reference (matched case-insensitively) through
    // Html5Entities::decodeEntity() and substitute whatever it returns.
    return preg_replace_callback(
        '/' . self::PARTIAL_ENTITY . '/i',
        function ($entity) {
            return Html5Entities::decodeEntity($entity[0]);
        },
        $withoutBackslashes
    );
}
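/**
 * Returns the regex that detects the opening line of the given HTML block type
 *
 * @param int $type HTML block type
 *
 * @return string|null The pattern, or null if the type is not recognised
 */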
public static function getHtmlBlockOpenRegex($type)
{
    // Stays null when $type is none of the HtmlBlock types handled below.
    $regex = null;

    switch ($type) {
        case HtmlBlock::TYPE_1_CODE_CONTAINER:
            // <script, <pre or <style followed by whitespace, ">" or end of line.
            $regex = '/^<(?:script|pre|style)(?:\s|>|$)/i';
            break;

        case HtmlBlock::TYPE_2_COMMENT:
            $regex = '/^<!--/';
            break;

        case HtmlBlock::TYPE_3:
            // Processing instruction: "<?"
            $regex = '/^<[?]/';
            break;

        case HtmlBlock::TYPE_4:
            // Declaration: "<!" followed by an uppercase letter.
            $regex = '/^<![A-Z]/';
            break;

        case HtmlBlock::TYPE_5_CDATA:
            $regex = '/^<!\[CDATA\[/';
            break;

        case HtmlBlock::TYPE_6_BLOCK_ELEMENT:
            // Opening or closing tag of a known block-level element.
            $regex = '%^<[/]?(?:address|article|aside|base|basefont|blockquote|body|caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|fieldset|figcaption|figure|footer|form|frame|frameset|h[123456]|head|header|hr|html|iframe|legend|li|link|main|menu|menuitem|meta|nav|noframes|ol|optgroup|option|p|param|section|source|title|summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)(?:\s|[/]?[>]|$)%i';
            break;

        case HtmlBlock::TYPE_7_MISC_ELEMENT:
            // Any single complete open or close tag alone on the line.
            $regex = '/^(?:' . self::PARTIAL_OPENTAG . '|' . self::PARTIAL_CLOSETAG . ')\\s*$/i';
            break;
    }

    return $regex;
}
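/**
 * Returns the regex that detects the end of the given HTML block type
 *
 * @param int $type HTML block type
 *
 * @return string|null The pattern, or null if no closing pattern is defined for the type
 */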
public static function getHtmlBlockCloseRegex($type)
{
    // Only the five types below have a closing pattern; for anything else
    // (including the block-element and misc-element types) this stays null.
    $regex = null;

    switch ($type) {
        case HtmlBlock::TYPE_1_CODE_CONTAINER:
            // </script>, </pre> or </style>, in any letter case.
            $regex = '%<\/(?:script|pre|style)>%i';
            break;

        case HtmlBlock::TYPE_2_COMMENT:
            $regex = '/-->/';
            break;

        case HtmlBlock::TYPE_3:
            $regex = '/\?>/';
            break;

        case HtmlBlock::TYPE_4:
            $regex = '/>/';
            break;

        case HtmlBlock::TYPE_5_CDATA:
            $regex = '/\]\]>/';
            break;
    }

    return $regex;
}
/**
 * Checks whether a URL matches an unsafe protocol and is not a whitelisted image data: URL
 *
 * @param string $url
 *
 * @return bool
 */
public static function isLinkPotentiallyUnsafe($url)
{
    // No unsafe-protocol match at all: the link is not flagged.
    if (preg_match(self::REGEX_UNSAFE_PROTOCOL, $url) === 0) {
        return false;
    }

    // An unsafe protocol matched. The link is still acceptable when it is one
    // of the whitelisted image data: URLs; otherwise it is flagged.
    return preg_match(self::REGEX_SAFE_DATA_PROTOCOL, $url) === 0;
}
}