/*
* Copyright 2011 kubtek <kubtek@mail.com>
*
* This file is part of StarDict.
*
* StarDict is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* StarDict is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with StarDict. If not, see <http://www.gnu.org/licenses/>.
*/
#include <glib.h>
#include "WIKI2XML.h"
// A freshly created table record has neither a row nor a cell open.
TTableInfo::TTableInfo ()
{
    tr_open = td_open = false ;
}
// Returns the markup that finishes the table: the closing tag of a cell that
// is still open, then of a row that is still open, then of the table itself.
// The open/closed flags are not modified.
string TTableInfo::close ()
{
    string closing ;
    if ( td_open ) closing.append ( "</wikitablecell>" ) ;
    if ( tr_open ) closing.append ( "</wikitablerow>" ) ;
    closing.append ( "</wikitable>" ) ;
    return closing ;
}
// Starts a new row and returns the markup for it. A cell and/or row that is
// still open is closed first. Afterwards a row is open and no cell is open.
string TTableInfo::new_row ()
{
    string markup ;
    if ( td_open ) markup = "</wikitablecell>" ;
    if ( tr_open ) markup += "</wikitablerow>" ;
    markup += "<wikitablerow>" ;

    tr_open = true ;
    td_open = false ;
    return markup ;
}
// Starts a new cell and returns the markup for it.
//   type : stored in td_type and written, after upper(), into the "type"
//          attribute of the opening tag.
// A row is opened first when none is open; a cell that is still open is
// closed before the new one starts.
string TTableInfo::new_cell ( string type )
{
    string markup ;
    if ( !tr_open ) markup = new_row () ;
    if ( td_open ) markup += "</wikitablecell>" ;

    markup += "<wikitablecell type=\"" ;
    markup += upper ( type ) ;
    markup += "\">" ;

    td_open = true ;
    td_type = type ;
    return markup ;
}
// Replaces one "s1 ... s2" delimited span that starts at l[from].
//   s1 / s2 : opening / closing marker to look for
//   r1 / r2 : text substituted for the opening / closing marker
//   extend  : when true and submatch() also reports s2 at the positions
//             directly following the first hit, the last of those
//             consecutive positions is taken as the closing marker
// Nothing happens when s1 is not found at `from` or no s2 follows it.
// `from` itself is never modified.
void WIKI2XML::parse_symmetric ( string &l , size_t &from ,
                                 string s1 , string s2 ,
                                 string r1 , string r2 ,
                                 bool extend )
{
    if ( !submatch ( l , s1 , from ) ) return ;

    // Look for the first occurrence of the closing marker after the opener.
    int close_pos = from + s1.length() ;
    while ( close_pos + s2.length() <= l.length() &&
            !submatch ( l , s2 , close_pos ) )
        close_pos++ ;
    if ( close_pos + s2.length() > l.length() ) return ; // never closed

    // Optionally slide to the last of a run of consecutive matches.
    int last_close = close_pos ;
    if ( extend )
    {
        while ( submatch ( l , s2 , last_close + 1 ) ) last_close++ ;
    }

    string rebuilt = l.substr ( 0 , from ) ;
    rebuilt += r1 ;
    rebuilt += l.substr ( from + s1.length() , last_close - from - s1.length() ) ;
    rebuilt += r2 ;
    rebuilt += l.substr ( last_close + s2.length() , l.length() ) ;
    l = rebuilt ;

    if ( debug ) cout << "newl : " << l << endl ;
}
// Rewrites, in place, the double-bracket construct whose first bracket is at
// l[from] into the string produced by a TXML element.
//   mode 'L' : "[[...]]", element name "wikilink", attribute type="internal"
//   mode 'T' : "{{...}}", element name "wikitemplate"
// The text between the brackets is split on '|'; every piece is passed to
// xml_embed() with the tag name "wikiparameter".
// On success `from` ends on the last character of the inserted replacement.
// When no closing pair is found this construct is not replaced and `from`
// has merely advanced by one (nested constructs met during the scan may
// already have been rewritten).
void WIKI2XML::parse_link ( string &l , size_t &from , char mode )
{
// Step onto the second opening bracket.
from += 1 ;
size_t a , cnt = 1 ;
chart par_open = '[' ;
chart par_close = ']' ;
if ( mode == 'T' ) { par_open = '{' ; par_close = '}' ; }
// Scan for the closing pair; a nested opening pair is handled recursively,
// which also moves `a` past whatever the nested call inserted.
for ( a = from ; cnt > 0 && a+1 < l.length() ; a++ )
{
if ( l[a] == par_open && l[a+1] == par_open )
// NOTE(review): `mode` is not forwarded, so the nested call runs with the
// default argument declared in the header (not visible here). If that
// default is 'L', a "{{" nested inside a template is scanned for "]]" -
// confirm whether `mode` should be passed on.
parse_link ( l , a ) ;
else if ( l[a] == par_close && l[a+1] == par_close )
cnt-- ;
}
// No closing pair before the end of the line: leave this construct alone.
if ( cnt > 0 ) return ;
// The loop incremented `a` once more after the hit, so `to` is the index of
// the first closing bracket.
int to = a-1 ;
// Text strictly between the opening and the closing pair.
string link = l.substr ( from+1 , to-from-1 ) ;
TXML x ;
vector <string> parts ;
explode ( '|' , link , parts ) ;
if ( mode == 'L' )
{
x.name = "wikilink" ;
x.add_key_value ( "type" , "internal" ) ;
}
else if ( mode == 'T' ) x.name = "wikitemplate" ;
for ( a = 0 ; a < parts.size() ; a++ )
{
bool last = ( a + 1 == parts.size() ) ;
string p = parts[a] ;
// Inline markup inside the piece is converted before it is escaped below.
parse_line_sub ( p ) ;
// Pieces after the first are split on '=' into key and value, except the
// last piece of a link (mode 'L'), which is kept as a plain value.
if ( a > 0 && ( mode != 'L' || !last ) )
{
string key , value ;
vector <string> subparts ;
explode ( '=' , p , subparts ) ;
if ( subparts.size() == 1 )
{
// No '=' split happened: the whole piece is the value.
// g_markup_escape_text returns a buffer that is released with g_free.
char *str = g_markup_escape_text(p.c_str(), p.length());
value = xml_embed ( str , "value" ) ;
g_free(str);
}
else
{
// First sub-part is the key (embedded without escaping); the remaining
// sub-parts are joined with '=' again and form the escaped value.
key = xml_embed ( subparts[0] , "key" ) ;
subparts.erase ( subparts.begin() ) ;
string itmp = implode ( "=" , subparts );
char *str = g_markup_escape_text(itmp.c_str(), itmp.length());
value = xml_embed ( str , "value" ) ;
g_free(str);
}
p = key + value ;
}
else {
char *str = g_markup_escape_text(p.c_str(), p.length());
p = xml_embed ( str , "value" ) ;
g_free(str);
}
// Each piece carries its zero-based position; the final one is flagged.
string param = "number=\"" + val ( a ) + "\"" ;
if ( last ) param += " last=\"1\"" ;
x.text += xml_embed ( p , "wikiparameter" , param ) ;
}
if ( mode == 'L' )
{
// Characters accepted by is_text_char() directly after the closing pair
// are collected as a "trail" and become part of the replaced range.
string trail ;
for ( a = to+2 ; a < l.length() && is_text_char ( l[a] ) ; a++ )
trail += l[a] ;
// Re-anchor `to` so that to+1 is the last character to be removed.
to = a-2 ;
if ( trail != "" ) x.text += xml_embed ( trail , "trail" ) ;
}
x.add_key_value ( "parameters" , val ( parts.size() ) ) ;
string replacement = x.get_string () ;
parse_line_sub ( replacement ) ;
// Remove everything from the first opening bracket (from-1) through to+1
// and put the generated string in its place.
l.erase ( from-1 , to-from+3 ) ;
l.insert ( from-1 , replacement ) ;
if ( debug ) cout << "Link : " << link << endl << "Replacement : " << replacement << endl ;
if ( debug ) cout << "Result : " << l << endl << endl ;
// (from-1) + length - 1 : index of the last character of the replacement.
from = from + replacement.length() - 2 ;
}
// True when c is one of the characters that introduce a list line:
// '*', '#' or ':'.
bool WIKI2XML::is_list_char ( chart c )
{
    return c == '*' || c == '#' || c == ':' ;
}
// Returns the tag pair that opens or closes one list level for marker c.
//   '*' -> ul/li      '#' -> ol/li      ':' -> dl/dd
//   open == true  : "<container><item>"
//   open == false : "</item></container>"
// Any other character yields an empty string.
string WIKI2XML::get_list_tag ( chart c , bool open )
{
    if ( debug ) cout << "get_list_tag : " << c << endl ;

    string container ;
    string item = "li" ;
    if ( c == '*' ) container = "ul" ;
    else if ( c == '#' ) container = "ol" ;
    else if ( c == ':' )
    {
        container = "dl" ;
        item = "dd" ;
    }
    else return container ; // not a list marker: empty result

    if ( open ) return "<" + container + "><" + item + ">" ;
    return "</" + item + "></" + container + ">" ;
}
// Strips the leading run of list markers (and the spaces after it) from l and
// returns the tags needed to get from the list state remembered in the member
// `list` to the state described by that marker run. Levels of the old state
// beyond the common prefix are closed innermost first, then the new levels
// are opened. The member `list` is updated to the new marker run.
string WIKI2XML::fix_list ( string &l )
{
    // Length of the leading marker run.
    size_t marker_len = 0 ;
    while ( marker_len < l.length() && is_list_char ( l[marker_len] ) ) marker_len++ ;

    string newlist ;
    if ( marker_len > 0 )
    {
        newlist = left ( l , marker_len ) ;
        size_t text_start = marker_len ;
        while ( text_start < l.length() && l[text_start] == ' ' ) text_start++ ;
        l = l.substr ( text_start , l.length() ) ;
    }
    if ( debug ) cout << "fix_list : " << l << endl ;

    // Neither inside a list before nor now: nothing to emit.
    if ( list == "" && newlist == "" ) return string () ;

    // Number of leading levels the old and the new state have in common.
    size_t shared = 0 ;
    while ( shared < list.length() &&
            shared < newlist.length() &&
            list[shared] == newlist[shared] )
        shared++ ;

    string pre ;
    size_t level ;
    // Prepending while walking outwards-in yields innermost-first closing tags.
    for ( level = shared ; level < list.length() ; level++ )
        pre.insert ( 0 , get_list_tag ( list[level] , false ) ) ;
    for ( level = shared ; level < newlist.length() ; level++ )
        pre += get_list_tag ( newlist[level] , true ) ;

    if ( debug ) cout << "pre : " << pre << endl ;
    if ( debug ) cout << "newlist : " << newlist << endl ;
    list = newlist ;
    return pre ;
}
// Converts one line of wiki text to XML, in place.
// List markers are handled first (fix_list); the rest of the line is then
// classified as empty, horizontal rule, heading, preformatted text, table
// syntax or ordinary text. Whatever remains in l afterwards is run through
// parse_line_sub(), and the markup collected in `pre` is put in front of it.
void WIKI2XML::parse_line ( string &l )
{
    size_t a ;
    if ( debug ) cout << l << endl ;

    string pre ;
    string oldlist = list ;
    pre += fix_list ( l ) ;

    // Same list state as the previous line: close the previous item and
    // open a new one of the same kind.
    if ( list != "" && list == oldlist )
    {
        string itemname = "li" ;
        if ( right ( list , 1 ) == ":" ) itemname = "dd" ;
        pre = "</" + itemname + "><" + itemname + ">" + pre ;
    }

    if ( l == "" )
    {
        l = "<p/>" ;
    }
    else if ( left ( l , 4 ) == "----" )
    {
        // Drop the whole run of dashes; text after it stays in l.
        for ( a = 0 ; a < l.length() && l[a] == l[0] ; a++ ) ;
        pre += "<wikiurlcounter action=\"reset\"/><hr/>" ;
        l = l.substr ( a , l.length() - a ) ;
    }
    else if ( l[0] == '=' )
    {
        // Count the '=' characters that are matched at both ends of the line.
        for ( a = 0 ; a < l.length() && l[a] == '=' && l[l.length()-a-1] == '=' ; a++ ) ;
        string h = "h0" ;
        if ( a >= l.length() ) h = "" ;          // line consists of '=' only
        else if ( a < 1 || a > 9 ) h = "" ;      // no usable heading level
        if ( h != "" )
        {
            l = l.substr ( a , l.length() - a*2 ) ;
            h[1] += a ;                          // "h0" -> "h1" .. "h9"
            l = xml_embed ( l , h ) ;
        }
    }
    else if ( l[0] == ' ' )
    {
        // Leading blanks mark preformatted text; it is emitted as is and
        // not passed to parse_line_sub().
        for ( a = 0 ; a < l.length() && l[a] == ' ' ; a++ ) ;
        l = l.substr ( a , l.length() ) ;
        if ( l != "" )
        {
            pre += "<pre>" + l + "</pre>" ;
            l = "" ;
        }
    }
    else if ( left ( l , 2 ) == "{|" ||
              ( tables.size() > 0 && ( l[0] == '|' || l[0] == '!' ) ) )
    {
        // Only "{|" may reach table_markup() without an open table: every
        // other table line makes it access the innermost open table, so a
        // stray "|}" outside a table is now treated as ordinary text.
        pre += table_markup ( l ) ;
        l = "" ;
    }

    if ( l != "" ) parse_line_sub ( l ) ;
    if ( pre != "" ) l = pre + l ;
}
// True when protocol is exactly "HTTP", "FTP" or "MAILTO"; the comparison is
// case-sensitive, so callers pass an upper-cased name.
bool WIKI2XML::is_external_link_protocol ( string protocol )
{
    return protocol == "HTTP" ||
           protocol == "FTP" ||
           protocol == "MAILTO" ;
}
// Returns the index of the first character at or after `from` that cannot be
// part of a URL, or l.length() when the URL runs to the end of the line.
// Accepted characters are ':', '/', '.', the digits '0'-'9' and anything
// is_text_char() accepts.
int WIKI2XML::scan_url ( string &l , size_t from )
{
    size_t pos = from ;
    while ( pos < l.length() )
    {
        const char c = l[pos] ;
        const bool url_punct = ( c == ':' || c == '/' || c == '.' ) ;
        const bool digit = ( c >= '0' && c <= '9' ) ;
        if ( !url_punct && !digit && !is_text_char ( l[pos] ) ) break ;
        pos++ ;
    }
    return pos ;
}
// Handles a bare URL. `from` is the index of the ':' of a "://" sequence.
// The text characters directly before it are taken as the protocol name; if
// that name is accepted by is_external_link_protocol() the URL is replaced by
// the results of xml_embed() for "url" and "title" (both given the URL), and
// `from` is moved to the last character of the replacement.
void WIKI2XML::parse_external_freelink ( string &l , size_t &from )
{
    // Walk back over the protocol name.
    int start = from - 1 ;
    while ( start >= 0 && is_text_char ( l[start] ) ) start-- ;
    // NOTE(review): this also returns when the protocol name begins at
    // column 0, so a URL at the very start of the line is left untouched -
    // confirm that this is intended.
    if ( start == -1 ) return ;
    start++ ;

    string protocol = upper ( l.substr ( start , from - start ) ) ;
    if ( debug ) cout << "protocol : " << protocol << endl ;
    if ( !is_external_link_protocol ( protocol ) ) return ;

    int to = scan_url ( l , start ) ;
    string url = l.substr ( start , to - start ) ;

    string replacement = xml_embed ( url , "url" ) ;
    replacement += xml_embed ( url , "title" ) ;

    l = left ( l , start ) + replacement + l.substr ( to , l.length() - to ) ;
    from = start + replacement.length() - 1 ;
}
// Handles a single-bracket external link "[url optional title]" whose '[' is
// at l[from]. Nothing happens unless the text up to the first ':' is an
// accepted protocol and a closing ']' exists. Otherwise the link is replaced
// by a "wikilink" element (type='external', protocol='...') built with
// xml_embed(); a link without title gets a <wikiurlcounter action="add"/>
// as title. `from` is moved to the last character of the replacement.
void WIKI2XML::parse_external_link ( string &l , size_t &from )
{
    const string rest = l.substr ( from + 1 , l.length() - from ) ;
    const string protocol = upper ( before_first ( ':' , rest ) ) ;
    if ( !is_external_link_protocol ( protocol ) ) return ;

    // Locate the closing bracket.
    size_t to = from + 1 ;
    while ( to < l.length() && l[to] != ']' ) to++ ;
    if ( to == l.length() ) return ;

    // Inside the brackets, the first blank separates URL and title.
    string inner = l.substr ( from + 1 , to - from - 1 ) ;
    string title = after_first ( ' ' , inner ) ;
    string url = before_first ( ' ' , inner ) ;

    string replacement = xml_embed ( url , "url" ) ;
    replacement += xml_embed ( title == "" ? string ( "<wikiurlcounter action=\"add\"/>" ) : title ,
                               "title" ) ;
    replacement = xml_embed ( replacement , "wikilink" ,
                              "type='external' protocol='" + protocol + "'" ) ;

    l = left ( l , from ) + replacement + l.substr ( to + 1 , l.length() - to ) ;
    from = from + replacement.length() - 1 ;
}
// Walks over l and converts inline markup in place:
//   "[["  -> parse_link, mode 'L'        "{{"  -> parse_link, mode 'T'
//   "["   -> parse_external_link         "://" -> parse_external_freelink
//   quote -> bold (three quotes), then italics (two quotes)
// The handlers receive the running index by reference and may move it past
// the text they inserted.
void WIKI2XML::parse_line_sub ( string &l )
{
    for ( size_t a = 0 ; a < l.length() ; a++ )
    {
        const char c = l[a] ;
        if ( c == '[' )
        {
            if ( a+1 < l.length() && l[a+1] == '[' ) parse_link ( l , a , 'L' ) ;
            else parse_external_link ( l , a ) ;
        }
        else if ( c == '{' )
        {
            if ( a+1 < l.length() && l[a+1] == '{' ) parse_link ( l , a , 'T' ) ;
        }
        else if ( c == ':' )
        {
            if ( a+2 < l.length() && l[a+1] == '/' && l[a+2] == '/' )
                parse_external_freelink ( l , a ) ;
        }
        else if ( c == SINGLE_QUOTE )
        {
            // Bold is tried first so that three quotes are not read as italics.
            parse_symmetric ( l , a , "'''" , "'''" , "<b>" , "</b>" , true ) ;
            parse_symmetric ( l , a , "''" , "''" , "<i>" , "</i>" ) ;
        }
    }
}
// Converts every line in `lines` and then appends up to two extra lines:
// the tags that close lists still open (fix_list on an empty line) and the
// closing markup of all tables still open, innermost first.
void WIKI2XML::parse_lines ( vector <string> &lines )
{
    for ( size_t idx = 0 ; idx < lines.size() ; ++idx )
        parse_line ( lines[idx] ) ;

    // An empty line carries no list markers, so this closes every open list.
    string blank ;
    const string list_tail = fix_list ( blank ) ;
    if ( !list_tail.empty() ) lines.push_back ( list_tail ) ;

    string table_tail ;
    while ( !tables.empty() )
    {
        table_tail += tables.back().close () ;
        tables.pop_back () ;
    }
    if ( !table_tail.empty() ) lines.push_back ( table_tail ) ;
}
// Prepares the converter for the wiki text s: resets the list state and the
// stored lines, rebuilds the list of HTML tag names that may pass through
// (kept upper-cased, which is the form remove_evil_html() compares against),
// rewrites tags outside that list in s, and finally splits s on '\n' into the
// member `lines`.
void WIKI2XML::init ( string s )
{
    // Tag names that are allowed to pass through.
    static const char * const permitted_tags[] = {
        "b" , "i" , "p" , "br" , "hr" , "tt" , "pre" , "nowiki" , "math" ,
        "strike" , "u" , "table" , "caption" , "tr" , "td" , "th" ,
        "li" , "ul" , "ol" , "dl" , "dd" , "dt" , "div" ,
        "h1" , "h2" , "h3" , "h4" , "h5" , "h6" , "h7" , "h8" , "h9" ,
        "small" , "center"
    } ;
    const size_t permitted_count = sizeof ( permitted_tags ) / sizeof ( permitted_tags[0] ) ;

    list = "" ;
    lines.clear () ;

    allowed_html.clear () ;
    for ( size_t a = 0 ; a < permitted_count ; a++ )
        allowed_html.push_back ( upper ( string ( permitted_tags[a] ) ) ) ;

    vector <TXML> taglist ;
    make_tag_list ( s , taglist ) ;
    remove_evil_html ( s , taglist ) ;
    explode ( '\n' , s , lines ) ;
}
// Returns the stored lines joined with '\n' and wrapped in a <text> element.
string WIKI2XML::get_xml ()
{
    string ret = "<text>" ;
    ret += implode ( "\n" , lines ) ;
    ret += "</text>" ;
    return ret ;
}
// Replaces the characters s[from] .. s[to] (both inclusive) by `with`.
void WIKI2XML::replace_part ( string &s , size_t from , size_t to , string with )
{
    string rebuilt ( s , 0 , from ) ;                      // text before the range
    rebuilt += with ;
    rebuilt.append ( s , to + 1 , s.length() - to - 1 ) ;  // text after the range
    s = rebuilt ;
}
// Replaces s[from] .. s[to] by `with` and reports the edit to every entry of
// `list`: insert_at(from) is called once per character of `with`, then
// remove_at(from) once per replaced character. How the entries react is up to
// TXML (not visible here; presumably they shift their stored positions).
void WIKI2XML::replace_part_sync ( string &s , size_t from , size_t to , string with , vector <TXML> &list )
{
    replace_part ( s , from , to , with ) ;

    const size_t inserted = with.length() ;
    for ( size_t idx = 0 ; idx < list.size() ; ++idx )
    {
        TXML &entry = list[idx] ;
        for ( size_t n = 0 ; n < inserted ; ++n ) entry.insert_at ( from ) ;
        for ( size_t pos = from ; pos <= to ; ++pos ) entry.remove_at ( from ) ;
    }
}
// Scans s for tags and fills `list` with one TXML entry per "<...>" pair,
// where the closing '>' is located with find_next_unquoted().
// Angle brackets that do not belong to such a pair are neutralised in s:
// a '>' met outside a tag and a '<' without a matching '>' are replaced by
// the corresponding character entity, so they cannot be read as markup later.
void WIKI2XML::make_tag_list ( string &s , vector <TXML> &list )
{
    list.clear () ;
    size_t a ;
    int b ;
    for ( a = 0 ; a < s.length() ; a++ )
    {
        if ( s[a] == '>' )
        {
            // Stray '>'. The entity is written as two adjacent literals so
            // the text survives tools that decode HTML entities in sources.
            s.replace ( a , 1 , "&" "gt;" ) ;
            continue ;
        }
        if ( s[a] != '<' ) continue ;

        b = find_next_unquoted ( '>' , s , a ) ;
        if ( b == -1 )
        {
            // '<' that is never closed: same treatment as above.
            s.replace ( a , 1 , "&" "lt;" ) ;
            continue ;
        }

        list.push_back ( TXML ( a , b , s ) ) ;
        // Resume behind the tag just recorded.
        a = list.back().to ;
    }
}
// Neutralises every tag in `taglist` whose upper-cased name is not contained
// in allowed_html: its opening '<' and closing '>' in s are replaced by the
// matching character entities, so the tag is kept as visible text instead of
// being interpreted as markup.
void WIKI2XML::remove_evil_html ( string &s , vector <TXML> &taglist )
{
    size_t a , b ;
    for ( a = 0 ; a < taglist.size() ; a++ )
    {
        string tag = upper ( taglist[a].name ) ;
        for ( b = 0 ; b < allowed_html.size() && tag != allowed_html[b] ; b++ ) ;
        if ( b < allowed_html.size() ) continue ; // allowed: leave untouched

        // The entities are written as two adjacent literals so the text
        // survives tools that decode HTML entities in sources.
        // `to` is read only after the first replacement has been reported to
        // the tag list via replace_part_sync (presumably shifting it by the
        // three extra characters - TXML is not visible here).
        replace_part_sync ( s , taglist[a].from , taglist[a].from , "&" "lt;" , taglist ) ;
        replace_part_sync ( s , taglist[a].to , taglist[a].to , "&" "gt;" , taglist ) ;
    }
}
// Converts one line of wiki table syntax to markup and returns it.
//   "{|"  opens a table (pushed onto `tables`); the rest of the line is
//         passed to xml_embed() as "wikiparameter"
//   "|}"  closes the innermost table and pops it
//   "|-"  starts a new row; the text after the dashes goes to xml_params()
//   "|+", "!" and "|" start caption, header and ordinary cells; several
//         cells on one line are separated by "||", and text before a single
//         unquoted '|' inside a cell is passed to xml_params()
// l is used as scratch space and is modified.
// A line other than "{|" that arrives while no table is open is returned
// unchanged as literal text, because every other branch needs the innermost
// open table.
string WIKI2XML::table_markup ( string &l )
{
    size_t a ;
    string ret ;
    const string lead = left ( l , 2 ) ;

    // Without this guard the branches below would index and pop an empty
    // container.
    if ( lead != "{|" && tables.empty() ) return l ;

    if ( lead == "{|" )
    {
        ret = "<wikitable>" ;
        ret += xml_embed ( l.substr ( 2 , l.length() - 2 ) , "wikiparameter" ) ;
        tables.push_back ( TTableInfo () ) ;
    }
    else if ( lead == "|}" )
    {
        ret = tables.back().close () ;
        tables.pop_back () ;
    }
    else if ( lead == "|-" )
    {
        ret = tables.back().new_row () ;
        // Skip all dashes; what follows are the row parameters.
        for ( a = 1 ; a < l.length() && l[a] == '-' ; a++ ) ;
        ret += xml_params ( l.substr ( a , l.length() - a ) ) ;
    }
    else
    {
        // Determine the cell kind and strip its marker.
        string init ;
        if ( lead == "|+" )
        {
            init = "caption" ;
            l = l.substr ( 2 , l.length() - 2 ) ;
        }
        else if ( l[0] == '!' )
        {
            init = "header" ;
            l = l.substr ( 1 , l.length() - 1 ) ;
        }
        else if ( l[0] == '|' )
        {
            init = "cell" ;
            l = l.substr ( 1 , l.length() - 1 ) ;
        }

        // Split the remainder on "||" into one entry per cell.
        vector <string> sublines ;
        for ( a = 0 ; a + 1 < l.length() ; a++ )
        {
            if ( l[a] == '|' && l[a+1] == '|' )
            {
                sublines.push_back ( left ( l , a ) ) ;
                l = l.substr ( a + 2 , l.length() - a ) ;
                a = (size_t)(-1) ; // restart at 0 after the loop increment
            }
        }
        if ( l != "" ) sublines.push_back ( l ) ;

        for ( a = 0 ; a < sublines.size() ; a++ )
        {
            l = sublines[a] ;
            parse_line_sub ( l ) ;

            // Text before the first unquoted '|' holds the cell parameters.
            string params ;
            int b = find_next_unquoted ( '|' , l ) ;
            if ( b != -1 )
            {
                params = left ( l , b ) ;
                l = l.substr ( b + 1 , l.length() - b ) ;
            }
            if ( params != "" ) l = xml_params ( params ) + l ;

            ret += tables.back().new_cell ( init ) ;
            ret += l ;
        }
    }
    return ret ;
}