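// Repository viewer header captured with this file (not part of the source):
// Hu Zheng, "Init!", commit dcd27489, created 3 days ago (commit history).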
/*
 * Copyright 2011 kubtek <kubtek@mail.com>
 * Copyright 2025 Hu Zheng <huzheng_001@hotmail.com>
 *
 * This file is part of StarDict.
 *
 * StarDict is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * StarDict is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with StarDict.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "parsedata_xdxf.h"
#include <glib/gi18n.h>
#include <vector>
#include <string>
#include <iostream>

#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#include <windows.h>
#endif

// Section name "xdxf"; not referenced anywhere else in this file.
// NOTE(review): presumably the configuration-file section for this plugin - confirm.
const char config_section[] = "xdxf";

// Colours applied to XDXF elements when they are converted to Pango markup.
// Each value is formatted by print_pango_color() as "#rrggbb" from its low
// 24 bits.
struct ColorScheme {
	guint32 abr; // foreground of <abr> elements
	guint32 ex;  // foreground of <ex> elements
	guint32 k;   // foreground of every <k> element after the first one
	guint32 c;   // foreground of <c> elements that carry no c="..." attribute
	guint32 ref; // foreground of <kref> and <iref> links
};

// The colour scheme in effect; reset by set_default_color_scheme().
ColorScheme color_scheme;

/*
 * Count the visible characters in a fragment of XML/Pango markup.
 *
 *  - Each of the five predefined entities (&lt; &gt; &amp; &apos; &quot;)
 *    counts as one character.  An '&' that does not start one of them
 *    counts as one character by itself.
 *  - A tag, i.e. everything from '<' through the next '>', counts as zero
 *    characters.  A '<' with no closing '>' is skipped and not counted.
 *  - Any other UTF-8 sequence counts as one character.
 *
 * Scanning stops at the first NUL byte of str.
 */
static size_t xml_strlen(const std::string& str)
{
	static const char* const xml_entrs[] = { "lt;", "gt;", "amp;", "apos;", "quot;", 0 };
	static const size_t xml_ent_len[] = { 3,     3,     4,      5,       5 };

	size_t count = 0;
	const char *q = str.c_str();

	while (*q) {
		if (*q == '&') {
			size_t i;
			for (i = 0; xml_entrs[i]; ++i)
				if (strncmp(xml_entrs[i], q + 1, xml_ent_len[i]) == 0)
					break;
			// Known entity: skip '&' plus the entity text; otherwise just '&'.
			q += xml_entrs[i] ? xml_ent_len[i] + 1 : 1;
			++count;
		} else if (*q == '<') {
			// Tags contribute nothing to the visible length.
			const char *close = strchr(q + 1, '>');
			q = close ? close + 1 : q + 1;
		} else {
			// Step over one byte and any UTF-8 continuation bytes (10xxxxxx)
			// that follow it.  A NUL is never a continuation byte, so a
			// truncated multi-byte sequence at the end of the string cannot
			// carry q past the terminator (which a skip based only on the
			// lead byte's declared length would do).
			++q;
			while ((static_cast<unsigned char>(*q) & 0xC0) == 0x80)
				++q;
			++count;
		}
	}

	return count;
}

/*
 * Expand the five predefined XML entities (&lt; &gt; &amp; &apos; &quot;)
 * found in str and store the result in decoded, replacing whatever it held.
 * An '&' that does not introduce one of these entities is copied verbatim.
 */
static void xml_decode(const char *str, std::string& decoded)
{
	struct Entity {
		const char *name; // text that follows '&', including the ';'
		size_t len;       // length of name
		char ch;          // character the entity stands for
	};
	static const Entity entities[] = {
		{ "lt;",   3, '<'  },
		{ "gt;",   3, '>'  },
		{ "amp;",  4, '&'  },
		{ "apos;", 5, '\'' },
		{ "quot;", 5, '\"' }
	};
	static const size_t n_entities = sizeof(entities) / sizeof(entities[0]);

	const char *cur = strchr(str, '&');
	if (!cur) {
		// Nothing to decode: plain copy.
		decoded = str;
		return;
	}

	// Everything before the first '&' is copied in one go.
	decoded.assign(str, cur - str);

	while (*cur) {
		if (*cur == '&') {
			size_t k = 0;
			while (k < n_entities
				&& strncmp(cur + 1, entities[k].name, entities[k].len) != 0)
				++k;
			if (k < n_entities) {
				decoded += entities[k].ch;
				cur += entities[k].len + 1;
				continue;
			}
			// Unrecognized sequence: fall through and copy the '&' itself.
		}
		decoded += *cur++;
	}
}

/*
 * Format the low 24 bits of c as a "#rrggbb" colour string (lower-case hex).
 * Returns an empty string if the formatter does not report exactly seven
 * characters written.
 */
static std::string print_pango_color(guint32 c)
{
	char text[sizeof("#rrggbb")]; // 7 characters + terminating NUL
	const gint written = g_snprintf(text, sizeof(text), "#%06x", c & 0xffffff);

	if (written == static_cast<gint>(sizeof(text) - 1))
		return std::string(text);
	return std::string();
}

static void set_default_color_scheme(void)
{
	// ABBYY Lingvo 12 default color scheme
	color_scheme.abr = 0x007F00;
	color_scheme.ex = 0x7F7F7F;
	color_scheme.k = 0x000000;
	color_scheme.c = 0x0066FF;
	color_scheme.ref = 0x00007F;
}

/*
 * One entry of the tag substitution table used by XDXFParser: an XDXF tag
 * that is replaced by a fixed piece of Pango markup.
 */
struct ReplaceTag {
	const char *match_;   // tag text expected right after '<', e.g. "b>"
	int match_len_;       // number of characters of match_ that are compared
	std::string replace_; // markup appended to the output in place of the tag
	int char_len_;        // visible characters that replace_ contributes

	ReplaceTag(const char* match, int match_len, const std::string& replace, int char_len)
		: match_(match), match_len_(match_len), replace_(replace), char_len_(char_len)
	{
	}
};

class XDXFParser {
public:
	XDXFParser(const char *p, ParseResult &result);
	static void fill_replace_arr(void);
private:
	void flush(void);
private:
	ParseResult& result_;
	std::string res_;
	std::string::size_type cur_pos_;

	static std::vector<ReplaceTag> replace_arr_;
};

std::vector<ReplaceTag> XDXFParser::replace_arr_;

void XDXFParser::fill_replace_arr(void)
{
	replace_arr_.clear();
	std::string value;
	replace_arr_.push_back(ReplaceTag("abr>", 4,
		std::string("<span foreground=\"") + print_pango_color(color_scheme.abr) + "\" style=\"italic\">",
		0));
	replace_arr_.push_back(ReplaceTag("/abr>", 5, "</span>", 0));
	replace_arr_.push_back(ReplaceTag("b>", 2, "<b>", 0));
	replace_arr_.push_back(ReplaceTag("/b>", 3, "</b>", 0));
	replace_arr_.push_back(ReplaceTag("i>", 2, "<i>", 0));
	replace_arr_.push_back(ReplaceTag("/i>", 3, "</i>", 0));
	replace_arr_.push_back(ReplaceTag("sub>", 4, "<sub>", 0));
	replace_arr_.push_back(ReplaceTag("/sub>", 5, "</sub>", 0));
	replace_arr_.push_back(ReplaceTag("sup>", 4, "<sup>", 0));
	replace_arr_.push_back(ReplaceTag("/sup>", 5, "</sup>", 0));
	replace_arr_.push_back(ReplaceTag("tt>", 3, "<tt>", 0));
	replace_arr_.push_back(ReplaceTag("/tt>", 4, "</tt>", 0));
	replace_arr_.push_back(ReplaceTag("big>", 4, "<big>", 0));
	replace_arr_.push_back(ReplaceTag("/big>", 5, "</big>", 0));
	replace_arr_.push_back(ReplaceTag("small>", 6, "<small>", 0));
	replace_arr_.push_back(ReplaceTag("/small>", 7, "</small>", 0));
	replace_arr_.push_back(ReplaceTag("tr>", 3, "<b>[", 1));
	replace_arr_.push_back(ReplaceTag("/tr>", 4, "]</b>", 1));
	replace_arr_.push_back(ReplaceTag("ex>", 3,
		std::string("<span foreground=\"") + print_pango_color(color_scheme.ex) + "\">",
		0));
	replace_arr_.push_back(ReplaceTag("/ex>", 4, "</span>", 0));
	replace_arr_.push_back(ReplaceTag("/c>", 3, "</span>", 0));
}

/*
 * Parse the XDXF article text at p and append the outcome to result.
 *
 * The text is scanned tag by tag.  Text between tags is copied unchanged to
 * the pending markup buffer res_, and tags are handled as follows:
 *   - tags listed in replace_arr_  -> their fixed replacement markup
 *   - <k>...</k>                   -> the first one is left out entirely,
 *                                     later ones become a coloured span
 *   - <c ...>                      -> <span> coloured by the c="..."
 *                                     attribute or by the scheme default
 *   - <rref ...>...</rref>         -> a separate resource item
 *   - <kref ...>, <iref ...>       -> underlined coloured span plus a link
 *                                     record
 *   - <blockquote>, </blockquote>  -> indent begin / end format items
 *   - any other tag                -> removed (the text around it is kept)
 *
 * cur_pos_ counts the visible characters accumulated in res_ since the last
 * flush().  replace_arr_ is only populated by fill_replace_arr(), so that
 * has to be called first for the simple tags to be recognised.
 */
XDXFParser::XDXFParser(const char *p, ParseResult &result) :
	result_(result)
{
	const char *tag, *next;
	std::string name;
	int i;

	// The first <k> element is not rendered; see the <k> branch below.
	bool is_first_k = true;
	for (cur_pos_ = 0; *p && (tag = strchr(p, '<')) != NULL;) {
		// Copy the plain text that precedes the tag.
		//TODO: do not create chunk
		std::string chunk(p, tag - p);
		res_ += chunk;
		cur_pos_ += xml_strlen(chunk);

		p = tag;
		// Simple substitutions: compare the text after '<' with each entry.
		for (i = 0; i < static_cast<int>(replace_arr_.size()); ++i)
			if (strncmp(replace_arr_[i].match_, p + 1,
						replace_arr_[i].match_len_) == 0) {
				res_ += replace_arr_[i].replace_;
				p += 1 + replace_arr_[i].match_len_;
				cur_pos_ += replace_arr_[i].char_len_;
				goto cycle_end;
			}

		if (strncmp("k>", p + 1, 2) == 0) {
			next = strstr(p + 3, "</k>");
			if (next) {
				if (is_first_k) {
					is_first_k = false;
					// Nothing is emitted for the first key.  If a newline
					// directly follows "</k>", advancing next by one makes
					// the assignment to p below skip that newline as well.
					if (*(next + 4) == '\n')
						next++;
				} else {
					res_ += std::string("<span foreground=\"") + print_pango_color(color_scheme.k) + "\">";
					std::string chunk(p+3, next-(p+3));
					res_ += chunk;
					size_t xml_len = xml_strlen(chunk);
					cur_pos_ += xml_len;
					res_ += "</span>";
				}
				p = next + sizeof("</k>") - 1;
			} else
				// No closing tag: drop just the "<k>" opener.
				p += sizeof("<k>") - 1;
		} else if (*(p + 1) == 'c' && (*(p + 2) == ' ' || *(p + 2) == '>')) {
			next = strchr(p, '>');
			if (!next) {
				// Unterminated tag: drop the '<' and keep scanning.
				++p;
				continue;
			}
			// name holds everything between '<' and '>'.
			name.assign(p + 1, next - p - 1);
			std::string::size_type pos = name.find("c=\"");
			if (pos != std::string::npos) {
				pos += sizeof("c=\"") - 1;
				std::string::size_type end_pos = name.find("\"", pos);
				if (end_pos == std::string::npos)
					end_pos = name.length();

				std::string color(name, pos, end_pos - pos);
				// Called with a NULL output colour, so only the return value
				// is used: it decides whether the attribute text is copied
				// into the markup.  Otherwise a plain <span> is emitted so
				// that the "</span>" produced for "</c>" still has an opener.
				if (pango_color_parse(NULL, color.c_str()))
					res_ += "<span foreground=\"" + color + "\">";
				else
					res_ += "<span>";
			} else
				res_ += std::string("<span foreground=\"") + print_pango_color(color_scheme.c) + "\">";
			p = next + 1;
		} else if (*(p + 1) == 'r' && *(p + 2) == 'r' && *(p + 3) == 'e' 
			&& *(p + 4) == 'f' && (*(p + 5) == ' ' || *(p + 5) == '>')) {
			next = strchr(p, '>');
			if (!next) {
				// Unterminated tag: drop the '<' and keep scanning.
				++p;
				continue;
			}
			name.assign(p + 1, next - p - 1);
			// Resource type taken from the type="..." attribute, if any.
			std::string type;
			std::string::size_type pos = name.find("type=\"");
			if (pos != std::string::npos) {
				pos += sizeof("type=\"") - 1;
				std::string::size_type end_pos = name.find("\"", pos);
				if (end_pos == std::string::npos)
					end_pos = name.length();
				type.assign(name, pos, end_pos - pos);
			}
			p = next + 1;
			next = strstr(p, "</rref>");
			if (!next)
				// No closing tag: the opening tag is dropped and its
				// content is handled as ordinary text.
				continue;
			// chunk is the element content, used as the resource key.
			std::string chunk(p, next - p);
			p = next + sizeof("</rref>") - 1;
			if (type.empty()) {
				// No explicit type: infer it from the file name suffix.
				if (g_str_has_suffix(chunk.c_str(), ".jpg") 
					|| g_str_has_suffix(chunk.c_str(), ".png")
					|| g_str_has_suffix(chunk.c_str(), ".bmp")) {
					type = "image";
				} else if (g_str_has_suffix(chunk.c_str(), ".wav") 
					|| g_str_has_suffix(chunk.c_str(), ".mp3") 
					|| g_str_has_suffix(chunk.c_str(), ".ogg")) {
					type = "sound";
				} else if (g_str_has_suffix(chunk.c_str(), ".avi") 
					|| g_str_has_suffix(chunk.c_str(), ".mpeg")
					|| g_str_has_suffix(chunk.c_str(), ".mpg")) {
					type = "video";
				} else {
					type = "attach";
				}
			}
			// Emit the markup collected so far, then the resource item.
			flush();
			ParseResultItem item;
			item.type = ParseResultItemType_res;
			item.res = new ParseResultResItem;
			item.res->type = type;
			item.res->key = chunk;
			result_.item_list.push_back(item);
		} else if ((*(p + 1) == 'k' || *(p + 1) == 'i') && *(p + 2) == 'r' 
			&& *(p + 3) == 'e' && *(p + 4) == 'f' && (*(p + 5) == ' ' 
			|| *(p + 5) == '>')) {
			// kref and iref
			// is_k_or_i is true for <kref> and false for <iref>.
			bool is_k_or_i = (*(p + 1) == 'k');
			next = strchr(p, '>');
			if (!next) {
				// Unterminated tag: drop the '<' and keep scanning.
				++p;
				continue;
			}
			name.assign(p + 1, next - p - 1);
			// Link target from the k="..." (kref) or href="..." (iref)
			// attribute, if present.
			std::string key;
			std::string::size_type pos;
			if (is_k_or_i)
				pos = name.find("k=\"");
			else
				pos = name.find("href=\"");
			if (pos != std::string::npos) {
				if (is_k_or_i)
					pos += sizeof("k=\"") - 1;
				else
					pos += sizeof("href=\"") - 1;
				std::string::size_type end_pos = name.find("\"", pos);
				if (end_pos == std::string::npos)
					end_pos = name.length();
				key.assign(name, pos, end_pos - pos);
			}

			p = next + 1;
			if (is_k_or_i)
				next = strstr(p, "</kref>");
			else
				next = strstr(p, "</iref>");
			if (!next)
				// No closing tag: the opening tag is dropped and its
				// content is handled as ordinary text.
				continue;

			res_ += std::string("<span foreground=\"") + print_pango_color(color_scheme.ref) + "\" underline=\"single\">";
			std::string::size_type link_len = next - p;
			std::string chunk(p, link_len);
			size_t xml_len = xml_strlen(chunk);
			// The target is the attribute value when there is one, otherwise
			// the element text; either way with XML entities decoded.
			std::string xml_enc;
			if (key.empty())
				xml_decode(chunk.c_str(), xml_enc);
			else
				xml_decode(key.c_str(), xml_enc);
			// kref targets get a "query://" prefix; iref targets are used
			// as they are.
			std::string link;
			if (is_k_or_i)
				link = "query://";
			link += xml_enc;
			// Record the link with the current visible position and the
			// visible length of its text.
			links_list_.push_back(LinkDesc(cur_pos_, xml_len, link));
			res_ += chunk;
			cur_pos_ += xml_len;
			res_ += "</span>";
			if (is_k_or_i)
				p = next + sizeof("</kref>") - 1;
			else
				p = next + sizeof("</iref>") - 1;
		} else if (strncmp("blockquote", p + 1, 10) == 0 && (*(p + 11) == ' '
				|| *(p + 11) == '>')) {
			next = strchr(p, '>');
			if (!next) {
				// Unterminated tag: drop the '<' and keep scanning.
				++p;
				continue;
			}
			p = next + 1;
			// Emit the pending markup, then open an indented block.
			flush();
			ParseResultItem item;
			item.type = ParseResultItemType_FormatBeg;
			item.format_beg = new ParseResultFormatBegItem;
			item.format_beg->type = ParseResultItemFormatType_Indent;
			result_.item_list.push_back(item);
		} else if (strncmp("/blockquote>", p + 1, 12) == 0) {
			// sizeof counts the terminating NUL of the literal; that extra
			// one accounts for the '<' that p still points at.
			p += sizeof("/blockquote>");
			// Emit the pending markup, then close the indented block.
			flush();
			ParseResultItem item;
			item.type = ParseResultItemType_FormatEnd;
			item.format_end = new ParseResultFormatEndItem;
			item.format_end->type = ParseResultItemFormatType_Indent;
			result_.item_list.push_back(item);
		} else {
			// Unknown tag: remove it.  A '<' that is never closed is kept
			// as an escaped literal, counted as one visible character.
			next = strchr(p+1, '>');
			if (!next) {
				p++;
				res_ += "&lt;";
				cur_pos_++;
				continue;
			}
			p = next + 1;
		}
cycle_end:
		;
	}
	// Text after the last tag, then emit whatever is still pending.
	res_ += p;
	flush();
}

/*
 * Emit the markup pending in res_ as a mark item appended to
 * result_.item_list, then reset the per-chunk state (res_, links_list_,
 * cur_pos_).  Does nothing when no markup is pending.
 */
void XDXFParser::flush(void) 
{
	if (res_.empty()) {
		// With no pending markup there must be no pending position or links.
		g_assert(cur_pos_ == 0);
		g_assert(links_list_.empty());
		return;
	}
	ParseResultItem item;
	item.type = ParseResultItemType_mark;
	item.mark = new ParseResultMarkItem;
	item.mark->pango = res_;

	result_.item_list.push_back(item);
	res_.clear();
	// The recorded links belong to the chunk just emitted.  Leaving them in
	// place would break the invariant asserted above on the next flush()
	// with empty res_ (e.g. a link followed directly by <blockquote>).
	// NOTE(review): the links are discarded here because nothing in this
	// file hands them to the mark item; if ParseResultMarkItem can carry
	// them, move them into the item instead - confirm.
	links_list_.clear();
	cur_pos_ = 0;
}

std::string xdxf2pango(const char *p, guint32 sec_size, const char *oword)
{
	set_default_color_scheme();
	XDXFParser::fill_replace_arr();
	XDXFParser(p, result);
	return result.pango;
}