// tinygettext - A gettext replacement that works directly on .po files // Copyright (c) 2009 Ingo Ruhnke // // This software is provided 'as-is', without any express or implied // warranty. In no event will the authors be held liable for any damages // arising from the use of this software. // // Permission is granted to anyone to use this software for any purpose, // including commercial applications, and to alter it and redistribute it // freely, subject to the following restrictions: // // 1. The origin of this software must not be misrepresented; you must not // claim that you wrote the original software. If you use this software // in a product, an acknowledgement in the product documentation would be // appreciated but is not required. // 2. Altered source versions must be plainly marked as such, and must not be // misrepresented as being the original software. // 3. This notice may not be removed or altered from any source distribution. #include "precompiled.h" #include "tinygettext/po_parser.hpp" #include #include #include #include #include #include #include #include "tinygettext/language.hpp" #include "tinygettext/log_stream.hpp" #include "tinygettext/iconv.hpp" #include "tinygettext/dictionary.hpp" #include "tinygettext/plural_forms.hpp" namespace tinygettext { bool POParser::pedantic = true; void POParser::parse(const std::string& filename, std::istream& in, Dictionary& dict) { POParser parser(filename, in, dict); parser.parse(); } class POParserError {}; POParser::POParser(const std::string& filename_, std::istream& in_, Dictionary& dict_, bool use_fuzzy_) : filename(filename_), in(in_), dict(dict_), use_fuzzy(use_fuzzy_), running(false), eof(false), big5(false), line_number(0), current_line(), conv() { } POParser::~POParser() { } void POParser::warning(const std::string& msg) { log_warning << filename << ":" << line_number << ": warning: " << msg << ": " << current_line << std::endl; //log_warning << "Line: " << current_line << std::endl; } void POParser::error(const std::string& msg) { log_error << filename << ":" << line_number << ": error: " << msg << ": " << current_line << std::endl; // Try to recover from an error by searching for start of another entry do next_line(); while(!eof && !is_empty_line()); throw POParserError(); } void POParser::next_line() { line_number += 1; if (!std::getline(in, current_line)) eof = true; } void POParser::get_string_line(std::ostringstream& out, size_t skip) { if (skip+1 >= static_cast(current_line.size())) error("unexpected end of line"); if (current_line[skip] != '"') error("expected start of string '\"'"); std::string::size_type i; for(i = skip+1; current_line[i] != '\"'; ++i) { if (big5 && static_cast(current_line[i]) >= 0x81 && static_cast(current_line[i]) <= 0xfe) { out << current_line[i]; i += 1; if (i >= current_line.size()) error("invalid big5 encoding"); out << current_line[i]; } else if (i >= current_line.size()) { error("unexpected end of string"); } else if (current_line[i] == '\\') { i += 1; if (i >= current_line.size()) error("unexpected end of string in handling '\\'"); switch (current_line[i]) { case 'a': out << '\a'; break; case 'b': out << '\b'; break; case 'v': out << '\v'; break; case 'n': out << '\n'; break; case 't': out << '\t'; break; case 'r': out << '\r'; break; case '"': out << '"'; break; case '\\': out << '\\'; break; default: std::ostringstream err; err << "unhandled escape '\\" << current_line[i] << "'"; warning(err.str()); out << current_line[i-1] << current_line[i]; break; } } else { out << current_line[i]; } } // process trailing garbage in line and warn if there is any for(i = i+1; i < current_line.size(); ++i) if (!isspace(current_line[i])) { warning("unexpected garbage after string ignoren"); break; } } std::string POParser::get_string(unsigned int skip) { std::ostringstream out; if (skip+1 >= static_cast(current_line.size())) error("unexpected end of line"); if (current_line[skip] == ' ' && current_line[skip+1] == '"') { get_string_line(out, skip+1); } else { if (pedantic) warning("keyword and string must be seperated by a single space"); for(;;) { if (skip >= static_cast(current_line.size())) error("unexpected end of line"); else if (current_line[skip] == '\"') { get_string_line(out, skip); break; } else if (!isspace(current_line[skip])) { error("string must start with '\"'"); } else { // skip space } skip += 1; } } next: next_line(); for(std::string::size_type i = 0; i < current_line.size(); ++i) { if (current_line[i] == '"') { if (i == 1) if (pedantic) warning("leading whitespace before string"); get_string_line(out, i); goto next; } else if (isspace(current_line[i])) { // skip } else { break; } } return out.str(); } static bool has_prefix(const std::string& lhs, const std::string& rhs) { if (lhs.length() < rhs.length()) return false; else return lhs.compare(0, rhs.length(), rhs) == 0; } void POParser::parse_header(const std::string& header) { std::string from_charset; std::string::size_type start = 0; for(std::string::size_type i = 0; i < header.length(); ++i) { if (header[i] == '\n') { std::string line = header.substr(start, i - start); if (has_prefix(line, "Content-Type:")) { // from_charset = line.substr(len); size_t len = strlen("Content-Type: text/plain; charset="); if (line.compare(0, len, "Content-Type: text/plain; charset=") == 0) { from_charset = line.substr(len); for(std::string::iterator ch = from_charset.begin(); ch != from_charset.end(); ++ch) *ch = static_cast(toupper(*ch)); } else { warning("malformed Content-Type header"); } } else if (has_prefix(line, "Plural-Forms:")) { PluralForms plural_forms = PluralForms::from_string(line); if (!plural_forms) { warning("unknown Plural-Forms given"); } else { if (!dict.get_plural_forms()) { dict.set_plural_forms(plural_forms); } else { if (dict.get_plural_forms() != plural_forms) { warning("Plural-Forms missmatch between .po file and dictionary"); } } } } start = i+1; } } if (from_charset.empty() || from_charset == "CHARSET") { warning("charset not specified for .po, fallback to utf-8"); from_charset = "UTF-8"; } else if (from_charset == "BIG5") { big5 = true; } conv.set_charsets(from_charset, dict.get_charset()); } bool POParser::is_empty_line() { if (current_line.empty()) { return true; } else if (current_line[0] == '#') { // handle comments as empty lines return (current_line.size() == 1 || (current_line.size() >= 2 && isspace(current_line[1]))); } else { for(std::string::iterator i = current_line.begin(); i != current_line.end(); ++i) { if (!isspace(*i)) return false; } } return true; } bool POParser::prefix(const char* prefix_str) { return current_line.compare(0, strlen(prefix_str), prefix_str) == 0; } void POParser::parse() { next_line(); // skip UTF-8 intro that some text editors produce // see http://en.wikipedia.org/wiki/Byte-order_mark if (current_line.size() >= 3 && current_line[0] == static_cast(0xef) && current_line[1] == static_cast(0xbb) && current_line[2] == static_cast(0xbf)) { current_line = current_line.substr(3); } // Parser structure while(!eof) { try { bool fuzzy = false; bool has_msgctxt = false; std::string msgctxt; std::string msgid; while(prefix("#")) { if (current_line.size() >= 2 && current_line[1] == ',') { // FIXME: Rather simplistic hunt for fuzzy flag if (current_line.find("fuzzy", 2) != std::string::npos) fuzzy = true; } next_line(); } if (!is_empty_line()) { if (prefix("msgctxt")) { has_msgctxt = true; msgctxt = get_string(7); } if (prefix("msgid")) msgid = get_string(5); else error("expected 'msgid'"); if (prefix("msgid_plural")) { std::string msgid_plural = get_string(12); std::vector msgstr_num; bool saw_nonempty_msgstr = false; next: if (is_empty_line()) { if (msgstr_num.empty()) error("expected 'msgstr[N] (0 <= N <= 9)'"); } else if (prefix("msgstr[") && current_line.size() > 8 && isdigit(current_line[7]) && current_line[8] == ']') { unsigned int number = static_cast(current_line[7] - '0'); std::string msgstr = get_string(9); if(!msgstr.empty()) saw_nonempty_msgstr = true; if (number >= msgstr_num.size()) msgstr_num.resize(number+1); msgstr_num[number] = conv.convert(msgstr); goto next; } else { error("expected 'msgstr[N]'"); } if (!is_empty_line()) error("expected 'msgstr[N]' or empty line"); if (saw_nonempty_msgstr) { if (use_fuzzy || !fuzzy) { if (!dict.get_plural_forms()) { warning("msgstr[N] seen, but no Plural-Forms given"); } else { if (msgstr_num.size() != dict.get_plural_forms().get_nplural()) { warning("msgstr[N] count doesn't match Plural-Forms.nplural"); } } if (has_msgctxt) dict.add_translation(msgctxt, msgid, msgid_plural, msgstr_num); else dict.add_translation(msgid, msgid_plural, msgstr_num); } if ((false)) { std::cout << (fuzzy?"fuzzy":"not-fuzzy") << std::endl; std::cout << "msgid \"" << msgid << "\"" << std::endl; std::cout << "msgid_plural \"" << msgid_plural << "\"" << std::endl; for(std::vector::size_type i = 0; i < msgstr_num.size(); ++i) std::cout << "msgstr[" << i << "] \"" << conv.convert(msgstr_num[i]) << "\"" << std::endl; std::cout << std::endl; } } } else if (prefix("msgstr")) { std::string msgstr = get_string(6); if (msgid.empty()) { parse_header(msgstr); } else if(!msgstr.empty()) { if (use_fuzzy || !fuzzy) { if (has_msgctxt) dict.add_translation(msgctxt, msgid, conv.convert(msgstr)); else dict.add_translation(msgid, conv.convert(msgstr)); } if ((false)) { std::cout << (fuzzy?"fuzzy":"not-fuzzy") << std::endl; std::cout << "msgid \"" << msgid << "\"" << std::endl; std::cout << "msgstr \"" << conv.convert(msgstr) << "\"" << std::endl; std::cout << std::endl; } } } else { error("expected 'msgstr' or 'msgid_plural'"); } } if (!is_empty_line()) error("expected empty line"); next_line(); } catch(POParserError&) { } } } } // namespace tinygettext /* EOF */