aboutsummaryrefslogtreecommitdiff
path: root/node_modules/gettext-parser/lib/poparser.js
diff options
context:
space:
mode:
Diffstat (limited to 'node_modules/gettext-parser/lib/poparser.js')
-rw-r--r--node_modules/gettext-parser/lib/poparser.js525
1 file changed, 525 insertions, 0 deletions
diff --git a/node_modules/gettext-parser/lib/poparser.js b/node_modules/gettext-parser/lib/poparser.js
new file mode 100644
index 000000000..e215bca08
--- /dev/null
+++ b/node_modules/gettext-parser/lib/poparser.js
@@ -0,0 +1,525 @@
+'use strict';
+
+var encoding = require('encoding');
+var sharedFuncs = require('./shared');
+var Transform = require('stream').Transform;
+var util = require('util');
+
/**
 * Parses a PO object into translation table
 *
 * @param {Buffer|String} buffer PO object
 * @param {String} [defaultCharset] Default charset to use
 * @return {Object} Translation object
 */
module.exports.parse = function(buffer, defaultCharset) {
    // One-shot parse: build a parser over the full input and run it
    return new Parser(buffer, defaultCharset).parse();
};
+
/**
 * Parses a PO stream, emits translation table in object mode
 *
 * @param {String} [defaultCharset] Default charset to use
 * @param {String} [options] Stream options
 * @return {Stream} Transform stream
 */
module.exports.stream = function(defaultCharset, options) {
    var transform = new PoParserTransform(defaultCharset, options);
    return transform;
};
+
/**
 * Creates a PO parser object. If PO object is a string,
 * UTF-8 will be used as the charset
 *
 * @constructor
 * @param {Buffer|String} fileContents PO object
 * @param {String} [defaultCharset] Default charset to use
 */
function Parser(fileContents, defaultCharset) {

    // Fallback charset when the PO header declares none
    this._charset = defaultCharset || 'iso-8859-1';

    // Token accumulator filled by _lexer()
    this._lex = [];
    // True while the previous character inside a string was a backslash
    this._escaped = false;
    // Current token being built by the lexer (was a no-op expression
    // statement `this._node;` — initialize it explicitly instead)
    this._node = false;
    this._state = this.states.none;

    // Strings are assumed to be already decoded text; Buffers first go
    // through header-based charset detection and conversion
    if (typeof fileContents === 'string') {
        this._charset = 'utf-8';
        this._fileContents = fileContents;
    } else {
        this._handleCharset(fileContents);
    }
}
+
/**
 * Parses the PO object and returns translation table
 *
 * @return {Object} Translation table
 */
Parser.prototype.parse = function() {
    // Tokenize the decoded contents, then fold the token list into
    // the final translation table
    var contents = this._fileContents;
    this._lexer(contents);

    return this._finalize(this._lex);
};
+
/**
 * Detects charset for PO strings from the header
 *
 * Inspects the text before the second msgid/msgctxt entry (i.e. the header
 * entry), greps a `charset=` declaration from it and re-decodes the buffer
 * with that charset. Stores the decoded text in this._fileContents.
 *
 * @param {Buffer} buf Raw PO file contents
 */
Parser.prototype._handleCharset = function(buf) {
    var str = (buf || '').toString(),
        pos, headers = '',
        match;

    // Find the first msgid, then search for the next msgid/msgctxt starting
    // 5 chars past it so the first "msgid" does not match itself.
    // NOTE(review): the inner search result is relative to pos+5 but only
    // pos is added back, so `headers` ends 5 chars short of the second
    // entry — appears tolerated since the charset regex still matches.
    if ((pos = str.search(/^\s*msgid/im)) >= 0) {
        if ((pos = pos + str.substr(pos + 5).search(/^\s*(msgid|msgctxt)/im))) {
            headers = str.substr(0, pos);
        }
    }

    // e.g. `"Content-Type: text/plain; charset=UTF-8\n"` in the header block
    if ((match = headers.match(/[; ]charset\s*=\s*([\w\-]+)(?:[\s;]|\\n)*"\s*$/mi))) {
        this._charset = sharedFuncs.formatCharset(match[1], this._charset);
    }

    // UTF-8 input needs no conversion; anything else goes through encoding
    if (this._charset === 'utf-8') {
        this._fileContents = str;
    } else {
        this._fileContents = this._toString(buf);
    }
};
+
/**
 * Converts a Buffer to an UTF-8 string using the detected charset
 *
 * @param {Buffer} buf Raw bytes in this._charset
 * @return {String} UTF-8 string
 */
Parser.prototype._toString = function(buf) {
    var converted = encoding.convert(buf, 'utf-8', this._charset);
    return converted.toString('utf-8');
};
+
/**
 * State constants for parsing FSM (see _lexer)
 */
Parser.prototype.states = {
    none: 0x01,      // between tokens
    comments: 0x02,  // inside a #-comment line
    key: 0x03,       // inside a bare keyword, e.g. msgid / msgstr[0]
    string: 0x04     // inside a quoted string
};

/**
 * Value types for lexer tokens stored in this._lex
 */
Parser.prototype.types = {
    comments: 0x01,
    key: 0x02,
    string: 0x03
};

/**
 * String matches for lexer: character classes that start or continue tokens
 */
Parser.prototype.symbols = {
    quotes: /["']/,
    comments: /\#/,
    whitespace: /\s/,
    key: /[\w\-\[\]]/
};
+
/**
 * Token parser. Parsed state can be found from this._lex
 *
 * A character-by-character FSM over the states in this.states. Tokens are
 * pushed to this._lex as they start; their value is accumulated in place.
 * May be called repeatedly with consecutive chunks (streaming): _state,
 * _node and _escaped persist between calls.
 *
 * @param {String} chunk String
 */
Parser.prototype._lexer = function(chunk) {
    var chr;

    for (var i = 0, len = chunk.length; i < len; i++) {
        chr = chunk.charAt(i);
        switch (this._state) {
            case this.states.none:
                // NOTE(review): single quotes are accepted as delimiters
                // here although the PO format itself uses double quotes
                if (chr.match(this.symbols.quotes)) {
                    this._node = {
                        type: this.types.string,
                        value: '',
                        // remember which quote opened the string so the
                        // same character closes it
                        quote: chr
                    };
                    this._lex.push(this._node);
                    this._state = this.states.string;
                } else if (chr.match(this.symbols.comments)) {
                    this._node = {
                        type: this.types.comments,
                        value: ''
                    };
                    this._lex.push(this._node);
                    this._state = this.states.comments;
                } else if (!chr.match(this.symbols.whitespace)) {
                    // anything else starts a key token (msgid, msgstr[0], …)
                    this._node = {
                        type: this.types.key,
                        value: chr
                    };
                    this._lex.push(this._node);
                    this._state = this.states.key;
                }
                break;
            case this.states.comments:
                // comments run to end of line; CR is dropped so CRLF input
                // produces the same comment values as LF input
                if (chr === '\n') {
                    this._state = this.states.none;
                } else if (chr !== '\r') {
                    this._node.value += chr;
                }
                break;
            case this.states.string:
                if (this._escaped) {
                    // handle \t \n \r; any other escaped char (incl. quote
                    // and backslash) is taken literally
                    switch (chr) {
                        case 't':
                            this._node.value += '\t';
                            break;
                        case 'n':
                            this._node.value += '\n';
                            break;
                        case 'r':
                            this._node.value += '\r';
                            break;
                        default:
                            this._node.value += chr;
                    }
                    this._escaped = false;
                } else {
                    if (chr === this._node.quote) {
                        this._state = this.states.none;
                    } else if (chr === '\\') {
                        this._escaped = true;
                        // break exits the outer switch, skipping the reset
                        // below so the escape survives to the next char
                        break;
                    } else {
                        this._node.value += chr;
                    }
                    // defensive reset; _escaped is already false on the
                    // paths that reach this line
                    this._escaped = false;
                }
                break;
            case this.states.key:
                if (!chr.match(this.symbols.key)) {
                    this._state = this.states.none;
                    // re-scan this char in the `none` state so e.g. a quote
                    // right after a key starts a string token
                    i--;
                } else {
                    this._node.value += chr;
                }
                break;
        }
    }
};
+
/**
 * Join multi line strings
 *
 * Collapses runs of adjacent string tokens into one token (values
 * concatenated) and runs of adjacent comment tokens into one token
 * (values joined with newlines).
 *
 * @param {Object} tokens Parsed tokens
 * @return {Object} Parsed tokens, with multi line strings joined into one
 */
Parser.prototype._joinStringValues = function(tokens) {
    var joined = [];
    var previous;

    tokens.forEach(function(token) {
        if (previous && token.type === this.types.string && previous.type === this.types.string) {
            previous.value += token.value;
        } else if (previous && token.type === this.types.comments && previous.type === this.types.comments) {
            previous.value += '\n' + token.value;
        } else {
            joined.push(token);
            previous = token;
        }
    }, this);

    return joined;
};
+
/**
 * Parse comments into separate comment blocks
 *
 * Splits each comment token by line and buckets the lines by their marker
 * character (":" reference, "." extracted, "," flag, "|" previous, anything
 * else translator), replacing node.value with an object of joined blocks.
 *
 * @param {Object} tokens Parsed tokens
 */
Parser.prototype._parseComments = function(tokens) {
    // marker character -> comment bucket name
    var markers = {
        ':': 'reference',
        '.': 'extracted',
        ',': 'flag',
        '|': 'previous'
    };

    tokens.forEach(function(node) {
        if (!node || node.type !== this.types.comments) {
            return;
        }

        var buckets = {
            translator: [],
            extracted: [],
            reference: [],
            flag: [],
            previous: []
        };

        (node.value || '').split(/\n/).forEach(function(line) {
            var bucket = markers[line.charAt(0) || ''];
            if (bucket === 'reference') {
                // references are trimmed on both ends
                buckets.reference.push(line.substr(1).trim());
            } else if (bucket) {
                // other marked lines only lose leading whitespace
                buckets[bucket].push(line.substr(1).replace(/^\s+/, ''));
            } else {
                // unmarked lines are translator comments, kept whole
                buckets.translator.push(line.replace(/^\s+/, ''));
            }
        });

        node.value = {};
        Object.keys(buckets).forEach(function(key) {
            if (buckets[key].length) {
                node.value[key] = buckets[key].join('\n');
            }
        });
    }, this);
};
+
/**
 * Join gettext keys with values
 *
 * Turns the flat token list into {key, value[, comments]} records: each key
 * token opens a record, subsequent string tokens append to its value, and a
 * comment token immediately before a key is attached to that record.
 *
 * @param {Object} tokens Parsed tokens
 * @return {Object} Tokens
 */
Parser.prototype._handleKeys = function(tokens) {
    var records = [];
    var current;

    tokens.forEach(function(token, idx) {
        if (token.type === this.types.key) {
            current = {
                key: token.value
            };
            // a comment directly preceding the key belongs to it
            if (idx && tokens[idx - 1].type === this.types.comments) {
                current.comments = tokens[idx - 1].value;
            }
            current.value = '';
            records.push(current);
        } else if (token.type === this.types.string && current) {
            current.value += token.value;
        }
    }, this);

    return records;
};
+
/**
 * Separate different values into individual translation objects
 *
 * Groups msgctxt/msgid/msgid_plural/msgstr[N] records into one translation
 * object per msgid. A pending msgctxt (and its comments) applies to the
 * next msgid only.
 *
 * @param {Object} tokens Parsed tokens
 * @return {Object} Tokens
 */
Parser.prototype._handleValues = function(tokens) {
    var response = [],
        lastNode, curContext, curComments;

    for (var i = 0, len = tokens.length; i < len; i++) {
        if (tokens[i].key.toLowerCase() === 'msgctxt') {
            // context is buffered until the next msgid appears
            curContext = tokens[i].value;
            curComments = tokens[i].comments;
        } else if (tokens[i].key.toLowerCase() === 'msgid') {
            lastNode = {
                msgid: tokens[i].value
            };

            if (curContext) {
                lastNode.msgctxt = curContext;
            }

            if (curComments) {
                lastNode.comments = curComments;
            }

            // comments attached directly to msgid win only if nothing was
            // inherited from a preceding msgctxt
            if (tokens[i].comments && !lastNode.comments) {
                lastNode.comments = tokens[i].comments;
            }

            curContext = false;
            curComments = false;
            response.push(lastNode);
        } else if (tokens[i].key.toLowerCase() === 'msgid_plural') {
            if (lastNode) {
                lastNode.msgid_plural = tokens[i].value;

                // BUGFIX: this check previously ran outside the lastNode
                // guard and threw a TypeError on malformed input starting
                // with msgid_plural before any msgid
                if (tokens[i].comments && !lastNode.comments) {
                    lastNode.comments = tokens[i].comments;
                }
            }

            curContext = false;
            curComments = false;
        } else if (tokens[i].key.substr(0, 6).toLowerCase() === 'msgstr') {
            if (lastNode) {
                // msgstr[0], msgstr[1], … accumulate into an array
                lastNode.msgstr = (lastNode.msgstr || []).concat(tokens[i].value);

                // BUGFIX: same guard as above — only attach comments when
                // there is a translation object to attach them to
                if (tokens[i].comments && !lastNode.comments) {
                    lastNode.comments = tokens[i].comments;
                }
            }

            curContext = false;
            curComments = false;
        }
    }

    return response;
};
+
/**
 * Compose a translation table from tokens object
 *
 * Indexes translation objects by context and msgid. The first entry with an
 * empty msgid and no context is treated as the header block.
 *
 * @param {Object} tokens Parsed tokens
 * @return {Object} Translation table
 */
Parser.prototype._normalize = function(tokens) {
    var table = {
        charset: this._charset,
        headers: undefined,
        translations: {}
    };

    tokens.forEach(function(token) {
        var context = token.msgctxt || '';

        if (!table.translations[context]) {
            table.translations[context] = {};
        }

        // the empty-msgid entry without context holds the PO headers
        if (!table.headers && !context && !token.msgid) {
            table.headers = sharedFuncs.parseHeader(token.msgstr[0]);
        }

        table.translations[context][token.msgid] = token;
    });

    return table;
};
+
/**
 * Converts parsed tokens to a translation table
 *
 * Pipeline: join multi-line values, expand comment blocks, pair keys with
 * their string values, group into translation objects, then index them.
 *
 * @param {Object} tokens Parsed tokens
 * @returns {Object} Translation table
 */
Parser.prototype._finalize = function(tokens) {
    var joined = this._joinStringValues(tokens);
    this._parseComments(joined);

    var keyed = this._handleKeys(joined);
    var grouped = this._handleValues(keyed);

    return this._normalize(grouped);
};
+
/**
 * Creates a transform stream for parsing PO input
 *
 * Takes raw bytes in, emits a single translation table object.
 *
 * @constructor
 * @param {String} [defaultCharset] Default charset to use
 * @param {Object} [options] Stream options
 */
function PoParserTransform(defaultCharset, options) {
    // allow stream(options) — options object passed as the first argument
    if (!options && defaultCharset && typeof defaultCharset === 'object') {
        options = defaultCharset;
        defaultCharset = undefined;
    }

    // BUGFIX: options may be undefined (e.g. stream('utf-8') or stream());
    // reading options.initialTreshold below used to throw a TypeError
    options = options || {};

    this.defaultCharset = defaultCharset;
    this._parser = false;
    this._tokens = {};

    // raw chunks buffered before the parser exists / across chunk borders
    this._cache = [];
    this._cacheSize = 0;

    // bytes to accumulate before charset detection runs
    // (option name keeps the historical "Treshold" spelling for
    // backward compatibility)
    this.initialTreshold = options.initialTreshold || 2 * 1024;

    Transform.call(this, options);
    // bytes in, a single parsed object out
    this._writableState.objectMode = false;
    this._readableState.objectMode = true;
}
util.inherits(PoParserTransform, Transform);
+
/**
 * Processes a chunk of the input stream
 *
 * Buffers input until initialTreshold bytes are available so the PO header
 * can be inspected for a charset declaration; afterwards feeds decoded text
 * straight to the lexer. Trailing bytes >= 0x80 are held back so a
 * multi-byte sequence split across chunk borders is never decoded in halves.
 *
 * @param {Buffer} chunk Incoming raw bytes
 * @param {String} encoding Unused; input is handled as raw bytes
 * @param {Function} done Callback
 */
PoParserTransform.prototype._transform = function(chunk, encoding, done) {
    var i, len = 0;

    if (!chunk || !chunk.length) {
        return done();
    }

    if (!this._parser) {
        this._cache.push(chunk);
        this._cacheSize += chunk.length;

        // wait until initialTreshold bytes (2kb by default) have arrived
        // before parsing headers for charset
        if (this._cacheSize < this.initialTreshold) {
            return setImmediate(done);
        } else if (this._cacheSize) {
            chunk = Buffer.concat(this._cache, this._cacheSize);
            this._cacheSize = 0;
            this._cache = [];
        }

        // the constructor only performs charset detection here; the actual
        // lexing of this chunk happens at the bottom of this method
        this._parser = new Parser(chunk, this.defaultCharset);
    } else if (this._cacheSize) {
        // this only happens if we had an uncompleted 8bit sequence from the last iteration
        this._cache.push(chunk);
        this._cacheSize += chunk.length;
        chunk = Buffer.concat(this._cache, this._cacheSize);
        this._cacheSize = 0;
        this._cache = [];
    }

    // cache 8bit bytes from the end of the chunk
    // helps if the chunk ends in the middle of an utf-8 sequence
    for (i = chunk.length - 1; i >= 0; i--) {
        if (chunk[i] >= 0x80) {
            len++;
            continue;
        }
        break;
    }
    // it seems we found some 8bit bytes from the end of the string, so let's cache these
    if (len) {
        this._cache = [chunk.slice(chunk.length - len)];
        this._cacheSize = this._cache[0].length;
        chunk = chunk.slice(0, chunk.length - len);
    }

    // chunk might be empty if it only consisted of 8bit bytes and these were all cached
    if (chunk.length) {
        this._parser._lexer(this._parser._toString(chunk));
    }

    setImmediate(done);
};
+
/**
 * Once all input has been processed emit the parsed translation table as an object
 *
 * Flushes any bytes still held in the cache through the parser, then pushes
 * the finalized translation table downstream.
 *
 * @param {Function} done Callback
 */
PoParserTransform.prototype._flush = function(done) {
    var remainder;

    if (this._cacheSize) {
        remainder = Buffer.concat(this._cache, this._cacheSize);
    }

    // input may have been shorter than initialTreshold, so the parser might
    // not exist yet
    if (remainder && !this._parser) {
        this._parser = new Parser(remainder, this.defaultCharset);
    }

    if (remainder) {
        this._parser._lexer(this._parser._toString(remainder));
    }

    if (this._parser) {
        this.push(this._parser._finalize(this._parser._lex));
    }

    setImmediate(done);
};