/*!
 * Sitemap
 * Copyright(c) 2011 Eugene Kalinin
 * MIT Licensed
 */

// NOTE(review): this file was received with all line breaks collapsed and
// with every XML tag literal emptied (e.g. `'<'+p+'>'+value+''`). The tag
// literals below were restored following the sitemaps.org protocol and the
// Google news/image/mobile sitemap extensions -- please confirm them against
// the last known-good revision before release.

var ut = require('./utils')
  , err = require('./errors')
  , urlparser = require('url')
  , fs = require('fs')
  , urljoin = require('url-join')
  , _ = require('underscore');

exports.Sitemap = Sitemap;
exports.SitemapItem = SitemapItem;
exports.createSitemap = createSitemap;
exports.createSitemapIndex = createSitemapIndex;

/**
 * Shortcut for `new Sitemap (...)`.
 *
 * @param   {Object}        conf
 * @param   {String}        conf.hostname
 * @param   {String|Array}  conf.urls
 * @param   {Number}        conf.cacheTime
 * @param   {String}        conf.xslUrl
 * @return  {Sitemap}
 */
function createSitemap(conf) {
  return new Sitemap(conf.urls, conf.hostname, conf.cacheTime, conf.xslUrl);
}

/**
 * Return the `url` of an item (or alternate-link) config, ready to be
 * embedded in XML.
 *
 * When `conf.safe` is truthy the URL is returned untouched. Otherwise it must
 * contain a protocol and is passed through `ut.htmlEscape`.
 *
 * @param   {Object}  conf
 * @param   {String}  conf.url
 * @param   {Boolean} conf.safe  skip validation and escaping
 * @return  {String}
 * @throws  {err.NoURLProtocolError} when the URL has no protocol
 */
function safeUrl(conf) {
  var loc = conf['url'];

  if ( !conf['safe'] ) {
    var url_parts = urlparser.parse(conf['url']);
    if ( !url_parts['protocol'] ) {
      throw new err.NoURLProtocolError();
    }
    loc = ut.htmlEscape(conf['url']);
  }

  return loc;
}

/**
 * Build a `UTC+HHMM` / `UTC-HHMM` suffix for the process' current local
 * timezone offset.
 *
 * `Date#getTimezoneOffset()` returns minutes and is positive WEST of UTC, so
 * the sign is inverted. Hours and minutes are zero-padded so that zones east
 * of UTC and zones with a fractional-hour offset produce a parseable suffix
 * (the previous `'UTC-' + offset/60 + '00'` produced e.g. `UTC--200`).
 *
 * @return {String}
 */
function localTimezoneSuffix() {
  var offset = new Date().getTimezoneOffset()
    , sign = offset > 0 ? '-' : '+'
    , minutes = Math.abs(offset)
    , hh = Math.floor(minutes / 60)
    , mm = minutes % 60;

  return 'UTC' + sign + (hh < 10 ? '0' : '') + hh + (mm < 10 ? '0' : '') + mm;
}

/**
 * Wrap a value in an XML element: `<name>value</name>`.
 * The value is inserted as-is (no escaping is done here).
 */
function wrapTag(name, value) {
  return '<' + name + '>' + value + '</' + name + '>';
}

/**
 * Item in sitemap
 *
 * @param {Object}        conf
 * @param {String}        conf.url              required
 * @param {Boolean}       conf.safe             skip URL/changefreq/priority validation
 * @param {String}        conf.lastmodfile      file whose mtime is used as lastmod
 * @param {String}        conf.lastmod          date, treated as local time
 * @param {String}        conf.lastmodISO       used verbatim as lastmod
 * @param {Boolean}       conf.lastmodrealtime  forwarded to ut.getTimestampFromDate
 * @param {String}        conf.changefreq       defaults to 'weekly'
 * @param {Number}        conf.priority         defaults to 0.5
 * @param {Object}        conf.news
 * @param {String|Array}  conf.img
 * @param {Array}         conf.links
 * @param {*}             conf.mobile
 * @throws {err.NoURLError|err.NoURLProtocolError|err.ChangeFreqInvalidError|err.PriorityInvalidError}
 */
function SitemapItem(conf) {
  conf = conf || {};

  var is_safe_url = conf['safe']
    , dt;

  if ( !conf['url'] ) {
    throw new err.NoURLError();
  }

  // URL of the page
  this.loc = safeUrl(conf);

  if ( conf['lastmodfile'] ) {
    // Use the modification time of the given file (synchronous stat).
    dt = new Date( fs.statSync( conf['lastmodfile'] ).mtime );
    this.lastmod = ut.getTimestampFromDate(dt, conf['lastmodrealtime']);
  }
  else if ( conf['lastmod'] ) {
    // The date of last modification (YYYY-MM-DD).
    // Append the local timezone offset so that the date is treated as local
    // time. The offset is the process' offset *now*, not at the given date.
    dt = new Date( conf['lastmod'] + ' ' + localTimezoneSuffix() );
    this.lastmod = ut.getTimestampFromDate(dt, conf['lastmodrealtime']);
  }
  else if ( conf['lastmodISO'] ) {
    this.lastmod = conf['lastmodISO'];
  }

  // How frequently the page is likely to change
  this.changefreq = conf['changefreq'] || 'weekly';
  if ( !is_safe_url ) {
    if ( [ 'always', 'hourly', 'daily', 'weekly', 'monthly',
           'yearly', 'never' ].indexOf(this.changefreq) === -1 ) {
      throw new err.ChangeFreqInvalidError();
    }
  }

  // The priority of this URL relative to other URLs.
  // A numeric 0 is kept as-is; only missing/falsy non-numbers default to 0.5.
  this.priority = typeof conf['priority'] === 'number'
    ? conf['priority']
    : (conf['priority'] || 0.5);
  if ( !is_safe_url ) {
    if ( !(this.priority >= 0.0 && this.priority <= 1.0) ) {
      throw new err.PriorityInvalidError();
    }
  }

  this.news = conf['news'] || null;
  this.img = conf['img'] || null;
  this.links = conf['links'] || null;
  this.mobile = conf['mobile'] || null;
}

/**
 * Create sitemap xml
 * @return {String}
 */
SitemapItem.prototype.toXML = function () {
  return this.toString();
};

/**
 * Alias for toXML()
 *
 * Fills a `<url>` template placeholder by placeholder; placeholders of
 * missing properties are replaced by an empty string and the resulting
 * double spaces are collapsed.
 *
 * @return {String}
 */
SitemapItem.prototype.toString = function () {
  // result xml
  var xml = '<url> {loc} {img} {lastmod} {changefreq} {priority} {links} {mobile} {news}</url>'
    // xml property
    , props = ['loc', 'img', 'lastmod', 'changefreq', 'priority', 'links', 'mobile', 'news']
    // property array size (for loop)
    , ps = props.length
    // current property name (for loop)
    , p
    , value
    , replacement;

  // Use a function replacer: with a string replacement, `$&`, `$1`, ... that
  // happen to occur in a URL or title would be interpreted by replace().
  function fill(template, name, text) {
    return template.replace('{' + name + '}', function () { return text; });
  }

  while ( ps-- ) {
    p = props[ps];
    value = this[p];

    if ( value && p === 'img' ) {
      // Image handling: a single image, or a non-empty array of images.
      if ( typeof value === 'object' && value.length > 0 ) {
        replacement = '';
        value.forEach(function (image) {
          replacement += wrapTag('image:image', wrapTag('image:loc', image));
        });
      } else {
        replacement = wrapTag('image:image', wrapTag('image:loc', value));
      }
    } else if ( value && p === 'links' ) {
      replacement = value.map(function (link) {
        return '<xhtml:link rel="alternate" hreflang="' + link.lang +
               '" href="' + safeUrl(link) + '" />';
      }).join(' ');
    } else if ( value && p === 'mobile' ) {
      replacement = '<mobile:mobile/>';
    } else if ( p === 'priority' && (value >= 0.0 && value <= 1.0) ) {
      // Checked by range rather than truthiness so that priority 0 is kept.
      replacement = wrapTag(p, parseFloat(value).toFixed(1));
    } else if ( value && p === 'news' ) {
      replacement = '<news:news>';
      if ( value.publication ) {
        replacement += '<news:publication>';
        if ( value.publication.name ) {
          replacement += wrapTag('news:name', value.publication.name);
        }
        if ( value.publication.language ) {
          replacement += wrapTag('news:language', value.publication.language);
        }
        replacement += '</news:publication>';
      }
      if ( value.access )           { replacement += wrapTag('news:access', value.access); }
      if ( value.genres )           { replacement += wrapTag('news:genres', value.genres); }
      if ( value.publication_date ) { replacement += wrapTag('news:publication_date', value.publication_date); }
      if ( value.title )            { replacement += wrapTag('news:title', value.title); }
      if ( value.keywords )         { replacement += wrapTag('news:keywords', value.keywords); }
      if ( value.stock_tickers )    { replacement += wrapTag('news:stock_tickers', value.stock_tickers); }
      replacement += '</news:news>';
    } else if ( value ) {
      replacement = wrapTag(p, value);
    } else {
      replacement = '';
    }

    xml = fill(xml, p, replacement);
    // Collapse the gap left behind by an empty placeholder.
    xml = xml.replace('  ', ' ');
  }

  return xml.replace('  ', ' ');
};

/**
 * Sitemap constructor
 * @param {String|Array}  urls
 * @param {String}        hostname    optional
 * @param {Number}        cacheTime   optional in milliseconds; 0 - cache disabled
 * @param {String}        xslUrl      optional
 */
function Sitemap(urls, hostname, cacheTime, xslUrl) {

  // This limit is defined by Google. See:
  // http://sitemaps.org/protocol.php#index
  this.limit = 50000;

  // Base domain
  this.hostname = hostname;

  // URL list for sitemap
  this.urls = [];

  // Make a (shallow) copy of the given list
  if ( urls ) {
    _.extend(this.urls, (urls instanceof Array) ? urls : [urls]);
  }

  // sitemap cache
  this.cacheResetPeriod = cacheTime || 0;
  this.cache = '';

  this.xslUrl = xslUrl;
}

/**
 * Clear sitemap cache
 */
Sitemap.prototype.clearCache = function () {
  this.cache = '';
};

/**
 * Can cache be used
 *
 * @return {*} truthy when caching is enabled, a cache exists and it has not
 *             expired yet (the raw `&&` result, not necessarily a Boolean)
 */
Sitemap.prototype.isCacheValid = function () {
  var currTimestamp = ut.getTimestamp();
  return this.cacheResetPeriod && this.cache &&
    (this.cacheSetTimestamp + this.cacheResetPeriod) >= currTimestamp;
};

/**
 * Fill cache
 *
 * @param  {String} newCache
 * @return {String} the cached value
 */
Sitemap.prototype.setCache = function (newCache) {
  this.cache = newCache;
  this.cacheSetTimestamp = ut.getTimestamp();
  return this.cache;
};

/**
 * Add url to sitemap
 * @param  {String|Object} url
 * @return {Number} new length of the url list
 */
Sitemap.prototype.add = function (url) {
  return this.urls.push(url);
};

/**
 * Delete url from sitemap
 *
 * Removes every entry (plain string or `{url: ...}` object) whose URL matches.
 *
 * @param  {String|Object} url  URL string, or an object with a `url` property
 * @return {Number} number of removed entries
 */
Sitemap.prototype.del = function (url) {
  var key = (typeof url === 'string') ? url : url['url']
    , removed = 0
    , i = this.urls.length
    , elem
    , elemUrl;

  // Walk backwards so that splice() never shifts an index that is still to
  // be visited (collecting indices first and splicing in ascending order
  // removed the wrong elements once more than one entry matched).
  while ( i-- ) {
    elem = this.urls[i];
    elemUrl = (typeof elem === 'string') ? elem : elem['url'];
    // Loose equality is kept on purpose to match the historical behaviour.
    if ( elemUrl == key ) {
      this.urls.splice(i, 1);
      removed++;
    }
  }

  return removed;
};

/**
 * Create sitemap xml
 *
 * Without a callback the XML is returned synchronously. With a callback the
 * XML is generated on the next tick and delivered as `callback(err, xml)`.
 *
 * @param {Function} callback  optional, `function (err, xml)`
 * @return {String|undefined}
 */
Sitemap.prototype.toXML = function (callback) {
  if ( typeof callback === 'undefined' ) {
    return this.toString();
  }

  var self = this;
  process.nextTick( function () {
    var xml;
    // Only generation errors are caught. The callback is invoked outside the
    // try block so that an exception thrown by the callback itself does not
    // cause it to be called a second time.
    try {
      xml = self.toString();
    } catch (e) {
      return callback(e);
    }
    return callback(null, xml);
  });
};

var reProto = /^https?:\/\//i;

/**
 * Synchronous alias for toXML()
 *
 * Returns the cached document while the cache is valid; otherwise renders
 * every url and stores the result in the cache.
 *
 * Note: when `hostname` is set, relative `url` / `links[].url` values of
 * object entries are rewritten in place on the stored entries.
 *
 * @return {String}
 */
Sitemap.prototype.toString = function () {
  var self = this
    , xml = [ '<?xml version="1.0" encoding="UTF-8"?>',
              '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" ' +
                'xmlns:news="http://www.google.com/schemas/sitemap-news/0.9" ' +
                'xmlns:xhtml="http://www.w3.org/1999/xhtml" ' +
                'xmlns:mobile="http://www.google.com/schemas/sitemap-mobile/1.0" ' +
                'xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">'
            ];

  if ( self.xslUrl ) {
    // The stylesheet instruction goes right after the XML declaration.
    xml.splice(1, 0,
      '<?xml-stylesheet type="text/xsl" href="' + self.xslUrl + '"?>');
  }

  if ( self.isCacheValid() ) {
    return self.cache;
  }

  // TODO: if size > limit: create sitemapindex

  self.urls.forEach( function (elem) {
    // SitemapItem config; plain strings become an object with a url property
    var smi = (typeof elem === 'string') ? {'url': elem} : elem;

    // insert domain name
    if ( self.hostname ) {
      if ( !reProto.test(smi.url) ) {
        smi.url = urljoin(self.hostname, smi.url);
      }
      if ( smi.links ) {
        smi.links.forEach(function (link) {
          if ( !reProto.test(link.url) ) {
            link.url = urljoin(self.hostname, link.url);
          }
        });
      }
    }

    // Stringified by join() below via SitemapItem.prototype.toString
    xml.push( new SitemapItem(smi) );
  });

  // close xml
  xml.push('</urlset>');

  return self.setCache(xml.join('\n'));
};

/**
 * Gzip the sitemap xml.
 *
 * @param  {Function} callback  optional; when given, compression is
 *                              asynchronous and the callback is passed
 *                              straight to zlib.gzip
 * @return {Buffer|undefined}   the compressed data when no callback is given
 */
Sitemap.prototype.toGzip = function (callback) {
  var zlib = require('zlib');

  if ( typeof callback === 'function' ) {
    zlib.gzip(this.toString(), callback);
  } else {
    return zlib.gzipSync(this.toString());
  }
};

/**
 * Shortcut for `new SitemapIndex (...)`.
 *
 * @param   {Object}        conf
 * @param   {String|Array}  conf.urls
 * @param   {String}        conf.targetFolder
 * @param   {String}        conf.hostname
 * @param   {Number}        conf.cacheTime
 * @param   {String}        conf.sitemapName
 * @param   {Number}        conf.sitemapSize
 * @param   {String}        conf.xslUrl
 * @param   {Boolean}       conf.gzip
 * @param   {Function}      conf.callback
 * @return  {SitemapIndex}
 */
function createSitemapIndex(conf) {
  return new SitemapIndex(conf.urls,
    conf.targetFolder,
    conf.hostname,
    conf.cacheTime,
    conf.sitemapName,
    conf.sitemapSize,
    conf.xslUrl,
    conf.gzip,
    conf.callback);
}

/**
 * Sitemap index (for several sitemaps)
 *
 * Splits the urls into chunks of `sitemapSize`, writes one sitemap file per
 * chunk plus a `<sitemapName>-index.xml` file into `targetFolder`, and calls
 * `callback(null, true)` once every file has been written, or
 * `callback(error)` on the first stream error.
 *
 * @param {String|Array}  urls
 * @param {String}        targetFolder  must exist
 * @param {String}        hostname      optional
 * @param {Number}        cacheTime     optional in milliseconds
 * @param {String}        sitemapName   optional, defaults to 'sitemap'
 * @param {Number}        sitemapSize   optional
 * @param {String}        xslUrl        optional
 * @param {Boolean}       gzip          optional
 * @param {Function}      callback      optional
 * @throws {err.UndefinedTargetFolder} when targetFolder does not exist
 */
function SitemapIndex(urls, targetFolder, hostname, cacheTime, sitemapName, sitemapSize, xslUrl, gzip, callback) {

  var self = this;

  self.fs = require('fs');

  // Base domain
  self.hostname = hostname;

  // Cache period handed to every generated Sitemap (was never stored before,
  // so the sitemaps always received `undefined`).
  self.cacheTime = cacheTime;

  self.sitemapName = (sitemapName === undefined) ? 'sitemap' : sitemapName;

  // This limit is defined by Google. See:
  // http://sitemaps.org/protocol.php#index
  self.sitemapSize = sitemapSize;

  self.xslUrl = xslUrl;

  self.sitemapId = 0;

  self.sitemaps = [];

  self.targetFolder = '.';

  if ( !self.fs.existsSync(targetFolder) ) {
    throw new err.UndefinedTargetFolder();
  }

  self.targetFolder = targetFolder;

  // URL list for sitemap
  self.urls = urls || [];
  if ( !(self.urls instanceof Array) ) {
    self.urls = [ self.urls ];
  }

  self.chunks = ut.chunkArray(self.urls, self.sitemapSize);

  self.callback = callback;

  // One pending write per chunk, plus the index file itself.
  var pending = self.chunks.length + 1
    , settled = false;

  // Report the overall outcome exactly once.
  function done(error) {
    if ( settled ) {
      return;
    }
    if ( error ) {
      settled = true;
      if ( typeof self.callback === 'function' ) {
        return self.callback(error);
      }
      // Nobody to report to: surface the error as an unhandled 'error'
      // event would have done.
      throw error;
    }
    pending--;
    if ( pending === 0 ) {
      settled = true;
      if ( typeof self.callback === 'function' ) {
        self.callback(null, true);
      }
    }
  }

  // Write produce() to targetFolder/filename. Completion is reported on
  // 'finish' (data flushed) rather than on 'open'.
  function writeFile(filename, produce) {
    var stream = self.fs.createWriteStream(targetFolder + '/' + filename);

    stream.once('error', done);
    stream.once('finish', function () { done(); });
    stream.once('open', function () {
      stream.write(produce());
      stream.end();
    });
  }

  self.chunks.forEach( function (chunk) {
    var extension = '.xml' + (gzip ? '.gz' : '')
      , filename = self.sitemapName + '-' + self.sitemapId++ + extension;

    self.sitemaps.push(filename);

    var sitemap = createSitemap({
      hostname: self.hostname,
      cacheTime: self.cacheTime,
      urls: chunk,
      xslUrl: self.xslUrl
    });

    writeFile(filename, function () {
      return gzip ? sitemap.toGzip() : sitemap.toString();
    });
  });

  var xml = [];

  xml.push('<?xml version="1.0" encoding="UTF-8"?>');
  if ( self.xslUrl ) {
    xml.push('<?xml-stylesheet type="text/xsl" href="' + self.xslUrl + '"?>');
  }
  xml.push('<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">');

  self.sitemaps.forEach( function (sitemap) {
    xml.push('<sitemap>');
    xml.push('<loc>' + hostname + '/' + sitemap + '</loc>');
    xml.push('</sitemap>');
  });

  xml.push('</sitemapindex>');

  writeFile(self.sitemapName + '-index.xml', function () {
    return xml.join('\n');
  });
}