/*!
* Sitemap
* Copyright(c) 2011 Eugene Kalinin
* MIT Licensed
*/
var ut = require('./utils')
, err = require('./errors')
, urlparser = require('url')
, fs = require('fs')
, urljoin = require('url-join')
, _ = require('underscore');
exports.Sitemap = Sitemap;
exports.SitemapItem = SitemapItem;
exports.createSitemap = createSitemap;
exports.createSitemapIndex = createSitemapIndex;
/**
 * Convenience factory: builds a `Sitemap` from one options object instead
 * of positional constructor arguments.
 *
 * @param {Object}       conf
 * @param {String}       conf.hostname   forwarded as the sitemap's hostname
 * @param {String|Array} conf.urls       one URL or a list of URLs
 * @param {Number}       conf.cacheTime  forwarded as the cache period
 * @param {String}       conf.xslUrl     forwarded as the stylesheet URL
 * @return {Sitemap}
 */
function createSitemap(conf) {
  var urls = conf.urls
    , hostname = conf.hostname
    , cacheTime = conf.cacheTime
    , xslUrl = conf.xslUrl;

  return new Sitemap(urls, hostname, cacheTime, xslUrl);
}
/**
 * Returns the value to use as a URL for the given entry.
 *
 * When `conf.safe` is truthy the URL is returned untouched. Otherwise the
 * URL must parse with a protocol and is passed through `ut.htmlEscape`.
 *
 * @param {Object}  conf
 * @param {String}  conf.url
 * @param {Boolean} conf.safe  skip the protocol check and escaping
 * @return {String}
 * @throws {err.NoURLProtocolError} when an unsafe URL has no protocol
 */
function safeUrl(conf) {
  // Caller vouches for the URL: hand it back as-is.
  if (conf.safe) {
    return conf.url;
  }

  var protocol = urlparser.parse(conf.url).protocol;
  if (!protocol) {
    throw new err.NoURLProtocolError();
  }

  return ut.htmlEscape(conf.url);
}
/**
 * Item in sitemap: validates one URL entry and stores its normalised fields.
 *
 * @param {Object}  conf
 * @param {String}  conf.url              page URL (required)
 * @param {Boolean} conf.safe             when truthy, the URL is not escaped or
 *                                        protocol-checked and changefreq/priority
 *                                        are not validated
 * @param {String}  conf.lastmodfile      path of a file whose mtime becomes `lastmod`
 * @param {String}  conf.lastmod          date of last modification (YYYY-MM-DD),
 *                                        interpreted as local time
 * @param {*}       conf.lastmodrealtime  passed through to `ut.getTimestampFromDate`
 * @param {String}  conf.lastmodISO       stored verbatim as `lastmod` when neither
 *                                        `lastmodfile` nor `lastmod` is given
 * @param {String}  conf.changefreq       defaults to 'weekly'
 * @param {Number}  conf.priority         0.0 - 1.0, defaults to 0.5
 * @param {Object}  conf.news             stored as-is (null when absent)
 * @param {String|Array} conf.img         stored as-is (null when absent)
 * @param {Array}   conf.links            stored as-is (null when absent)
 * @param {*}       conf.mobile           stored as-is (null when absent)
 * @throws {err.NoURLError}             when `conf.url` is missing
 * @throws {err.ChangeFreqInvalidError} when changefreq is not a known value
 * @throws {err.PriorityInvalidError}   when priority is outside 0.0 - 1.0
 */
function SitemapItem(conf) {
  conf = conf || {};
  var is_safe_url = conf['safe']
    , dt;

  if ( !conf['url'] ) {
    throw new err.NoURLError();
  }

  // URL of the page
  this.loc = safeUrl(conf);

  if ( conf['lastmodfile'] ) {
    // Use the modification time of the given file (throws if it cannot be stat'ed).
    dt = new Date( fs.statSync( conf['lastmodfile'] ).mtime );
    this.lastmod = ut.getTimestampFromDate(dt, conf['lastmodrealtime']);
  } else if ( conf['lastmod'] ) {
    // Append the local UTC offset so the date is treated as local time.
    // The suffix must be a well-formed "UTC+HHMM"/"UTC-HHMM": building it by
    // plain concatenation yields "UTC--2..." east of Greenwich and
    // "UTC-5.500" for half-hour zones, neither of which Date can parse.
    var offsetMinutes = new Date().getTimezoneOffset()
      , absMinutes = Math.abs(Math.round(offsetMinutes))
      , hours = Math.floor(absMinutes / 60)
      , minutes = absMinutes % 60
        // getTimezoneOffset() is positive WEST of UTC, hence the inverted sign.
      , timezoneOffset = 'UTC' + (offsetMinutes > 0 ? '-' : '+') +
          (hours < 10 ? '0' : '') + hours +
          (minutes < 10 ? '0' : '') + minutes;

    dt = new Date( conf['lastmod'] + ' ' + timezoneOffset );
    this.lastmod = ut.getTimestampFromDate(dt, conf['lastmodrealtime']);
  } else if ( conf['lastmodISO'] ) {
    this.lastmod = conf['lastmodISO'];
  }

  // How frequently the page is likely to change
  this.changefreq = conf['changefreq'] || 'weekly';
  if ( !is_safe_url ) {
    if ( [ 'always', 'hourly', 'daily', 'weekly', 'monthly',
           'yearly', 'never' ].indexOf(this.changefreq) === -1 ) {
      throw new err.ChangeFreqInvalidError();
    }
  }

  // The priority of this URL relative to other URLs.
  // The typeof check keeps an explicit 0 instead of replacing it by the default.
  this.priority = typeof conf['priority'] === 'number' ? conf['priority'] : (conf['priority'] || 0.5);
  if ( !is_safe_url ) {
    if ( !(this.priority >= 0.0 && this.priority <= 1.0) ) {
      throw new err.PriorityInvalidError();
    }
  }

  this.news = conf['news'] || null;
  this.img = conf['img'] || null;
  this.links = conf['links'] || null;
  this.mobile = conf['mobile'] || null;
}
/**
 * Serialize this item to XML. Thin alias that delegates to `toString()`.
 * @return {String}
 */
SitemapItem.prototype.toXML = function () {
  var item = this;
  return item.toString();
};
/**
 * Serialize this item as one `<url>` element.
 *
 * Child elements are emitted in a fixed order (loc, images, lastmod,
 * changefreq, priority, alternate links, mobile marker, news) and separated
 * by single spaces; properties that are unset are skipped entirely.
 * Values are inserted as stored on the item - no escaping is applied here.
 *
 * @return {String}
 */
SitemapItem.prototype.toString = function () {
  // Wrap `value` in an opening tag and the matching closing tag.
  function element(name, value) {
    return '<' + name + '>' + value + '</' + name + '>';
  }

  var self = this
    , parts = [];

  if (self.loc) {
    parts.push(element('loc', self.loc));
  }

  if (self.img) {
    // A single image may be given as a bare value, several as an array.
    var images = Array.isArray(self.img) ? self.img : [self.img];
    if (images.length > 0) {
      parts.push(images.map(function (image) {
        return element('image:image', element('image:loc', image));
      }).join(''));
    }
  }

  if (self.lastmod) {
    parts.push(element('lastmod', self.lastmod));
  }

  if (self.changefreq) {
    parts.push(element('changefreq', self.changefreq));
  }

  // An in-range priority is always written with one decimal (0 -> "0.0").
  // The NaN guard keeps null/'' (which compare as 0) from printing "NaN".
  if ((self.priority >= 0.0 && self.priority <= 1.0) &&
      !isNaN(parseFloat(self.priority))) {
    parts.push(element('priority', parseFloat(self.priority).toFixed(1)));
  } else if (self.priority) {
    // Out-of-range values (possible for "safe" items) are written verbatim.
    parts.push(element('priority', self.priority));
  }

  if (self.links && self.links.length > 0) {
    parts.push(self.links.map(function (link) {
      return '<xhtml:link rel="alternate" hreflang="' + link.lang +
             '" href="' + safeUrl(link) + '" />';
    }).join(' '));
  }

  if (self.mobile) {
    parts.push('<mobile:mobile/>');
  }

  if (self.news) {
    var news = self.news
      , newsitem = '<news:news>';

    if (news.publication) {
      newsitem += '<news:publication>';
      if (news.publication.name) { newsitem += element('news:name', news.publication.name); }
      if (news.publication.language) { newsitem += element('news:language', news.publication.language); }
      newsitem += '</news:publication>';
    }

    if (news.access) { newsitem += element('news:access', news.access); }
    if (news.genres) { newsitem += element('news:genres', news.genres); }
    if (news.publication_date) { newsitem += element('news:publication_date', news.publication_date); }
    if (news.title) { newsitem += element('news:title', news.title); }
    if (news.keywords) { newsitem += element('news:keywords', news.keywords); }
    if (news.stock_tickers) { newsitem += element('news:stock_tickers', news.stock_tickers); }

    newsitem += '</news:news>';
    parts.push(newsitem);
  }

  return ['<url>'].concat(parts, ['</url>']).join(' ');
};
/**
 * Sitemap constructor
 *
 * @param {String|Array} urls       a single URL entry or a list of them;
 *                                  the list is copied, not referenced
 * @param {String}       hostname   optional base domain
 * @param {Number}       cacheTime  optional, in milliseconds; 0 - cache disabled
 * @param {String}       xslUrl     optional stylesheet URL
 */
function Sitemap(urls, hostname, cacheTime, xslUrl) {
  var initialUrls;

  // Per-file URL limit (see http://sitemaps.org/protocol.php#index).
  // Stored only; nothing in this constructor enforces it.
  this.limit = 50000;

  // Base domain
  this.hostname = hostname;

  // URL list for sitemap, filled with a copy of whatever was passed in
  this.urls = [];
  if (urls) {
    initialUrls = (urls instanceof Array) ? urls : [urls];
    _.extend(this.urls, initialUrls);
  }

  // Cache state: period (0 disables caching) and the cached XML itself
  this.cacheResetPeriod = cacheTime ? cacheTime : 0;
  this.cache = '';

  this.xslUrl = xslUrl;
}
/**
 * Drop the cached XML so the next render rebuilds it.
 */
Sitemap.prototype.clearCache = function () {
  var sitemap = this;
  sitemap.cache = '';
};
/**
 * Can cache be used
 *
 * Truthy only when caching is enabled, something is cached, and the cache
 * was stored no longer than `cacheResetPeriod` ago. Note the result is not
 * coerced: a disabled period or empty cache is returned as that falsy value.
 *
 * @return {*} truthy when the cached XML may be served
 */
Sitemap.prototype.isCacheValid = function () {
  var now = ut.getTimestamp()
    , period = this.cacheResetPeriod
    , cached = this.cache;

  if (!period) {
    return period;
  }
  if (!cached) {
    return cached;
  }
  return now <= (this.cacheSetTimestamp + period);
};
/**
 * Fill cache
 *
 * Stores the rendered XML together with the time it was stored.
 *
 * @param {String} newCache  rendered XML to remember
 * @return {String} the value now held in the cache
 */
Sitemap.prototype.setCache = function (newCache) {
  var storedAt;

  this.cache = newCache;
  storedAt = ut.getTimestamp();
  this.cacheSetTimestamp = storedAt;

  return this.cache;
};
/**
 * Append an entry to the sitemap's URL list.
 *
 * @param {String|Object} url  entry to append (stored as given)
 * @return {Number} length of the URL list after the append
 */
Sitemap.prototype.add = function (url) {
  var list = this.urls;
  return list.push(url);
};
/**
 * Delete url from sitemap
 *
 * Removes every entry whose URL equals the given one. Entries may be plain
 * strings or objects with a `url` property; the argument may be either form
 * too. The list is modified in place.
 *
 * @param {String|Object} url  URL string, or an object carrying it in `url`
 * @return {Number} how many entries were removed
 */
Sitemap.prototype.del = function (url) {
  var key = (typeof url == 'string') ? url : url['url']
    , urls = this.urls
    , removed = 0
    , i
    , elem
    , candidate;

  // Walk backwards: splicing at index i only moves elements AFTER i, so the
  // indices still to be visited stay valid. (Collecting indices first and
  // splicing them in ascending order removes the wrong entries as soon as
  // more than one matches.)
  for (i = urls.length - 1; i >= 0; i--) {
    if (!(i in urls)) {
      continue; // skip holes in a sparse list
    }
    elem = urls[i];
    candidate = (typeof elem == 'string') ? elem : elem['url'];
    if (candidate == key) {
      urls.splice(i, 1);
      removed++;
    }
  }

  return removed;
};
/**
 * Create sitemap xml
 *
 * Without a callback the XML is rendered and returned synchronously.
 * With a callback, rendering is deferred to the next tick and the result is
 * delivered node-style: `callback(err)` if rendering threw, otherwise
 * `callback(null, xml)`.
 *
 * @param {Function} callback  optional `function (err, xml)`
 * @return {String|undefined} the XML when no callback is given
 */
Sitemap.prototype.toXML = function (callback) {
  if (typeof callback === 'undefined') {
    return this.toString();
  }

  var self = this;
  process.nextTick( function () {
    var xml;

    // Only rendering is guarded. The callback runs outside the try block so
    // that an exception thrown BY the callback is not caught here and fed
    // back into a second invocation of the same callback.
    try {
      xml = self.toString();
    } catch (renderError) {
      return callback(renderError);
    }
    return callback(null, xml);
  });
};
// Matches URLs that already start with an http:// or https:// scheme
// (case-insensitive); such URLs are not prefixed with the hostname.
var reProto = /^https?:\/\//i;

/**
 * Synchronous alias for toXML()
 *
 * Renders the complete `<urlset>` document, one line per entry. A still-valid
 * cached document is returned as-is; otherwise the fresh result is stored in
 * the cache before being returned. When a hostname is configured, entry and
 * alternate-link URLs without an http(s) scheme are joined onto it - on
 * copies, so the entries held in `this.urls` are left untouched.
 *
 * @return {String}
 */
Sitemap.prototype.toString = function () {
  var self = this
    , xml;

  // Serve from cache before doing any work.
  if (self.isCacheValid()) {
    return self.cache;
  }

  xml = [ '<?xml version="1.0" encoding="UTF-8"?>' ];
  if (self.xslUrl) {
    xml.push('<?xml-stylesheet type="text/xsl" href="' + self.xslUrl + '"?>');
  }
  xml.push('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" ' +
           'xmlns:news="http://www.google.com/schemas/sitemap-news/0.9" ' +
           'xmlns:xhtml="http://www.w3.org/1999/xhtml" ' +
           'xmlns:mobile="http://www.google.com/schemas/sitemap-mobile/1.0" ' +
           'xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">');

  // TODO: if size > limit: create sitemapindex
  self.urls.forEach( function (elem) {
    // Work on a shallow copy: rewriting the stored entry in place would turn
    // its relative URL into an absolute one, after which del() with the
    // original relative URL no longer finds it.
    var smi = (typeof elem == 'string') ? {'url': elem} : _.extend({}, elem);

    // insert domain name
    if ( self.hostname ) {
      if ( !reProto.test(smi.url) ) {
        smi.url = urljoin(self.hostname, smi.url);
      }
      if ( smi.links ) {
        smi.links = smi.links.map(function (link) {
          if ( reProto.test(link.url) ) {
            return link;
          }
          return _.extend({}, link, {'url': urljoin(self.hostname, link.url)});
        });
      }
    }

    xml.push( new SitemapItem(smi).toString() );
  });

  // close xml
  xml.push('</urlset>');

  return self.setCache(xml.join('\n'));
};
/**
 * Gzip-compress the rendered sitemap.
 *
 * With a function argument the compression is asynchronous and the result
 * is handed to that function by `zlib.gzip`; with anything else the
 * compressed data is produced synchronously and returned.
 *
 * @param {Function} callback  optional, receives zlib.gzip's callback arguments
 * @return {Buffer|undefined} compressed sitemap when called without a function
 */
Sitemap.prototype.toGzip = function (callback) {
  var zlib = require('zlib')
    , isAsync = (typeof callback === 'function');

  if (!isAsync) {
    return zlib.gzipSync(this.toString());
  }

  zlib.gzip(this.toString(), callback);
};
/**
 * Convenience factory: builds a `SitemapIndex` from one options object
 * instead of positional constructor arguments.
 *
 * @param {Object}       conf
 * @param {String|Array} conf.urls
 * @param {String}       conf.targetFolder
 * @param {String}       conf.hostname
 * @param {Number}       conf.cacheTime
 * @param {String}       conf.sitemapName
 * @param {Number}       conf.sitemapSize
 * @param {String}       conf.xslUrl
 * @param {Boolean}      conf.gzip
 * @param {Function}     conf.callback
 * @return {SitemapIndex}
 */
function createSitemapIndex(conf) {
  var urls = conf.urls
    , targetFolder = conf.targetFolder
    , hostname = conf.hostname
    , cacheTime = conf.cacheTime
    , sitemapName = conf.sitemapName
    , sitemapSize = conf.sitemapSize
    , xslUrl = conf.xslUrl
    , gzip = conf.gzip
    , callback = conf.callback;

  return new SitemapIndex(urls, targetFolder, hostname, cacheTime,
                          sitemapName, sitemapSize, xslUrl, gzip, callback);
}
/**
 * Sitemap index (for several sitemaps)
 *
 * Splits `urls` into chunks of `sitemapSize`, writes one sitemap file per
 * chunk (`<sitemapName>-<n>.xml`, or `.xml.gz` when `gzip` is set) plus
 * `<sitemapName>-index.xml` into `targetFolder`. Sitemap contents are
 * rendered synchronously, so rendering errors are thrown from the
 * constructor; the files themselves are written asynchronously.
 *
 * @param {String|Array} urls
 * @param {String}       targetFolder  must already exist
 * @param {String}       hostname      optional
 * @param {Number}       cacheTime     optional in milliseconds
 * @param {String}       sitemapName   optional, defaults to 'sitemap'
 * @param {Number}       sitemapSize   optional
 * @param {String}       xslUrl        optional
 * @param {Boolean}      gzip          optional
 * @param {Function}     callback      optional; called once with `(null, true)`
 *                                     after every file has been flushed, or
 *                                     with `(err)` on the first write error
 * @throws {err.UndefinedTargetFolder} when `targetFolder` does not exist
 */
function SitemapIndex(urls, targetFolder, hostname, cacheTime, sitemapName, sitemapSize, xslUrl, gzip, callback) {
  var self = this;

  self.fs = fs;

  // Base domain
  self.hostname = hostname;

  self.sitemapName = (sitemapName === undefined) ? 'sitemap' : sitemapName;

  // This limit is defined by Google. See:
  // http://sitemaps.org/protocol.php#index
  self.sitemapSize = sitemapSize;

  self.xslUrl = xslUrl;

  // Remember the cache period so it actually reaches the per-chunk sitemaps.
  self.cacheTime = cacheTime;

  self.sitemapId = 0;
  self.sitemaps = [];

  self.targetFolder = '.';
  if (!self.fs.existsSync(targetFolder)) {
    throw new err.UndefinedTargetFolder();
  }
  self.targetFolder = targetFolder;

  // URL list for sitemap
  self.urls = urls || [];
  if ( !(self.urls instanceof Array) ) {
    self.urls = [ self.urls ];
  }

  self.chunks = ut.chunkArray(self.urls, self.sitemapSize);
  self.callback = callback;

  // One pending write per chunk plus one for the index file.
  var pending = self.chunks.length + 1
    , settled = false;

  // Bookkeeping shared by all streams: reports the first error, or success
  // once the last file has been flushed. Never calls back twice.
  function settle(error) {
    if (settled) {
      return;
    }
    if (error) {
      settled = true;
      if (typeof self.callback === 'function') {
        self.callback(error);
      }
      return;
    }
    pending--;
    if (pending === 0) {
      settled = true;
      if (typeof self.callback === 'function') {
        self.callback(null, true);
      }
    }
  }

  // Writes `data` to `filename` inside the target folder.
  function writeFile(filename, data) {
    var stream = self.fs.createWriteStream(targetFolder + '/' + filename);
    // Only intercept errors when there is a callback to report them to;
    // otherwise leave the stream's default behaviour (unhandled 'error').
    if (typeof self.callback === 'function') {
      stream.once('error', settle);
    }
    // 'finish' fires after all data has been flushed, unlike 'open'.
    stream.once('finish', function () {
      settle();
    });
    stream.end(data);
  }

  self.chunks.forEach( function (chunk) {
    var extension = '.xml' + (gzip ? '.gz' : '')
      , filename = self.sitemapName + '-' + self.sitemapId++ + extension;

    self.sitemaps.push(filename);

    var sitemap = createSitemap ({
      hostname: self.hostname,
      cacheTime: self.cacheTime,
      urls: chunk,
      xslUrl: self.xslUrl
    });

    writeFile(filename, gzip ? sitemap.toGzip() : sitemap.toString());
  });

  var xml = [];

  xml.push('<?xml version="1.0" encoding="UTF-8"?>');
  if (self.xslUrl) {
    xml.push('<?xml-stylesheet type="text/xsl" href="' + self.xslUrl + '"?>');
  }
  xml.push('<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">');

  self.sitemaps.forEach( function (sitemap) {
    xml.push('<sitemap>');
    xml.push('<loc>' + hostname + '/' + sitemap + '</loc>');
    xml.push('</sitemap>');
  });

  xml.push('</sitemapindex>');

  writeFile(self.sitemapName + '-index.xml', xml.join('\n'));
}