diff --git a/lib/readability.js b/lib/readability.js index c8b5a52..63351e4 100644 --- a/lib/readability.js +++ b/lib/readability.js @@ -1,6 +1,7 @@ /*jslint undef: true, nomen: true, eqeqeq: true, plusplus: true, newcap: true, immed: true, browser: true, devel: true, passfail: false */ /*global window: false, readConvertLinksToFootnotes: false, readStyle: false, readSize: false, readMargin: false, Typekit: false, ActiveXObject: false */ - +var Buffer = require('buffer').Buffer; +var Iconv = require('iconv').Iconv; var dbg = (typeof console !== 'undefined') ? function(s) { if (readability.debugging) { console.log("Readability: " + s); @@ -41,7 +42,9 @@ var readability = { maxPages: 30, /* The maximum number of pages to loop through before we call it quits and just show a link. */ parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */ pageETags: {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */ - + success: function (html) { + + }, /** * All of the regular expressions in use within readability. * Defined up here so we don't instantiate them repeatedly in loops. @@ -91,7 +94,6 @@ var readability = { /* Pull out any possible next page link first */ var nextPageLink = readability.findNextPageLink(document.body); - readability.prepDocument(); /* Build readability's DOM tree */ @@ -188,6 +190,8 @@ var readability = { window.setTimeout(function() { readability.appendNextPage(nextPageLink); }, 500); + } else { + readability.success(document.body.innerHTML); } /** Smooth scrolling **/ @@ -1408,49 +1412,26 @@ timed(function(){ * TODO: This could likely be simplified beyond what we have here right now. There's still a bit of excess junk. **/ xhr: function () { - if (typeof XMLHttpRequest !== 'undefined' && (window.location.protocol !== 'file:' || !window.ActiveXObject)) { + /*if (typeof XMLHttpRequest !== 'undefined' && (window.location.protocol !== 'file:' || !window.ActiveXObject)) { return new XMLHttpRequest(); } else { try { return new ActiveXObject('Msxml2.XMLHTTP.6.0'); } catch(sixerr) { } try { return new ActiveXObject('Msxml2.XMLHTTP.3.0'); } catch(threrr) { } try { return new ActiveXObject('Msxml2.XMLHTTP'); } catch(err) { } - } + }*/ + var request = require('request'); - return false; + return request; }, successfulRequest: function (request) { return (request.status >= 200 && request.status < 300) || request.status === 304 || (request.status === 0 && request.responseText); }, - ajax: function (url, options) { + ajax: function (url, callback) { var request = readability.xhr(); - - function respondToReadyState(readyState) { - if (request.readyState === 4) { - if (readability.successfulRequest(request)) { - if (options.success) { options.success(request); } - } - else { - if (options.error) { options.error(request); } - } - } - } - - if (typeof options === 'undefined') { options = {}; } - - request.onreadystatechange = respondToReadyState; - - request.open('get', url, true); - request.setRequestHeader('Accept', 'text/html'); - - try { - request.send(options.postBody); - } - catch (e) { - if (options.error) { options.error(); } - } + request({url:url, 'encoding':'binary'}, callback); return request; }, @@ -1482,11 +1463,20 @@ timed(function(){ * asynchronously and load the cleaned content into the div we created for it. **/ (function(pageUrl, thisPage) { - readability.ajax(pageUrl, { - success: function(r) { - + readability.ajax(pageUrl, function(error, r, html) { + var encoding = undefined; + if(r['headers']['content-type']) { + var content_type = r['headers']['content-type'].split('='); + if(content_type.length == 2) encoding = content_type[1].toUpperCase(); + } + if(encoding) { + body = new Buffer(html, 'binary'); + iconv = new Iconv(encoding, 'utf8'); + html = iconv.convert(body).toString('utf8'); + } + r.responseText = html; /* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */ - var eTag = r.getResponseHeader('ETag'); + var eTag = r['headers']['ETag']; if(eTag) { if(eTag in readability.pageETags) { dbg("Exact duplicate page found via ETag. Aborting."); @@ -1562,8 +1552,9 @@ timed(function(){ if(nextPageLink) { readability.appendNextPage(nextPageLink); + } else { + readability.success(document.body.innerHTML); } - } }); }(nextPageLink, articlePage)); }, @@ -1961,7 +1952,7 @@ var jsdom = require('jsdom'), var R = readability; var patch = { reComma: /[\uff0c,]/, // chinese comma, too - findNextPageLink: function() {return null;}, + /*findNextPageLink: function() {return null;},*/ getArticleTools: function() {return document.createElement('div');}, getArticleTitle: (function() { var old = R.getArticleTitle; @@ -2195,6 +2186,7 @@ function start(w, options, cb) { if (options.profile) { MyProfiler.reset(); } + readability.success = cb; readability.init(); @@ -2202,9 +2194,9 @@ function start(w, options, cb) { if (options.removeReadabilityArtifacts) removeReadabilityArtifacts(); if (options.removeClassNames) removeClassNames(); - + document.body.innerHTML = '
' + document.body.innerHTML + '
'; //dbg('[Readability] done'); - cb(document.body.innerHTML); + //cb(document.body.innerHTML); } var HTML5; @@ -2225,7 +2217,11 @@ exports.parse = function parse(theHtml, url, options, callback) { removeClassNames: true }; options = Utils.extend({}, defaultOptions, options); - + if(options.encoding && options.encoding != 'utf8') { + body = new Buffer(theHtml, 'binary'); + iconv = new Iconv(options.encoding, 'utf8'); + theHtml = iconv.convert(body).toString('utf8'); + } var startTime = new Date().getTime(); //dbg(html); var html = theHtml.replace(/]*>([\s\S]*?)<\/script>/gi, ''); @@ -2239,7 +2235,7 @@ exports.parse = function parse(theHtml, url, options, callback) { features : { FetchExternalResources : [], ProcessExternalResources : false - } + }, }; function createDocWithHTMLParser() { @@ -2279,7 +2275,7 @@ exports.parse = function parse(theHtml, url, options, callback) { return callback({title: '', content: '', error: true}); } - dbg('---DOM created'); + //dbg('---DOM created'); var win = doc.parentWindow; win = win || doc.createWindow(); //for backward compatibility with jsdom <= 0.1.20 diff --git a/package.json b/package.json index 417beaa..5de26fb 100644 --- a/package.json +++ b/package.json @@ -42,7 +42,9 @@ "dependencies": { "mjsunit.runner": ">=0.1.0", "jsdom": ">=0.1.21", - "htmlparser": ">=1.7.3" + "htmlparser": ">=1.7.3", + "html5":">0.1", + "iconv":">=1.1.3" }, "engines" : { "node" : ">=0.2.5" }, "directories": { diff --git a/test/multi-page.js b/test/multi-page.js new file mode 100644 index 0000000..67cbace --- /dev/null +++ b/test/multi-page.js @@ -0,0 +1,17 @@ +var readability = require('../lib/readability'), + request = require('request'), + encoding = 'utf8'; +var url = "http://www.washingtonpost.com/world/national-security/manhunt-details-us-mission-to-find-osama-bin-laden/2012/04/27/gIQAz5pLoT_story.html"; + + +request({url:url, 'encoding':'binary'}, function (error, response, html) { + if(response['headers']['content-type']) { + var content_type = response['headers']['content-type'].split('='); + if(content_type.length == 2) encoding = content_type[1].toUpperCase(); + } + if(!error && response.statusCode == 200) { + readability.parse(html, url, {encoding:encoding}, function(result) { + console.log(result.title, result.content); + }); + } +}); \ No newline at end of file