diff --git a/lib/readability.js b/lib/readability.js index c8b5a52..63351e4 100644 --- a/lib/readability.js +++ b/lib/readability.js @@ -1,6 +1,7 @@ /*jslint undef: true, nomen: true, eqeqeq: true, plusplus: true, newcap: true, immed: true, browser: true, devel: true, passfail: false */ /*global window: false, readConvertLinksToFootnotes: false, readStyle: false, readSize: false, readMargin: false, Typekit: false, ActiveXObject: false */ - +var Buffer = require('buffer').Buffer; +var Iconv = require('iconv').Iconv; var dbg = (typeof console !== 'undefined') ? function(s) { if (readability.debugging) { console.log("Readability: " + s); @@ -41,7 +42,9 @@ var readability = { maxPages: 30, /* The maximum number of pages to loop through before we call it quits and just show a link. */ parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */ pageETags: {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */ - + success: function (html) { + + }, /** * All of the regular expressions in use within readability. * Defined up here so we don't instantiate them repeatedly in loops. @@ -91,7 +94,6 @@ var readability = { /* Pull out any possible next page link first */ var nextPageLink = readability.findNextPageLink(document.body); - readability.prepDocument(); /* Build readability's DOM tree */ @@ -188,6 +190,8 @@ var readability = { window.setTimeout(function() { readability.appendNextPage(nextPageLink); }, 500); + } else { + readability.success(document.body.innerHTML); } /** Smooth scrolling **/ @@ -1408,49 +1412,26 @@ timed(function(){ * TODO: This could likely be simplified beyond what we have here right now. There's still a bit of excess junk. **/ xhr: function () { - if (typeof XMLHttpRequest !== 'undefined' && (window.location.protocol !== 'file:' || !window.ActiveXObject)) { + /*if (typeof XMLHttpRequest !== 'undefined' && (window.location.protocol !== 'file:' || !window.ActiveXObject)) { return new XMLHttpRequest(); } else { try { return new ActiveXObject('Msxml2.XMLHTTP.6.0'); } catch(sixerr) { } try { return new ActiveXObject('Msxml2.XMLHTTP.3.0'); } catch(threrr) { } try { return new ActiveXObject('Msxml2.XMLHTTP'); } catch(err) { } - } + }*/ + var request = require('request'); - return false; + return request; }, successfulRequest: function (request) { return (request.status >= 200 && request.status < 300) || request.status === 304 || (request.status === 0 && request.responseText); }, - ajax: function (url, options) { + ajax: function (url, callback) { var request = readability.xhr(); - - function respondToReadyState(readyState) { - if (request.readyState === 4) { - if (readability.successfulRequest(request)) { - if (options.success) { options.success(request); } - } - else { - if (options.error) { options.error(request); } - } - } - } - - if (typeof options === 'undefined') { options = {}; } - - request.onreadystatechange = respondToReadyState; - - request.open('get', url, true); - request.setRequestHeader('Accept', 'text/html'); - - try { - request.send(options.postBody); - } - catch (e) { - if (options.error) { options.error(); } - } + request({url:url, 'encoding':'binary'}, callback); return request; }, @@ -1482,11 +1463,20 @@ timed(function(){ * asynchronously and load the cleaned content into the div we created for it. **/ (function(pageUrl, thisPage) { - readability.ajax(pageUrl, { - success: function(r) { - + readability.ajax(pageUrl, function(error, r, html) { + var encoding = undefined; + if(r['headers']['content-type']) { + var content_type = r['headers']['content-type'].split('='); + if(content_type.length == 2) encoding = content_type[1].toUpperCase(); + } + if(encoding) { + body = new Buffer(html, 'binary'); + iconv = new Iconv(encoding, 'utf8'); + html = iconv.convert(body).toString('utf8'); + } + r.responseText = html; /* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */ - var eTag = r.getResponseHeader('ETag'); + var eTag = r['headers']['ETag']; if(eTag) { if(eTag in readability.pageETags) { dbg("Exact duplicate page found via ETag. Aborting."); @@ -1562,8 +1552,9 @@ timed(function(){ if(nextPageLink) { readability.appendNextPage(nextPageLink); + } else { + readability.success(document.body.innerHTML); } - } }); }(nextPageLink, articlePage)); }, @@ -1961,7 +1952,7 @@ var jsdom = require('jsdom'), var R = readability; var patch = { reComma: /[\uff0c,]/, // chinese comma, too - findNextPageLink: function() {return null;}, + /*findNextPageLink: function() {return null;},*/ getArticleTools: function() {return document.createElement('div');}, getArticleTitle: (function() { var old = R.getArticleTitle; @@ -2195,6 +2186,7 @@ function start(w, options, cb) { if (options.profile) { MyProfiler.reset(); } + readability.success = cb; readability.init(); @@ -2202,9 +2194,9 @@ function start(w, options, cb) { if (options.removeReadabilityArtifacts) removeReadabilityArtifacts(); if (options.removeClassNames) removeClassNames(); - + document.body.innerHTML = '