Commit 4f2fc03c authored by Thomas Urban

revising URL grabber to always use redirection targets

parent face5ef8
......@@ -102,7 +102,7 @@ class Client extends EventEmitter {
* Fetches provided URL using GET method.
*
* @param {string} url URL to be fetched using GET method
* @returns {Promise<{headers:object<string,string>, body:Buffer}>} promises response headers and body
* @returns {Promise<{url:string, headers:object<string,string>, body:Buffer}>} promises response headers and body
*/
fetch( url ) {
this.logInfo( `fetching URL ${url}` );
......@@ -166,6 +166,7 @@ class Client extends EventEmitter {
this.logInfo( `fetched URL ${url}` );
resolve( {
url,
headers: res.headers,
body: Buffer.concat( chunks ),
} );
......@@ -253,6 +254,7 @@ class Client extends EventEmitter {
}
const cacheFile = Path.resolve( __dirname, new URL( StartUrl ).hostname + ".cache" );
new Promise( resolve => {
......@@ -292,8 +294,8 @@ new Promise( resolve => {
}
grabber.fetch( url )
.then( ( { body } ) => {
found[url] = true;
.then( ( { url: actualUrl, body } ) => {
found[actualUrl] = true;
const numFound = Object.keys( found ).length;
grabber.logInfo( `got ${numFound}/${MaxUrlCount} URLs (queued: ${queue.length})` );
......@@ -312,9 +314,11 @@ new Promise( resolve => {
let newUrl = match[2];
if ( !newUrl.startsWith( StartUrl ) ) {
newUrl = Url.resolve( url, newUrl );
newUrl = Url.resolve( actualUrl, newUrl );
}
newUrl = newUrl.replace( /#.*$/, "" );
if ( !found[newUrl] && queue.indexOf( newUrl ) < 0 && newUrl.startsWith( StartUrl ) ) {
queue.push( newUrl );
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment