我会尝试禁用Javascript,以便网站知道在Puppeteer上Javascript已被禁用(即
<noscript>
标记)。我编写了一个基础类来爬取网站,但我的脚本无法禁用Javascript,因此无法实现该功能。以下是我的代码:// https://dev59.com/b5rga4cB1Zd3GeqPiyki
import puppeteer, { puppeteerErrors, Target, Browser } from "puppeteer";
import { readFileSync } from "fs"
import { helpers } from "./helpers";
import _ from "lodash"
/**
* Base class for all crawler
*/
abstract class BaseCrawler {
public static readonly TOR_PATH = process.env.TOR_PATH ?? "";
public static readonly TOR_PROFILE_PATH = process.env.TOR_PROFILE_PATH ?? "";
public static readonly TORRC_PATH = process.env.TORRC_PATH;
public static headless = false
public readonly browser: Promise<puppeteer.Browser>;
private readonly jsEnabled: boolean;
/**
* get the active page
* @returns null if it couldn't get the active
*/
public async activePage(timeout = 30_000): Promise<puppeteer.Page | null> {
const browser = await this.browser;
var start = new Date().getTime();
while (new Date().getTime() - start < timeout) {
var pages = await browser.pages();
var arr = [];
for (const p of pages) {
if (await p.evaluate(() => { return document.visibilityState == 'visible' })) {
arr.push(p);
}
}
if (arr.length == 1)
return arr[0];
}
return null;
}
constructor(jsEnabled = false, website = "https://google.com") {
console.log(Browser)
this.browser = puppeteer.launch({
headless: BaseCrawler.headless,
//args: ["--proxy-server=socks5://127.0.0.1:9050"],
userDataDir: "./.headless-data"
});
this.jsEnabled = jsEnabled;
this.browser.then(async (b) => {
b.on("targetcreated", async (e: Target) => {
const page = await e.page();
// set a tor useragent
page?.setUserAgent(`Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/${_.random(60, 100)}.0`);
// disable script if it's aksed
if (page?.url()) {
// console.log(page.url().search("chrome://"))
if (page.url().search("chrome://") < 0)
page?.setJavaScriptEnabled(this.jsEnabled)
}
page?.on('request', request => {
if (request.resourceType() === 'script')
request.abort();
else
request.continue();
})
})
})
this.browser.then(async (b) => {
b.newPage()
const page = await b.newPage();
page.goto(website);
})
}
}
/** Bland tor window just made to browse tor */
export class TorWindow extends BaseCrawler {
};
我尝试钩取newPage
函数,但它无法正常工作,因为它会给我以下错误:
PS C:\Users\vince\project\js\crawler-project> yarn run browser
yarn run v1.22.10
$ node . --tor-window
undefined
C:\Users\vince\project\js\crawler-project\\dist\Crawler.js:35
const old_newpage = puppeteer_1.default.Browser.prototype.newPage;
^
TypeError: Cannot read property 'prototype' of undefined
at new BaseCrawler (C:\Users\vince\project\js\crawler-project\dist\Crawler.js:35:57)
at new TorWindow (C:\Users\vince\project\js\crawler-project\dist\Crawler.js:98:1)
at Object.<anonymous> (C:\Users\vince\project\js\crawler-project\dist\index.js:27:5)
at Module._compile (node:internal/modules/cjs/loader:1092:14)
at Object.Module._extensions..js (node:internal/modules/cjs/loader:1121:10)
at Module.load (node:internal/modules/cjs/loader:972:32)
at Function.Module._load (node:internal/modules/cjs/loader:813:14)
at Function.executeUserEntryPoint [as runMain] (node:internal/modules/run_main:76:12)
at node:internal/main/run_main_module:17:47
error Command failed with exit code 1.
info Visit https://yarnpkg.com/en/docs/cli/run for documentation about this command.
// https://dev59.com/b5rga4cB1Zd3GeqPiyki
import puppeteer, { puppeteerErrors, Target, Browser } from "puppeteer";
import { readFileSync } from "fs"
import { helpers } from "./helpers";
import _ from "lodash"
/**
* Base class for all crawler
*/
abstract class BaseCrawler {
public static readonly TOR_PATH = process.env.TOR_PATH ?? "";
public static readonly TOR_PROFILE_PATH = process.env.TOR_PROFILE_PATH ?? "";
public static readonly TORRC_PATH = process.env.TORRC_PATH;
public static headless = false
public readonly browser: Promise<puppeteer.Browser>;
private readonly jsEnabled: boolean;
/**
* get the active page
* @returns null if it couldn't get the active
*/
public async activePage(timeout = 30_000): Promise<puppeteer.Page | null> {
const browser = await this.browser;
var start = new Date().getTime();
while (new Date().getTime() - start < timeout) {
var pages = await browser.pages();
var arr = [];
for (const p of pages) {
if (await p.evaluate(() => { return document.visibilityState == 'visible' })) {
arr.push(p);
}
}
if (arr.length == 1)
return arr[0];
}
return null;
}
constructor(jsEnabled = false, website = "https://check.torproject.org") {
console.log(Browser)
const old_newpage = puppeteer.Browser.prototype.newPage
puppeteer.Browser.prototype.newPage = async () => {
const page = await old_newpage()
page.setJavaScriptEnabled(this.jsEnabled)
return page;
}
this.browser = puppeteer.launch({
headless: BaseCrawler.headless,
args: ["--proxy-server=socks5://127.0.0.1:9050"],
userDataDir: "./.headless-data"
});
this.jsEnabled = jsEnabled;
this.browser.then(async (b) => {
b.on("targetcreated", async (e: Target) => {
const page = await e.page();
// set a tor useragent
page?.setUserAgent(`Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/${_.random(60, 100)}.0`);
// disable script if it's aksed
if (page?.url()) {
// console.log(page.url().search("chrome://"))
if (page.url().search("chrome://") < 0)
page?.setJavaScriptEnabled(this.jsEnabled)
}
page?.on('request', request => {
if (request.resourceType() === 'script')
request.abort();
else
request.continue();
})
})
})
this.browser.then(async (b) => {
b.newPage()
const page = await b.newPage();
page.goto(website);
})
}
}
/** Bland tor window just made to browse tor */
export class TorWindow extends BaseCrawler {
};
page.setJavaScriptEnabled(false)
来禁用 JavaScript,但这个错误与此无关;我建议您从一个更简单的例子开始。 - Vaviloff