爬虫 谷歌浏览器版本配置汇总(自用)
创建对象池,通过对象池获取(销毁)webclient对象
/**
 * Selenium ChromeDriver helper: builds a configured {@link ChromeDriver}, loads a URL with it and
 * hands the loaded page to {@link #catchInfoOnHtml(ChromeDriver)}.
 *
 * <p>The driver binary path, the driver log file and the active Spring profile are injected into
 * instance fields and copied into static fields by {@link #init()}.
 *
 * @author: miracle-god
 * @time: 2021/3/26 13:35
 */
@Slf4j
@Component
public class HttpPcDownloadUtils {
    private static String webdriverChromeDriver;
    private static String webdriverChromeLogfile;
    private static String devFlag;
    @Value("${webdriver.chrome.driver}")
    private String webdriverChromeDriverProp;
    @Value("${webdriver.chrome.logfile}")
    private String webdriverChromeLogfileProp;
    @Value("${spring.profiles.active}")
    private String devFlagProp;

    /**
     * Copies the injected property values into the static fields read by
     * {@link #getChromeDriver(ChromeDriver, String, String, Integer)}.
     */
    @PostConstruct
    public void init() {
        webdriverChromeLogfile = webdriverChromeLogfileProp;
        webdriverChromeDriver = webdriverChromeDriverProp;
        devFlag = devFlagProp;
    }

    /**
     * Extension hook: subclasses override this method to scrape the page currently loaded in the
     * driver. The default implementation only logs a message.
     *
     * @param driver the driver that has just loaded the requested page
     */
    public void catchInfoOnHtml(ChromeDriver driver) {
        log.info("-----------------do nothing---------------------------");
    }

    /**
     * Obtains a browser driver without device emulation.
     *
     * @param chromeDriver a previously used driver to dispose of, or {@code null}
     * @param proxyIp      proxy host, or blank for a direct connection
     * @param proxyPort    proxy port, or blank for a direct connection
     * @return a newly started driver
     * @author miracle-god
     * @date 2021/7/21 20:31
     **/
    public ChromeDriver getChromeDriver(ChromeDriver chromeDriver, String proxyIp, String proxyPort) {
        return getChromeDriver(chromeDriver, proxyIp, proxyPort, null);
    }

    /**
     * Obtains a browser driver.
     *
     * <p>Any driver passed in is disposed of (best effort) and a new one is always started. The
     * previous implementation quit the passed driver and then kept using it, so every call with
     * a non-null driver failed on the first command sent to the closed session.
     *
     * @param chromeDriver a previously used driver to dispose of, or {@code null}
     * @param proxyIp      proxy host, or blank for a direct connection
     * @param proxyPort    proxy port, or blank for a direct connection
     * @param deviceFlag   when equal to {@code YxcBdKeywordsRankDTO.DEVICE_MOBILE} a "Nexus 5"
     *                     mobile emulation is enabled; {@code null} or any other value means none
     * @return a newly started driver
     * @author miracle-god
     * @date 2021/7/21 20:31
     **/
    public ChromeDriver getChromeDriver(ChromeDriver chromeDriver, String proxyIp, String proxyPort, Integer deviceFlag) {
        // The old driver may already have been quit (e.g. by a previous retry attempt), so its
        // disposal must never abort the creation of the new one.
        quitQuietly(chromeDriver);

        ChromeOptions chromeOptions = buildChromeOptions(deviceFlag);

        // System.setProperty rejects null values; skip the properties that were not configured so
        // Selenium can report the missing driver itself.
        if (webdriverChromeDriver != null) {
            System.setProperty("webdriver.chrome.driver", webdriverChromeDriver);
        }
        // Driver logging
        if (webdriverChromeLogfile != null) {
            System.setProperty("webdriver.chrome.logfile", webdriverChromeLogfile);
        }
        System.setProperty("webdriver.chrome.verboseLogging", "true");

        DesiredCapabilities cap = buildCapabilities(chromeOptions, proxyIp, proxyPort);
        ChromeDriver freshDriver = new ChromeDriver(cap);
        try {
            // Works around detection through window.navigator.webdriver.
            // NOTE(review): this script runs in the document that is current at this point;
            // presumably it has to be re-applied after navigation - confirm. The original notes
            // mention injecting stealth.min.js via executeCdpCommand, which needs Selenium 4.
            freshDriver.executeScript("Object.defineProperties(navigator, {webdriver:{get:()=>undefined}});");
        } catch (RuntimeException e) {
            // Do not leave a browser process behind when the set-up script fails.
            quitQuietly(freshDriver);
            throw e;
        }
        return freshDriver;
    }

    /**
     * Simulates browser behaviour with mobile emulation: starts a driver and requests the URL.
     * Retried by Spring Retry on {@code BizException}, up to 5 attempts with a 150 ms delay.
     *
     * @param driver    a previously used driver to dispose of, or {@code null}
     * @param url       the URL to load; a blank URL skips navigation
     * @param proxyIp   proxy host, or blank for a direct connection
     * @param proxyPort proxy port, or blank for a direct connection
     */
    @Retryable(value = {BizException.class}, maxAttempts = 5, backoff = @Backoff(delay = 150L, multiplier = 1))
    public void simulationMobileUrl(ChromeDriver driver, String url, String proxyIp, String proxyPort) {
        // Obtain the browser driver; the constant is qualified the same way as in getChromeDriver.
        driver = getChromeDriver(driver, proxyIp, proxyPort, YxcBdKeywordsRankDTO.DEVICE_MOBILE);
        requestUrl(driver, url);
    }

    /** Builds the Chrome options shared by every driver created by this class. */
    private ChromeOptions buildChromeOptions(Integer deviceFlag) {
        ChromeOptions chromeOptions = new ChromeOptions();
        // On Linux without a graphical environment Chrome fails to start unless headless mode is on.
        if (!System.getProperty("os.name").contains("Windows")) {
            chromeOptions.setHeadless(Boolean.TRUE);
            chromeOptions.addArguments("--no-sandbox", "--disable-dev-shm-usage", "--disable-gpu", "blink-settings=imagesEnabled=false", "--headless");
        }
        chromeOptions.addArguments("--start-maximized");
        chromeOptions.addArguments("window-size=1920x3000");
        chromeOptions.addArguments("--hide-scrollbars");
        chromeOptions.addArguments("lang=zh_CN.UTF-8");
        // NOTE(review): the header-like arguments below (and the Referer one further down, which
        // starts with an en dash) do not look like Chrome switches - confirm they have any effect.
        chromeOptions.addArguments("Accept=*/*");
        chromeOptions.addArguments("Accept-Encoding=gzip, deflate, br");
        chromeOptions.addArguments("Accept-Language=zh-CN,zh;q=0.9");
        // Compare by value so a boxed constant is not compared by reference.
        if (deviceFlag != null && deviceFlag.intValue() == YxcBdKeywordsRankDTO.DEVICE_MOBILE) {
            // Emulate a mobile device
            Map<String, String> mobileEmulation = new HashMap<>();
            mobileEmulation.put("deviceName", "Nexus 5");
            chromeOptions.setExperimentalOption("mobileEmulation", mobileEmulation);
        }
        /*
         * (1) NONE: Selenium returns as soon as the HTML has been downloaded, without waiting for parsing
         * (2) EAGER: waits for the DOM tree (DOMContentLoaded); only the HTML is downloaded and parsed
         * (3) NORMAL: waits for the whole page, i.e. HTML and sub-resources such as JS files and
         *     images, but not ajax
         */
        chromeOptions.setPageLoadStrategy(PageLoadStrategy.NORMAL);
        // Disable GPU acceleration
        chromeOptions.addArguments("--disable-gpu");
        // Disable the "controlled by automated software" info bar
        chromeOptions.addArguments("--disable-infobars");
        chromeOptions.setExperimentalOption("useAutomationExtension", false);
        // Ignore untrusted-certificate errors
        chromeOptions.addArguments("--ignore-certificate-errors");
        chromeOptions.addArguments("--disable-extensions");
        // For local debugging, images can be disabled through the
        // "profile.managed_default_content_settings.images" = 2 preference ("prefs" option).
        // Hide Selenium traits: no "Chrome is being controlled by automated test software" hint
        chromeOptions.setExperimentalOption("excludeSwitches", Collections.singletonList("enable-automation"));
        // Skip the default-browser check
        chromeOptions.addArguments("no-default-browser-check");
        chromeOptions.addArguments("--disable-cache");
        chromeOptions.addArguments("--disk-cache-size=0");
        chromeOptions.addArguments("--disable-icon-ntp");
        chromeOptions.addArguments("--disable-ntp-favicons");
        chromeOptions.addArguments("–Referer=https://www.facebook.com");
        // Pick a random user agent from the configured list
        Random random = new Random();
        String userAgent = UserAgent.userAgent.get(random.nextInt(UserAgent.userAgent.size()));
        chromeOptions.addArguments("user-agent=" + userAgent);
        return chromeOptions;
    }

    /** Wraps the options into capabilities and adds the proxy when both host and port are given. */
    private DesiredCapabilities buildCapabilities(ChromeOptions chromeOptions, String proxyIp, String proxyPort) {
        DesiredCapabilities cap = new DesiredCapabilities();
        // Configure the proxy
        if (StringUtils.isNotBlank(proxyIp) && StringUtils.isNotBlank(proxyPort)) {
            String proxyIpAndPort = proxyIp + ":" + proxyPort;
            Proxy proxy = new Proxy();
            proxy.setHttpProxy(proxyIpAndPort).setFtpProxy(proxyIpAndPort).setSslProxy(proxyIpAndPort);
            cap.setCapability(CapabilityType.ForSeleniumServer.AVOIDING_PROXY, true);
            cap.setCapability(CapabilityType.ForSeleniumServer.ONLY_PROXYING_SELENIUM_TRAFFIC, true);
            System.setProperty("http.nonProxyHosts", "localhost");
            cap.setCapability(CapabilityType.PROXY, proxy);
        }
        // Attach the Chrome options
        cap.setCapability(ChromeOptions.CAPABILITY, chromeOptions);
        return cap;
    }

    /** Best-effort disposal of a driver: clears its cookies and quits it, never throwing. */
    private void quitQuietly(ChromeDriver driver) {
        if (driver == null) {
            return;
        }
        try {
            driver.manage().deleteAllCookies();
        } catch (Exception e) {
            log.warn("[quitQuietly] failed to clear cookies of the old driver", e);
        }
        try {
            driver.quit();
        } catch (Exception e) {
            log.warn("关闭浏览器驱动失败", e);
        }
    }

    /**
     * Loads the URL, runs {@link #catchInfoOnHtml(ChromeDriver)} and always quits the driver.
     *
     * @throws BizException rethrown as is, or raised for any other failure
     */
    //todo change the driver
    private void requestUrl(ChromeDriver driver, String url) {
        try {
            log.info("[simulationUrl]正在请求url:{}", url);
            if (StringUtils.isNotBlank(url)) {
                driver.get(url);
            }
            catchInfoOnHtml(driver);
        } catch (BizException e) {
            log.warn("浏览器驱动捕捉到异常:", e);
            throw e;
        } catch (Exception e) {
            log.warn("浏览器驱动捕捉到异常:", e);
            throw new BizException("浏览器异常");
        } finally {
            try {
                // Clear all cookies and local storage
                driver.manage().deleteAllCookies();
                driver.getLocalStorage().clear();
            } catch (Exception e) {
                log.warn("[requestUrl] failed to clear cookies/localStorage before quit", e);
            } finally {
                // quit() sits in its own finally so a failed clean-up above cannot leak the browser.
                try {
                    driver.quit();
                } catch (Exception e) {
                    log.warn("关闭浏览器驱动失败", e);
                }
            }
        }
    }
}
//google-chrome v95
//chromedriver.exe 从ChromeDriver Mirror (taobao.org) 查找对应的版本
//pom版本
/**<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.35.0</version>
</dependency>
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>3.141.59</version>
</dependency> **/
//顺手放一个htmlunit版本
/**
* @description: webclient对象
* @author: miracle-god
* @time: 2021/7/22 10:18
*/
@Slf4j
public class HtmlUnitFactory extends BasePooledObjectFactory<WebClient> {
/**
 * Creates a {@code WebClient} (headless browser) configured for crawling.
 *
 * @return the configured client
 * @author miracle-god
 * @date 2021/7/22 13:38
 **/
@Override
public WebClient create() {
    log.info("[HtmlUnitFactory]create webclient start");

    // Listener that only logs each JavaScript problem instead of reacting to it.
    JavaScriptErrorListener loggingOnlyListener = new JavaScriptErrorListener() {
        @Override
        public void scriptException(HtmlPage page, ScriptException scriptException) {
            log.info("[JavaScriptErrorListener] scriptException do noting");
        }

        @Override
        public void timeoutError(HtmlPage page, long allowedTime, long executionTime) {
            log.info("[JavaScriptErrorListener] timeoutError do noting");
        }

        @Override
        public void malformedScriptURL(HtmlPage page, String url, MalformedURLException malformedURLException) {
            log.info("[JavaScriptErrorListener] malformedScriptURL do noting");
        }

        @Override
        public void loadScriptError(HtmlPage page, URL scriptUrl, Exception exception) {
            log.info("[JavaScriptErrorListener] loadScriptError do noting");
        }

        @Override
        public void warn(String message, String sourceName, int line, String lineSource, int lineOffset) {
            log.info("[JavaScriptErrorListener] warn do noting");
        }
    };

    // Firefox flavour
    WebClient client = new WebClient(BrowserVersion.FIREFOX);

    // Network: accept untrusted certificates, follow redirects, 5 second timeout
    client.getOptions().setUseInsecureSSL(true);
    client.getOptions().setRedirectEnabled(true);
    client.getOptions().setTimeout(5 * 1000);

    // Content: skip CSS and images to save bandwidth
    client.getOptions().setCssEnabled(false);
    client.getOptions().setDownloadImages(false);

    // Error handling: neither failing status codes nor script errors raise exceptions
    client.getOptions().setThrowExceptionOnFailingStatusCode(false);
    client.getOptions().setThrowExceptionOnScriptError(false);

    // JavaScript enabled, Do-Not-Track enabled, history limited to 50 entries
    client.getOptions().setJavaScriptEnabled(true);
    client.getOptions().setDoNotTrackEnabled(true);
    client.getOptions().setHistorySizeLimit(50);

    // Cookie handling is switched off
    client.getCookieManager().setCookiesEnabled(false);
    // Cache size limit
    client.getCache().setMaxSize(100);

    client.setJavaScriptErrorListener(loggingOnlyListener);
    // Ajax support
    client.setAjaxController(new NicelyResynchronizingAjaxController());
    return client;
}
/**
 * Wraps a client into the pooled-object holder used by the pool.
 *
 * @param webClient the client to wrap
 * @return a {@code DefaultPooledObject} holding the client
 */
@Override
public PooledObject<WebClient> wrap(WebClient webClient) {
    PooledObject<WebClient> pooledClient = new DefaultPooledObject<>(webClient);
    return pooledClient;
}
/**
 * Destroys a pooled client by closing it; a {@code null} wrapper is ignored.
 *
 * @param webClientPooledObject the pooled wrapper whose client should be closed
 */
@Override
public void destroyObject(PooledObject<WebClient> webClientPooledObject) {
    if (webClientPooledObject != null) {
        webClientPooledObject.getObject().close();
    }
}

浙公网安备 33010602011771号