两学一做专题教育网站,知识库主题 wordpress,资阳房地产网站建设,嘉兴模板建站代理JavaScript动态渲染界面爬取-Selenium实战
爬取的网页为：https://spa2.scrape.center ，里面的内容都是通过Ajax渲染出来的，在分析xhr的时候发现url里面有token参数，所以我们使用selenium自动化工具来爬取JavaScript渲染的界面。
fr…JavaScript动态渲染界面爬取-Selenium实战
爬取的网页为：https://spa2.scrape.center ，里面的内容都是通过Ajax渲染出来的，在分析xhr的时候发现url里面有token参数，所以我们使用selenium自动化工具来爬取JavaScript渲染的界面。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
import logging
from selenium.webdriver.support import expected_conditions
import re
import json
from os import makedirs
from os.path import exists

# Logging configuration: timestamp, level and message on every record.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')
# Listing-page URL template; the {page} placeholder is filled in by main().
url = 'https://spa2.scrape.center/page/{page}'
# Selenium initialisation (starts a Chrome session at import time).
browser = webdriver.Chrome()
# Explicit-wait helper bound to the browser, created with a timeout argument of 10.
wait = WebDriverWait(browser, 10)
# Detail-page links collected so far; main() appends to this list.
book_url = []

# Output directory for the per-book JSON files.
RESULTS_DIR = 'results'
# exist_ok avoids the check-then-create race of `exists(...) or makedirs(...)`.
makedirs(RESULTS_DIR, exist_ok=True)
# 任意异常
class ScraperError(Exception):
    """Generic scraper exception type, derived directly from Exception.

    It adds no behaviour of its own; SaveDetail and main name it in their
    ``except`` clauses.
    """


# Collect the book links of one listing page
def PageDetail(URL):
    """Load a listing page and return the elements that link to each book.

    Args:
        URL: Address of the listing page to open in the shared browser.

    Returns:
        The elements matching ``.el-card .name`` once they are present.
        An empty list is returned when the wait times out, so callers can
        always iterate over the result (previously the function fell
        through and returned None).
    """
    browser.get(URL)
    try:
        return wait.until(
            expected_conditions.presence_of_all_elements_located(
                (By.CSS_SELECTOR, '.el-card .name')
            )
        )
    except TimeoutException:
        logging.info('Time error happen in %s while finding the href', URL)
        return []


# Fetch the details of each book
def GetDetail(book_list):
    """Open every detail page in ``book_list`` and save what was scraped.

    For each address the page is loaded in the shared browser, then the
    title, the category labels and the description text are read and handed
    to SaveDetail as a dict with the keys ``URL``, ``book_name``,
    ``categories`` and ``content``.

    A timeout on one page is logged and that page is skipped; the remaining
    pages are still processed (a single ``try`` around the whole loop used
    to abandon every later book after the first timeout).

    Args:
        book_list: Iterable of detail-page addresses.
    """
    for book in book_list:
        try:
            browser.get(book)
            URL = browser.current_url
            book_name = wait.until(
                expected_conditions.presence_of_element_located(
                    (By.CLASS_NAME, 'm-b-sm')
                )
            ).text
            category_nodes = wait.until(
                expected_conditions.presence_of_all_elements_located(
                    (By.CSS_SELECTOR, '.categories button span')
                )
            )
            categories = [node.text for node in category_nodes]
            content = wait.until(
                expected_conditions.presence_of_element_located(
                    (By.CSS_SELECTOR, '.item .drama p[data-v-f7128f80]')
                )
            ).text
        except TimeoutException:
            logging.info('Time error happen in %s while finding the book detail', browser.current_url)
            continue

        detail = {
            'URL': URL,
            'book_name': book_name,
            'categories': categories,
            'content': content,
        }
        SaveDetail(detail)


# Save as a JSON file
def SaveDetail(detail):
    """Write one book record to ``<RESULTS_DIR>/<book name>.json``.

    The book name is sanitised for use as a file name and the sanitised
    value is stored back into ``detail['book_name']`` before the dict is
    dumped as UTF-8 JSON.

    Args:
        detail: Dict describing a book; must contain ``book_name``.
    """
    # NOTE(review): the character class was damaged when this code was
    # extracted from the web page; it is restored here as the usual set of
    # characters that are invalid in file names -- confirm against the
    # original script.
    cleaned_name = re.sub(r'[\\/:*?"<>|]', '_', detail.get('book_name'))
    detail['book_name'] = cleaned_name
    data_path = f'{RESULTS_DIR}/{cleaned_name}.json'
    logging.info('Saving Book %s...', cleaned_name)
    try:
        # The context manager closes the file even when json.dump raises.
        with open(data_path, 'w', encoding='utf-8') as out_file:
            json.dump(detail, out_file, ensure_ascii=False, indent=2)
    except (OSError, ScraperError):
        # File-system failures surface as OSError; ScraperError is kept for
        # compatibility with the original handler.
        logging.exception('Some error happen in %s while saving the book detail', cleaned_name)
    else:
        logging.info('Saving Book %s over', cleaned_name)


# Main function
def main():
    """Scrape listing pages 1-10 and save every book found on them.

    For each listing page the detail links are collected first (the
    elements are read before the browser navigates away), appended to the
    module-level ``book_url`` list, and only the links of that page are
    passed to GetDetail, so no book is processed twice. The browser session
    is shut down when the run ends, whether or not an error occurred.
    """
    try:
        for page in range(1, 11):
            # `or ()` tolerates a PageDetail that returns None on timeout.
            page_elements = PageDetail(url.format(page=page)) or ()
            hrefs = [element.get_attribute('href') for element in page_elements]
            # Skip elements without an href; browser.get(None) would fail.
            page_links = [href for href in hrefs if href]
            book_url.extend(page_links)
            GetDetail(page_links)
    except ScraperError:
        logging.info('An abnormal position has occurred')
    finally:
        # quit() ends the whole driver session; close() only closes the
        # current window.
        browser.quit()


if __name__ == '__main__':
    main()
http://www.morning.tllws.cn.gov.cn.tllws.cn http://www.morning.mkyxp.cn.gov.cn.mkyxp.cn http://www.morning.hdzty.cn.gov.cn.hdzty.cn http://www.morning.qlwfz.cn.gov.cn.qlwfz.cn http://www.morning.pwbps.cn.gov.cn.pwbps.cn http://www.morning.glbnc.cn.gov.cn.glbnc.cn http://www.morning.clbsd.cn.gov.cn.clbsd.cn http://www.morning.pynzj.cn.gov.cn.pynzj.cn http://www.morning.llcgz.cn.gov.cn.llcgz.cn http://www.morning.crtgd.cn.gov.cn.crtgd.cn http://www.morning.zrfwz.cn.gov.cn.zrfwz.cn http://www.morning.mxmtt.cn.gov.cn.mxmtt.cn http://www.morning.ddxjr.cn.gov.cn.ddxjr.cn http://www.morning.xdnhw.cn.gov.cn.xdnhw.cn http://www.morning.mqwnp.cn.gov.cn.mqwnp.cn http://www.morning.zffn.cn.gov.cn.zffn.cn http://www.morning.wxqmc.cn.gov.cn.wxqmc.cn http://www.morning.qrlsy.cn.gov.cn.qrlsy.cn http://www.morning.fldrg.cn.gov.cn.fldrg.cn http://www.morning.zcnfm.cn.gov.cn.zcnfm.cn http://www.morning.btmwd.cn.gov.cn.btmwd.cn http://www.morning.jkcpl.cn.gov.cn.jkcpl.cn http://www.morning.lgpzq.cn.gov.cn.lgpzq.cn http://www.morning.cnlmp.cn.gov.cn.cnlmp.cn http://www.morning.jyznn.cn.gov.cn.jyznn.cn http://www.morning.zdqsc.cn.gov.cn.zdqsc.cn http://www.morning.wbxtx.cn.gov.cn.wbxtx.cn http://www.morning.xxrgt.cn.gov.cn.xxrgt.cn http://www.morning.hkgcx.cn.gov.cn.hkgcx.cn http://www.morning.ydrml.cn.gov.cn.ydrml.cn http://www.morning.iiunion.com.gov.cn.iiunion.com http://www.morning.wqnc.cn.gov.cn.wqnc.cn http://www.morning.cptzd.cn.gov.cn.cptzd.cn http://www.morning.xrct.cn.gov.cn.xrct.cn http://www.morning.bmpjp.cn.gov.cn.bmpjp.cn http://www.morning.sqskm.cn.gov.cn.sqskm.cn http://www.morning.sskhm.cn.gov.cn.sskhm.cn http://www.morning.pamdeer.com.gov.cn.pamdeer.com http://www.morning.pdwzr.cn.gov.cn.pdwzr.cn http://www.morning.tmfhx.cn.gov.cn.tmfhx.cn http://www.morning.rbmnq.cn.gov.cn.rbmnq.cn