# python_selenium

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
from openpyxl.reader.excel import load_workbook
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# Driver setup: webdriver_manager fetches a chromedriver automatically. The
# manual alternative is to download the driver archive, unzip it, and copy it
# into the Python folder that sits alongside the Scripts directory.

# Row stride per listing page, used when computing the target spreadsheet row.
ROWS_PER_PAGE = 10


def _text_in_parens(text):
    """Return the part of *text* between the first "(" and the next ")".

    Raises:
        IndexError: if *text* contains no "(".
    """
    return text.split("(")[1].split(")")[0]


def _scrape_page(driver, ws, page_num):
    """Load one listing page and write its articles into worksheet *ws*.

    Args:
        driver: Selenium WebDriver used to load the page.
        ws: openpyxl worksheet that receives title, read count and like count.
        page_num: page number appended to the site URL; it also selects the
            block of rows the page's articles are written to.

    Raises:
        NoSuchElementException: if an article has no title link. Only missing
            or malformed like/read counters are treated as skippable.
    """
    driver.get(f"https://www.baidu.com/{page_num}")
    articles = driver.find_elements(By.XPATH, "/html/body/article")
    # The third <article> (index 2) is left out of the export.
    # NOTE(review): presumably it is not a regular post -- confirm.
    for index, article in enumerate(articles[:2] + articles[3:]):
        row = ROWS_PER_PAGE * (page_num - 1) + index + 1
        title = article.find_element(By.XPATH, "header/h2/a").text
        try:
            like_num = _text_in_parens(
                article.find_element(By.XPATH, "div/a[1]").text
            )
            read_num = _text_in_parens(
                article.find_element(By.XPATH, "div/span[2]").text
            )
        except (IndexError, NoSuchElementException) as exc:
            # Counter element missing or not in "...(N)..." form: skip the
            # article, but say so instead of failing silently.
            print(f"skipped row {row}: {type(exc).__name__}")
            continue
        print(row)
        print(title)
        print(read_num)
        print("*" * 6)
        ws.cell(row=row, column=1, value=title)  # column 1: title
        ws.cell(row=row, column=2, value=read_num)  # column 2: read count
        ws.cell(row=row, column=3, value=like_num)  # column 3: like count


# Open the existing workbook (output.xlsx must already exist) and use its
# active worksheet.
wb = load_workbook('output.xlsx')
ws = wb.active

# One browser session is reused for every page. Selenium 4 takes the driver
# path through a Service object rather than as a positional argument.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
try:
    for page_num in range(84, 154):
        try:
            _scrape_page(driver, ws, page_num)
        finally:
            # Save once per page (not once per article) so progress is kept
            # even when a page fails part-way through.
            wb.save('output.xlsx')
finally:
    # Always close the browser, including on errors or Ctrl+C.
    driver.quit()