# A data-analysis project for the junior-year practice week
……
# Getting the data
At first I scraped Zhilian Zhaopin (zhaopin.com), but the site later switched to loading its listings dynamically with JavaScript, so none of the older scrapers on GitHub work anymore. Writing my own, I tried two approaches.
# Parsing with requests
```python
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from openpyxl import Workbook, load_workbook
import sqlite3

# DRIVER_PATH = 'G:\DLibrary\chromedriver-win64\chromedriver.exe'

# def init_db(dbpath):
#     sql = '''
#         create table IF NOT EXISTS movie250
#         (
#             id integer primary key autoincrement,
#             name text,
#             year text
#         )
#     '''
#     conn = sqlite3.connect(dbpath)
#     cursor = conn.cursor()
#     cursor.execute(sql)
#     conn.commit()
#     conn.close()


def saveData2DB(datalist):
    # append the scraped fields to a plain text file
    with open('./data.txt', 'a', encoding='utf-8') as file:
        for data in datalist:
            file.write(data + '\n')


def getpostinfo(url, headers):
    # strip the original query string and re-attach the tracking parameters
    url = url.split('?')[0]
    url += '?refcode=4019&srccode=401901&preactionid=948d2073-65a0-497b-8123-6f56ca3e1af8'
    r = requests.get(url, headers=headers, timeout=10)
    r.raise_for_status()
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, 'html.parser')
    postinfo = []
    # TODO: find() returns None here -- login / anti-bot verification page?
    print(soup)
    post = soup.find(attrs={'class': 'summary-plane__title'}).text.split(' ')[0]
    desc = soup.find(attrs={'class': 'describtion__detail-content'}).text.split(' ')[0]
    postinfo.append(post)
    postinfo.append(desc)
    print(postinfo)
    saveData2DB(postinfo)


def getonepagelist(url, headers):
    # fetch one page of search results and follow each job link
    r = requests.get(url, headers=headers, timeout=10)
    r.raise_for_status()
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, 'html.parser')
    lsts = soup.find_all(attrs={'class': 'jobinfo__top'})
    print(lsts)
    for lst in lsts:
        href = lst.a['href']
        time.sleep(0.5)
        getpostinfo(href, headers)


if __name__ == '__main__':
    # load the cookie saved locally
    ua = UserAgent()
    # ua.random picks a random User-Agent string
    # headers = {"useragent": ua.random}
    with open("./cookie", "r", encoding="utf-8") as file:
        Cookie = file.read()
    # print(cookie)
    headers = {
        # ':authority': 'i.zhaopin.com',
        # ':method': 'GET',
        # ':path': '/',
        # ':scheme': 'https',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br, zstd',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        # 'Connection': 'keep-alive',
        # 'Content-Length': '1216',
        # 'Content-Type': 'text/plain;charset=UTF-8',
        # 'Host': 'dkapi.geetest.com',
        # 'Origin': 'https://jobs.zhaopin.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0',
        # 'Priority': 'u=0,i',
        'Referer': 'https://jobs.zhaopin.com/',
        'Sec-Ch-Ua': '"Microsoft Edge";v="125", "Chromium";v="125", "Not.A/Brand";v="24"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': 'Windows',
        'Sec-Fetch-Dest': 'script',
        'Sec-Fetch-Mode': 'no-cors',
        'Sec-Fetch-Site': 'same-site',
        'Upgrade-Insecure-Requests': '1',
        'Cookie': Cookie,
    }
    turn = 2
    for i in range(1, turn):
        print(f'正在爬取第{i}页')
        url = f'https://sou.zhaopin.com/?jl=653&kw=Java&p={i}'
        getonepagelist(url, headers)
        print(f'第{i}页爬取完成')
```
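Before blaming the selectors, it is worth checking whether the job data is even present in the raw HTML that requests receives. A quick sanity check (a sketch added here, not part of the original script; the class name `jobinfo__top` is taken from the code above):

```python
import requests

# If the job-card class never appears in the raw HTML, the listings are rendered
# by JavaScript (or replaced by an anti-bot page) and plain requests cannot see them.
html = requests.get(
    'https://sou.zhaopin.com/?jl=653&kw=Java&p=1',
    headers={'User-Agent': 'Mozilla/5.0'},
    timeout=10,
).text
print('jobinfo__top' in html)  # False strongly suggests JS rendering or a verification page
```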
# selenium
```python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time

# initialise the Selenium WebDriver (raw string so the backslashes in the path are not treated as escapes)
service = ChromeService(executable_path=r"G:\DLibrary\chromedriver-win64\chromedriver.exe")
driver = webdriver.Chrome(service=service)


def getpostinfo(url):
    # url_split = url.split('?')
    # url = url_split[0]
    # url += '?refcode=4019&srccode=401901&preactionid=948d2073-65a0-497b-8123-6f56ca3e1af8'
    # r = requests.get(url, headers=headers, timeout=10)
    # r.raise_for_status()
    # r.encoding = 'utf-8'
    driver.get(url)
    html = driver.page_source
    # parse the rendered HTML with BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')
    print(soup)
    postinfo = []
    post = soup.find(attrs={'class': 'summary-plane__title'}).text.split(' ')[0]
    desc = soup.find(attrs={'class': 'describtion__detail-content'}).text.split(' ')[0]
    postinfo.append(post)
    postinfo.append(desc)
    print(postinfo)
    # saveData2DB(postinfo)


def getonepagelist(url):
    driver.get(url)
    html = driver.page_source
    # parse the rendered HTML with BeautifulSoup, then do whatever parsing you need
    soup = BeautifulSoup(html, 'html.parser')
    # print(soup.prettify())
    lsts = soup.find_all(attrs={'class': 'jobinfo__top'})
    print(lsts)
    for lst in lsts:
        href = lst.a['href']
        time.sleep(0.5)
        getpostinfo(href)


if __name__ == '__main__':
    turn = 2
    for i in range(1, turn):
        print(f'正在爬取第{i}页')
        url = f'https://sou.zhaopin.com/?jl=653&kw=Java&p={i}'
        getonepagelist(url)
        print(f'第{i}页爬取完成')
```
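One likely pitfall in the version above: `driver.page_source` is read right after `driver.get()`, possibly before the JavaScript has rendered the job cards. A minimal sketch of waiting for the list to appear first (the class name `jobinfo__top` comes from the code above; the 10-second timeout is an arbitrary choice):

```python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

driver = webdriver.Chrome()
driver.get('https://sou.zhaopin.com/?jl=653&kw=Java&p=1')

# wait up to 10 s for at least one job card to be rendered before reading page_source
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CLASS_NAME, 'jobinfo__top'))
)

soup = BeautifulSoup(driver.page_source, 'html.parser')
print(len(soup.find_all(attrs={'class': 'jobinfo__top'})))
driver.quit()
```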
Both attempts ended in failure, and I was stuck on the login verification for a long time.
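One way the two approaches could have been combined (a sketch, not something I actually got working): complete the login and verification by hand in a Selenium-driven browser, then export the browser cookies into the `./cookie` file that the requests script reads. The login URL below is a placeholder; the cookie-file format matches what the script above expects.

```python
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://passport.zhaopin.com/login')  # placeholder login URL

# finish the login / slider verification manually in the opened browser window
input('Press Enter once you are logged in...')

# get_cookies() returns a list of dicts with 'name' and 'value' keys;
# join them into the single "name=value; ..." header string the requests script uses
cookie_header = '; '.join(f"{c['name']}={c['value']}" for c in driver.get_cookies())

with open('./cookie', 'w', encoding='utf-8') as f:
    f.write(cookie_header)

driver.quit()
```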
# Switching sites (if you can't solve the problem, get rid of the problem)
So I switched to scraping a different site, 杭州人才网 (hangpin.com.cn).
```python
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import pandas as pd


def saveData2DB(datalist):
    columns = ['id', '工位名称', '薪资', '工作地点', '经验要求', '学历要求', '应届生接受情况', '工作福利', '招聘人数',
               '到岗时间', '年龄要求', '婚姻情况', '任职要求', '公司内容', '公司类型', '公司规模']
    # turn the list into a one-row DataFrame
    new_data = pd.DataFrame([datalist], columns=columns)
    # print(new_data)
    # append to the existing CSV file (create it if it does not exist yet)
    PATH = 'data_hzrc3.csv'
    try:
        df = pd.read_csv(PATH, encoding='utf-8')
        df = pd.concat([df, new_data], ignore_index=True)
    except FileNotFoundError:
        # no file yet: start from the new row
        df = new_data
    # write the updated DataFrame back to the CSV file
    df.to_csv(PATH, index=False, encoding='utf-8')


def getonepagelist(id, url, headers):
    r = requests.get(url, headers=headers, timeout=10)
    r.raise_for_status()
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, 'html.parser')
    # print(soup)
    # some job ids do not exist: those pages have no salary element
    if soup.find(attrs={'class': 'job_details_salary_n'}) is None:
        print("该岗位不存在")
        return
    postinfo = [id]
    job_name = soup.find(attrs={'class': 'job_details_name'}).text.split(' ')[0]
    job_salary_n = soup.find(attrs={'class': 'job_details_salary_n'}).text.split(' ')[0]
    postinfo.append(job_name)
    postinfo.append(job_salary_n)
    # (location, experience requirement, education requirement, (fresh-graduate acceptance))
    details = soup.find(attrs={'class': 'job_details_info'}).text.split("|")
    details_segments = [segment.strip() for segment in details if segment.strip()]
    # pad to 4 entries so every row has the same number of columns
    while len(details_segments) < 4:
        details_segments.append(" ")
    postinfo += details_segments
    # benefits
    job_details_welfare = "没有写福利,快跑!"
    if soup.find(attrs={'class': 'job_details_welfare'}) is not None:
        job_details_welfare = soup.find(attrs={'class': 'job_details_welfare'}).text
        job_details_welfare = job_details_welfare.strip().replace("\n", ",")
    postinfo.append(job_details_welfare)
    # headcount, start date, age requirement, marital status
    # (find_all always returns a list, possibly empty, so no None check is needed)
    job_details_describe_yq = soup.find_all(attrs={'class': 'job_details_describe_yq'})
    job_details_describe_segments_tmp = [""] * 4
    for job_details_describe in job_details_describe_yq:
        if "招聘人数" in job_details_describe.text:
            job_details_describe_segments_tmp[0] = job_details_describe.text
        elif "到岗时间" in job_details_describe.text:
            job_details_describe_segments_tmp[1] = job_details_describe.text
        elif "年龄要求" in job_details_describe.text:
            job_details_describe_segments_tmp[2] = job_details_describe.text
        elif "婚况要求" in job_details_describe.text:
            job_details_describe_segments_tmp[3] = job_details_describe.text
    postinfo += job_details_describe_segments_tmp
    # job description / requirements
    description = ""
    ps = soup.find(attrs={'class': 'job_details_describe'}).find_all('p')
    for p in ps:
        description += p.text.strip()
    postinfo.append(description)
    # company info, e.g. 能源/环保/矿产, 民营, 20-99人
    # (most companies leave the funding field blank, so drop it and keep only the leading entries)
    compply_right_span_cs = soup.find_all(attrs={'class': 'Compply_right_span_c'})
    compply_right_span_cs = [span.text for span in compply_right_span_cs]
    postinfo += compply_right_span_cs[:4]
    print(postinfo)
    # print(len(postinfo))
    # saveData2DB(postinfo)


if __name__ == '__main__':
    # a random User-Agent is enough here; no cookie needed
    ua = UserAgent()
    headers = {
        'User-Agent': ua.random,
    }
    # 8482 job ids in total
    for i in range(1, 8482):
        # print(f'正在爬取第{i}个')
        url = f'https://www.hangpin.com.cn/job/{i}.html'
        getonepagelist(i, url, headers)
        time.sleep(1)
        print(f'第{i}个爬取完成')
    # url = f'https://www.hangpin.com.cn/job/6454.html'
    # getonepagelist(6454, url, headers)
```
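With more than 8,000 ids and one request per second, the loop runs for hours, and a single timeout or bad response makes `raise_for_status()` raise and kill the whole run. A small wrapper like this sketch (my own addition; `failed_ids.txt` is just a name I picked) would skip hard failures and record them for a later retry pass:

```python
import time
import requests

def safe_get(i, url, headers, retries=3):
    # retry transient errors a few times, then give up on this id and move on
    for attempt in range(retries):
        try:
            getonepagelist(i, url, headers)  # the function defined above
            return True
        except requests.RequestException as e:
            # timeouts, connection resets, and 4xx/5xx raised by raise_for_status()
            print(f'id {i} failed (attempt {attempt + 1}): {e}')
            time.sleep(2 * (attempt + 1))  # simple backoff
    # remember the id so it can be retried in a second pass
    with open('failed_ids.txt', 'a', encoding='utf-8') as f:
        f.write(f'{i}\n')
    return False
```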
# Data analysis
Data visualization of the scraped job postings.
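The analysis code itself is not included in the post, so here is only a minimal sketch of what the visualization step could look like, assuming the `data_hzrc3.csv` produced by the scraper above and its column names ('学历要求', '经验要求', ...):

```python
import pandas as pd
import matplotlib.pyplot as plt

# allow Chinese labels in matplotlib (SimHei must be installed; otherwise pick another CJK font)
plt.rcParams['font.sans-serif'] = ['SimHei']

df = pd.read_csv('data_hzrc3.csv', encoding='utf-8')

# distribution of education requirements
df['学历要求'].value_counts().plot(kind='bar', title='学历要求分布')
plt.tight_layout()
plt.savefig('education.png')
plt.close()

# distribution of experience requirements
df['经验要求'].value_counts().plot(kind='bar', title='经验要求分布')
plt.tight_layout()
plt.savefig('experience.png')
plt.close()
```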
# Summary
I picked up a fair amount about web scraping,
but I still don't really understand anti-scraping measures, or how to scrape data that is loaded dynamically by JS.