# Practice Week, Second Semester of Junior Year: A Data Analysis Project

……

# Data Collection

At first I scraped Zhilian Zhaopin (zhaopin.com), but the site later switched to loading its listings dynamically with JS, so none of the older scrapers on GitHub work anymore. Writing my own, I tried two approaches.
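
A quick way to see the problem (just a sketch, not part of the scripts below): fetch the search page with plain requests and check whether the job-card class the parser looks for (jobinfo__top) even appears in the raw HTML. On JS-rendered pages it usually doesn't.

```python
import requests

# Minimal check: is the job list in the static HTML, or injected by JS afterwards?
url = 'https://sou.zhaopin.com/?jl=653&kw=Java&p=1'
headers = {'User-Agent': 'Mozilla/5.0'}
html = requests.get(url, headers=headers, timeout=10).text

if 'jobinfo__top' in html:
    print('job cards found in the static HTML, requests + BeautifulSoup is enough')
else:
    print('no job cards in the static HTML, the list is rendered by JS')
```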

# Parsing with requests

```python
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from openpyxl import Workbook, load_workbook
import sqlite3
# DRIVER_PATH = 'G:\DLibrary\chromedriver-win64\chromedriver.exe'
# def init_db(dbpath):
#     sql = '''
#             create table IF NOT EXISTS movie250
#             (
#             id integer primary key autoincrement,
#             name text,
#             year text
#             )
#         '''
#
#     conn = sqlite3.connect(dbpath)
#     cursor = conn.cursor()
#
#     cursor.execute(sql)
#
#     conn.commit()
#     conn.close()
def saveData2DB(datalist):
    with open('./data.txt', 'a', encoding='utf-8') as file:
        for data in datalist:
            file.write(data + '\n')
def getpostinfo(url, headers):
    url_split = url.split('?')
    url = url_split[0]
    url += '?refcode=4019&srccode=401901&preactionid=948d2073-65a0-497b-8123-6f56ca3e1af8'
    r = requests.get(url, headers=headers, timeout=10)
    r.raise_for_status()
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, 'html.parser')
    postinfo = []
    # TODO: find() sometimes returns None here, probably a login / verification page
    print(soup)
    post = soup.find(attrs={'class': 'summary-plane__title'}).text.split(' ')[0]
    desc = soup.find(attrs={'class': 'describtion__detail-content'}).text.split(' ')[0]
    postinfo.append(post)
    postinfo.append(desc)
    print(postinfo)
    saveData2DB(postinfo)
def getonepagelist(url, headers):
    r = requests.get(url, headers=headers, timeout=10)
    r.raise_for_status()
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, 'html.parser')
    lsts = soup.find_all(attrs={'class': 'jobinfo__top'})
    print(lsts)
    for lst in lsts:
        href = lst.a['href']
        time.sleep(0.5)
        getpostinfo(href, headers)
if __name__ == '__main__':
    # read the cookie saved in a local file
    ua = UserAgent()
    # ua.random returns a random User-Agent string
    # headers = {"useragent": ua.random}
    with open("./cookie", "r", encoding="utf-8") as file:
        Cookie = file.read()
    # print(cookie)
    headers = {
        # ':authority': 'i.zhaopin.com',
        # ':method': 'GET',
        # ':path': '/',
        # ':scheme': 'https',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br, zstd',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        # 'Connection': 'keep-alive',
        # 'Content-Length': '1216',
        # 'Content-Type': 'text/plain;charset=UTF-8',
        # 'Host': 'dkapi.geetest.com',
        # 'Origin': 'https://jobs.zhaopin.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0',
        # 'Priority': 'u=0,i',
        'Referer': 'https://jobs.zhaopin.com/',
        'Sec-Ch-Ua': '"Microsoft Edge";v ="125","Chromium";v="125","Not.A/Brand";v ="24"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': 'Windows',
        'Sec-Fetch-Dest': 'script',
        'Sec-Fetch-Mode': 'no-cors',
        'Sec-Fetch-Site': 'same-site',
        'Upgrade-Insecure-Requests': '1',
        'Cookie': Cookie
    }
    turn = 2
    for i in range(1, turn):
        print(f'正在爬取第{i}页')
        url = f'https://sou.zhaopin.com/?jl=653&kw=Java&p={i}'
        getonepagelist(url, headers)
        print(f'第{i}页爬取完成')
```

# selenium

```python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
# initialize the Selenium WebDriver
service = ChromeService(executable_path=r"G:\DLibrary\chromedriver-win64\chromedriver.exe")
driver = webdriver.Chrome(service=service)
def getpostinfo(url):
    # url_split = url.split('?')
    # url = url_split[0]
    # url += '?refcode=4019&srccode=401901&preactionid=948d2073-65a0-497b-8123-6f56ca3e1af8'
    # r = requests.get(url, headers=headers, timeout=10)
    # r.raise_for_status()
    # r.encoding = 'utf-8'
    driver.get(url)
    html = driver.page_source
    # parse the HTML with BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')
    print(soup)
    postinfo = []
    post = soup.find(attrs={'class': 'summary-plane__title'}).text.split(' ')[0]
    desc = soup.find(attrs={'class': 'describtion__detail-content'}).text.split(' ')[0]
    postinfo.append(post)
    postinfo.append(desc)
    print(postinfo)
    # saveData2DB(postinfo)
def getonepagelist(url):
    driver.get(url)
    html = driver.page_source
    # parse the HTML with BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')
    # do whatever parsing is needed here
    # print(soup.prettify())
    lsts = soup.find_all(attrs={'class': 'jobinfo__top'})
    print(lsts)
    for lst in lsts:
        href = lst.a['href']
        time.sleep(0.5)
        getpostinfo(href)
if __name__ == '__main__':
    turn = 2
    for i in range(1, turn):
        print(f'正在爬取第{i}页')
        url = f'https://sou.zhaopin.com/?jl=653&kw=Java&p={i}'
        getonepagelist(url)
        print(f'第{i}页爬取完成')
    driver.quit()
```

Both attempts ended in failure, and I was stuck on the login verification for a long time.
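
Looking back, one thing worth noting (a sketch only, not something the scripts above do): a fresh Selenium profile has no login state, so the cookie string saved in ./cookie never reaches the browser. It can be injected with add_cookie after opening the domain once; whether that would actually get past zhaopin's verification I can't say.

```python
from selenium import webdriver

# Sketch: reuse the cookie string saved in ./cookie inside a Selenium session.
# The cookie names and values depend on whatever the browser stored for zhaopin.com.
driver = webdriver.Chrome()  # or the ChromeService setup used above
driver.get('https://sou.zhaopin.com/')  # must visit the domain before add_cookie

with open('./cookie', 'r', encoding='utf-8') as f:
    raw = f.read().strip()

for pair in raw.split('; '):
    name, _, value = pair.partition('=')
    if name and value:
        driver.add_cookie({'name': name, 'value': value, 'domain': '.zhaopin.com'})

driver.get('https://sou.zhaopin.com/?jl=653&kw=Java&p=1')  # reload with the saved cookies
driver.quit()
```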

# Switching Sites (if you can't solve the problem, get rid of the problem itself)

I switched to a different site, 杭州人才网 (hangpin.com.cn).

```python
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import pandas as pd
def saveData2DB(datalist):
    columns = ['id', '工位名称', '薪资', '工作地点', '经验要求', '学历要求', '应届生接受情况', '工作福利', '招聘人数',
               '到岗时间', '年龄要求', '婚姻情况', '任职要求', '公司内容', '公司类型', '公司规模']
    # turn the list into a one-row DataFrame
    new_data = pd.DataFrame([datalist], columns=columns)
    # print the row to double-check
    # print(new_data)
    # append to the existing CSV file (create it if it doesn't exist yet)
    PATH = 'data_hzrc3.csv'
    try:
        df = pd.read_csv(PATH, encoding='utf-8')
        # append the new row
        df = pd.concat([df, new_data], ignore_index=True)
    except FileNotFoundError:
        # if the file doesn't exist yet, start from the new row
        df = new_data
    # write the updated DataFrame back to the CSV
    df.to_csv(PATH, index=False, encoding='utf-8')
def getonepagelist(id, url, headers):
    r = requests.get(url, headers=headers, timeout=10)
    r.raise_for_status()
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, 'html.parser')
    # print(soup)
    # if the salary element is missing, the job page doesn't exist, so skip it
    if soup.find(attrs={'class': 'job_details_salary_n'}) is None:
        print("该岗位不存在")
        return
    postinfo = [id]
    job_name = soup.find(attrs={'class': 'job_details_name'}).text.split(' ')[0]
    job_salary_n = soup.find(attrs={'class': 'job_details_salary_n'}).text.split(' ')[0]
    postinfo.append(job_name)
    postinfo.append(job_salary_n)
    details = soup.find(attrs={'class': 'job_details_info'}).text.split("|")
    details_segments = [segment.strip() for segment in details if segment.strip()]  # (location, experience, education, (accepts fresh graduates))
    while len(details_segments) < 4:
        details_segments.append(" ")
    # print(details_segments)
    postinfo += details_segments
    job_details_welfare = "没有写福利,快跑!"
    if soup.find(attrs={'class': 'job_details_welfare'}) is not None:
        job_details_welfare = soup.find(attrs={'class': 'job_details_welfare'}).text  # benefits
        job_details_welfare = job_details_welfare.strip().replace("\n", ",")
    # print(job_details_welfare)
    postinfo.append(job_details_welfare)
    # find_all returns an empty list (never None) when nothing matches, so no None check is needed
    job_details_describe_yq = soup.find_all(attrs={'class': 'job_details_describe_yq'})
    # print(job_details_describe_yq)
    job_details_describe_segments_tmp = [""] * 4
    for job_details_describe in job_details_describe_yq:
        if "招聘人数" in job_details_describe.text:
            job_details_describe_segments_tmp[0] = job_details_describe.text
        elif "到岗时间" in job_details_describe.text:
            job_details_describe_segments_tmp[1] = job_details_describe.text
        elif "年龄要求" in job_details_describe.text:
            job_details_describe_segments_tmp[2] = job_details_describe.text
        elif "婚况要求" in job_details_describe.text:
            job_details_describe_segments_tmp[3] = job_details_describe.text
    # print(job_details_describe_segments_tmp)
    postinfo += job_details_describe_segments_tmp  # headcount, start date, age requirement, marital status
    description = ""
    ps = soup.find(attrs={'class': 'job_details_describe'}).find_all('p')
    for i, p in enumerate(ps):
        description += p.text.strip()
    # print(description)
    postinfo.append(description)  # job description / requirements
    compply_right_span_cs = soup.find_all(attrs={'class': 'Compply_right_span_c'})
    for i, compply_right_span_c in enumerate(compply_right_span_cs):
        compply_right_span_cs[i] = compply_right_span_c.text
    # print(compply_right_span_cs[:3])
    postinfo += compply_right_span_cs[:3]  # e.g. 能源/环保/矿产 (industry), 民营 (type), 20-99人 (size); funding info is missing for most companies, so only the first three are kept
    print(postinfo)
    # print(len(postinfo))
    saveData2DB(postinfo)
if __name__ == '__main__':
    # local setup
    ua = UserAgent()
    # ua.random returns a random User-Agent string
    # headers = {"useragent": ua.random}
    # with open("./cookie", "r", encoding="utf-8") as file:
    #     Cookie = file.read()
    # print(cookie)
    headers = {
        'User-Agent': ua.random,
    }
    # 8482
    for i in range(1, 8482):
        # print (f' 正在爬取第 {i} 个 ')
        url = f'https://www.hangpin.com.cn/job/{i}.html'
        getonepagelist(i, url, headers)
        time.sleep(1)
        print(f'第{i}个爬取完成')
    # url = f'https://www.hangpin.com.cn/job/6454.html'
    # getonepagelist(6454, url, headers)
```

# Data Analysis

Data visualization.
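
The charts themselves aren't included here. As a minimal sketch (assuming the data_hzrc3.csv produced by the scraper above, with the column names defined in saveData2DB), a first pass could look like this:

```python
import pandas as pd
import matplotlib.pyplot as plt

# Chinese labels need a CJK font, e.g. SimHei on Windows
plt.rcParams['font.sans-serif'] = ['SimHei']

df = pd.read_csv('data_hzrc3.csv', encoding='utf-8')

# count postings per education requirement and draw a simple bar chart
edu_counts = df['学历要求'].value_counts()
edu_counts.plot(kind='bar', title='学历要求分布')
plt.tight_layout()
plt.show()
```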

# Summary

I picked up a fair amount about web scraping, but I still don't really understand anti-scraping measures, or how to crawl data that is loaded dynamically by JS.