# python爬取51job数据并存储csv文件 (Scrape 51job with Python and store results as CSV)

#程序来源:泓源视野 博弈源 byy3.com 

#程序改好爬取数据和存储csv 下一步解决匹配数据 工作地点和招聘人数
#新问题出现了直接调用 datalist写入数据是一条数据包括(网址 工资...)写到了一起
#添加header use-agent 已解决 下一步利用数据进行画图输出可视化数据

from bs4 import BeautifulSoup

# 正则表达式

import re

# 定制url,获取网页数据

import urllib.request,urllib.error

# 进行excel操作

import xlwt

# 进行数据库操作

import sqlite3

# 根据需求对输入的解析

from urllib import parse

from lxml import etree

import json

import requests

import csv

 

# Prompt for the search keyword at import time (module-level side effect).
keyWord =input("请输入需要爬取的关键字:")

word=parse.quote(keyWord)

# The keyword is URL-encoded TWICE before being embedded in the search URL.
# NOTE(review): double-encoding appears deliberate for 51job's URL scheme —
# confirm against the site before simplifying to a single quote().
newWord = parse.quote(word)

# jobData={}  # (commented out) one dict per re-fetched detail page
# jobList =[] # (commented out) accumulator for the records above

# Browser-like headers intended to keep the site from blocking the crawler.
# NOTE(review): this module-level `headers` is unused by the visible code —
# askUrl() builds its own headers dict — and the backslash continuation in
# the UA string drops the space before "Chrome/", so the value is slightly
# malformed anyway.
headers={'Host':'search.51job.com',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko)\
Chrome/63.0.3239.132 Safari/537.36'}

def main():
    """Crawl 51job search results for the keyword and append them to a CSV.

    Walks the paginated search listing, collects the detail-page URL of
    every job, fetches and parses each detail page via getaData(), and
    writes one CSV row per job to 51job1.csv.
    """
    datalist = []

    # Walk the paginated search results; stop at the first page that
    # yields no job links (i.e. past the last page of results).
    for page in range(1, 10005):
        url = ("https://search.51job.com/list/000000,000000,0000,00,9,99,"
               + newWord + ",2," + str(page) + ".html")
        page_links = getLink(url)  # all job-detail links on this list page
        if len(page_links) == 0:
            break
        for job_url in page_links:
            datalist.append(getaData(job_url))  # one record per detail page
    print(datalist)

    # `with` guarantees the file is closed — the original opened the file
    # and left the close() commented out.
    with open("51job1.csv", 'a', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(('link', 'job', 'salary', 'company', 'experience',
                         'education', 'renshu'))
        # writerows() emits one CSV row per record; the original called
        # writerow(datalist), which crammed every record into a single row
        # (the bug noted in the header comment of this file).
        writer.writerows(datalist)

 

# 获取网页信息

def getLink(url):
    """Return the job-detail URLs found on one 51job search-results page.

    The page embeds its results as a JS object assigned to
    window.__SEARCH_RESULT__.  We cut the object body out of the HTML,
    re-attach the fixed trailing `keywords` entry so it parses as JSON,
    and collect each result's `job_href`.

    Returns [] when the marker is not found (past the last page, or the
    page layout changed) — the caller uses the empty list as its stop
    condition.
    """
    html = str(askUrl(url))

    # Capture everything between the assignment and its final `keywords`
    # key (the tail is constant, so it is re-appended verbatim below).
    found = re.findall(r'window.__SEARCH_RESULT__ = {(.*?)keywords', html)
    if not found:
        # Original indexed found[0] unconditionally and raised IndexError
        # here at end-of-results, so main()'s stop check never ran.
        return []

    json_text = '{' + str(found[0]) + 'keywords":"找工作,求职,人才,招聘"}'
    json_data = json.loads(json_text)

    return [item['job_href'] for item in json_data['engine_search_result']]

 

 

# 二次获取到的链接页面解析

def getaData(jobpage):
    """Parse one job-detail page and return one CSV record.

    Returns a list of separate fields —
    [link, job title, salary, company, experience, education, headcount] —
    matching the CSV header written by main().  (The original glued all
    fields into a single space-separated string, which is the "written
    together" bug noted in this file's header comments.)
    """
    jobHtml = askUrl(jobpage)  # fetch the detail page
    soup = BeautifulSoup(jobHtml, "html.parser")

    # Job title lives in the title attribute of the page's <h1>.
    jname = soup.select('h1[title]')[0]["title"]

    # Salary: an empty <strong> means the page lists no figure.
    moneys = soup.select("div.cn strong ")
    if str(moneys[0]) == "<strong></strong>":
        money = "面谈"
    else:
        money = moneys[0].string

    # Company name from the title attribute of the company link.
    gname = soup.select(".cname > a")[0]["title"]

    # The <p class="msg"> title attribute packs several fields separated
    # by "|": location | experience | education | headcount | ...
    # NOTE(review): field order assumed from the original indices — confirm
    # against live pages.
    list1 = soup.select("p.msg")[0]["title"].split("|")
    jinyan = list1[1].strip()  # required experience
    xuli = list1[2].strip()    # required education
    # Headcount is not always present (the original left this extraction
    # commented out behind a try/except); fall back to an empty string.
    renshu = list1[3].strip() if len(list1) > 3 else ""

    data = [jobpage, jname, money, gname, jinyan, xuli, renshu]
    print(data)
    return data

 

 

# 爬取网页

def askUrl(url):
    """Fetch `url` and return its body decoded as GBK, or "" on error.

    Sends a desktop-browser User-Agent header so the site does not reject
    the request as an obvious bot.  HTTP/URL errors are reported to stdout
    and swallowed (best-effort fetch), returning the empty string.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
    }

    request = urllib.request.Request(url, headers=headers)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        # 51job serves GBK-encoded pages.
        html = response.read().decode("gbk")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        # Original checked the misspelled attribute "reson", so the failure
        # reason was silently never printed; fixed to "reason".
        if hasattr(e, "reason"):
            print(e.reason)
    return html

 

 

# Script entry point: run the crawl, then report completion.
if __name__ == '__main__':
    main()
    # askUrl("https://jobs.51job.com/chengdu-gxq/124561448.html?s=01&t=0")
    print("爬取完毕")

 

 

# --- Blog footer captured by the scrape; kept as comments so the file stays valid Python ---
# 本文由 泓源视野 作者:admin 发表,其版权均为 泓源视野 所有,文章内容系作者个人观点,不代表 泓源视野 对观点赞同或支持。如需转载,请注明文章来源。
# 发表评论
# Protected with IP Blacklist CloudIP Blacklist Cloud
# 您是第8238192 位访客, 您的IP是:[34.200.226.179]