Scraping 51job listings with Python and saving them to a CSV file
# Scraping and CSV output now work; next step: parse out the work location and number of openings
# Gotcha: writing datalist as a single CSV row puts a whole record (URL, salary, ...)
# into one cell, so each job's fields must be kept as a separate list
# Added a User-Agent header, which solved the blocking; next: plot the data for visualization
from bs4 import BeautifulSoup
# Regular expressions
import re
# Build requests and fetch page data
import urllib.request, urllib.error
# Excel output (only needed by the commented-out savaData call below)
import xlwt
# Database output (only needed by the commented-out savaData2DB call below)
import sqlite3
# URL-encode the search keyword
from urllib import parse
import json
import csv
keyWord = input("Enter the keyword to scrape: ")
word = parse.quote(keyWord)
newWord = parse.quote(word)
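# Note: the keyword is quote()d twice on purpose; 51job's search URL expects it
# percent-encoded twice, e.g. parse.quote(parse.quote("数据")) yields
# "%25E6%2595%25B0%25E6%258D%25AE" rather than "%E6%95%B0%E6%8D%AE".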
# jobData = {}  # idea: store each second-pass record as a dict (unused)
# jobList = []  # idea: collect those records in a list (unused)
# Spoof the request headers so the site does not block the crawler
# (note: askUrl() below builds its own headers, so this dict is currently unused)
headers = {'Host': 'search.51job.com',
           'Upgrade-Insecure-Requests': '1',
           'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko)'
                         ' Chrome/63.0.3239.132 Safari/537.36'}
def main():
    # Earlier region-limited variant, kept for reference:
    # url = "https://search.51job.com/list/090200,000000,0000,00,9,99," + newWord + ",2," + str(pageNmb) + ".html"
    # html = askUrl(url)
    # print(html)
    datalist = []
    for i in range(1, 10005):  # page counter; the loop stops as soon as a page yields no links
        url = "https://search.51job.com/list/000000,000000,0000,00,9,99," + newWord + ",2," + str(i) + ".html"
        pageLinks = getLink(url)  # scrape this listing page for all job detail links
        if len(pageLinks) == 0:
            break
        for jobpage in pageLinks:
            data = getaData(jobpage)  # parse one detail page
            datalist.append(data)
    print(datalist)
    # Append the scraped rows to 51job1.csv
    # (append mode means a fresh header row is written on every run)
    with open("51job1.csv", 'a', newline='') as csvFile:
        writer = csv.writer(csvFile)
        writer.writerow(('link', 'job', 'salary', 'company', 'experience',
                         'education', 'place'))
        # writerows() emits one CSV row per job record; writerow(datalist)
        # would dump every record onto a single line
        writer.writerows(datalist)
    # f.write(job_name + "," + place + ',' + data_time + ',' + money + '\n')
    # f.write(','.join(job_list))
    # datalist = getaData(baseurl)
    # savapath = ".\\51job.xlsx"
    # dbpath = "movie.db"
    # Save the data to Excel
    # savaData(datalist, savapath)
    # Save the data to the database
    # savaData2DB(datalist, dbpath)
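# Sketch of the visualization step mentioned at the top; a hypothetical
# follow-up, assuming pandas and matplotlib are installed, with column names
# matching the header row written above:
#   import pandas as pd
#   df = pd.read_csv("51job1.csv")
#   df['place'].value_counts().head(10).plot(kind='bar')  # top hiring locations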
# First pass: parse a search-result page for job detail links
def getLink(url):
    dataList = []
    # Fetch the page source
    html = askUrl(url)
    data = str(html)
    # print(data)
    # The result list is embedded in the page as a JavaScript object literal
    find_info = re.findall(r'window.__SEARCH_RESULT__ = {(.*?)keywords', data)
    if len(find_info) == 0:
        # no embedded results; we are past the last page (or got blocked)
        return dataList
    # Re-attach the tail the non-greedy regex cut off: keywords":"找工作,求职,人才,招聘"}
    find_info = '{' + str(find_info[0]) + 'keywords":"找工作,求职,人才,招聘"}'
    # print(find_info)
    # Parse the reconstructed string as JSON
    json_data = json.loads(find_info)
    # print(json_data)
    for i in json_data['engine_search_result']:
        dataList.append(i['job_href'])
    # print(dataList)
    return dataList
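# For reference, the embedded object this regex targets looks roughly like
# (only the keys used above are shown; everything else is abridged):
#   window.__SEARCH_RESULT__ = {..., "engine_search_result":
#       [{"job_href": "https://jobs.51job.com/...", ...}, ...],
#       "keywords": "找工作,求职,人才,招聘"}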
# Second pass: parse one job detail page
def getaData(jobpage):
    jobHtml = askUrl(jobpage)  # fetch the detail page
    # print(jobHtml)
    soup = BeautifulSoup(jobHtml, "html.parser")
    jnames = soup.select('h1[title]')  # job title
    jname = jnames[0]["title"]
    moneys = soup.select("div.cn strong")  # salary
    # print(moneys[0])
    if str(moneys[0]) == "<strong></strong>":
        money = "面谈"  # empty tag: salary is negotiable
    else:
        money = moneys[0].string
    gnames = soup.select(".cname > a")  # company name
    gname = gnames[0]["title"]
    jobMlsit = soup.select("p.msg")  # the "location | experience | education | ..." line
    list1 = jobMlsit[0]["title"].split("|")
    lieu = list1[0].strip()  # work location (first field; the last field is the posting date)
    jinyan = list1[1].strip()  # required experience
    xuli = list1[2].strip()  # education
    # renshu = list1[3].strip()  # number of openings
    # Keep the fields separate so csv.writer puts each one in its own column
    data = [jobpage, jname, money, gname, jinyan, xuli, lieu]
    print(data)
    return data
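# For reference, the p.msg title this parsing relies on is assumed to look like
# (illustrative example; field order is an assumption, verify against a live page):
#   "上海-浦东新区  |  2年经验  |  本科  |  招1人  |  09-02发布"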
# Fetch a page
def askUrl(url):
    # Pretend to be a regular browser so the 51job server accepts the request
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
    }
    # The User-Agent tells the server what kind of client we are
    # (in essence, what level of content we can accept)
    request = urllib.request.Request(url, headers=headers)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        # 51job serves its pages encoded as gbk
        html = response.read().decode("gbk")
        # print(html)
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
if __name__ == '__main__':
    main()
    # askUrl("https://jobs.51job.com/chengdu-gxq/124561448.html?s=01&t=0")
    print("Scraping finished")