﻿{"id":545,"date":"2020-09-06T04:33:57","date_gmt":"2020-09-05T20:33:57","guid":{"rendered":"https:\/\/byy3.com\/?p=545"},"modified":"2021-01-09T10:10:09","modified_gmt":"2021-01-09T02:10:09","slug":"python%e7%88%ac%e5%8f%9651job%e6%95%b0%e6%8d%ae%e5%b9%b6%e5%ad%98%e5%82%a8csv%e6%96%87%e4%bb%b6","status":"publish","type":"post","link":"https:\/\/byy3.com\/?p=545","title":{"rendered":"python\u722c\u53d651job\u6570\u636e\u5e76\u5b58\u50a8csv\u6587\u4ef6"},"content":{"rendered":"<p>#\u7a0b\u5e8f\u6765\u6e90\uff1a<a href=\"https:\/\/byy3.com\/go\/?url=https:\/\/www.byy3.com\" target=\"_blank\" rel=\"noopener noreferrer\" rel=\"nofollow\" >\u6cd3\u6e90\u89c6\u91ce<\/a> <a href=\"https:\/\/byy3.com\/go\/?url=https:\/\/www.byy3.com\" target=\"_blank\" rel=\"noopener noreferrer\" rel=\"nofollow\" >\u535a\u5f08\u6e90<\/a> <a href=\"https:\/\/byy3.com\">byy3.com\u00a0<\/a><\/p>\n<p>#\u7a0b\u5e8f\u6539\u597d\u722c\u53d6\u6570\u636e\u548c\u5b58\u50a8csv \u4e0b\u4e00\u6b65\u89e3\u51b3\u5339\u914d\u6570\u636e \u5de5\u4f5c\u5730\u70b9\u548c\u62db\u8058\u4eba\u6570<br \/>\n#\u65b0\u95ee\u9898\u51fa\u73b0\u4e86\u76f4\u63a5\u8c03\u7528 datalist\u5199\u5165\u6570\u636e\u662f\u4e00\u6761\u6570\u636e\u5305\u62ec\uff08\u7f51\u5740 \u5de5\u8d44...\uff09\u5199\u5230\u4e86\u4e00\u8d77<br \/>\n#\u6dfb\u52a0header use-agent \u5df2\u89e3\u51b3 \u4e0b\u4e00\u6b65\u5229\u7528\u6570\u636e\u8fdb\u884c\u753b\u56fe\u8f93\u51fa\u53ef\u89c6\u5316\u6570\u636e<\/p>\n<p>from bs4 import BeautifulSoup<\/p>\n<p># \u6b63\u5219\u8868\u8fbe\u5f0f<\/p>\n<p>import re<\/p>\n<p># \u5b9a\u5236url,\u83b7\u53d6\u7f51\u9875\u6570\u636e<\/p>\n<p>import urllib.request,urllib.error<\/p>\n<p># \u8fdb\u884cexcel\u64cd\u4f5c<\/p>\n<p>import xlwt<\/p>\n<p># \u8fdb\u884c\u6570\u636e\u5e93\u64cd\u4f5c<\/p>\n<p>import sqlite3<\/p>\n<p># \u6839\u636e\u9700\u6c42\u5bf9\u8f93\u5165\u7684\u89e3\u6790<\/p>\n<p>from urllib import parse<\/p>\n<p>from lxml import etree<\/p>\n<p>import json<\/p>\n<p>import requests<\/p>\n<p>import 
csv<\/p>\n<p>&nbsp;<\/p>\n<p>keyWord =input(\"\u8bf7\u8f93\u5165\u9700\u8981\u722c\u53d6\u7684\u5173\u952e\u5b57\uff1a\")<\/p>\n<p>word=parse.quote(keyWord)<\/p>\n<p>newWord = parse.quote(word)<\/p>\n<p># jobData={}#\u6bcf\u4e00\u4e2a\u8bb0\u5f55\u662f\u4e00\u4e2a\u5217\u8868\uff0c\u628a\u4e8c\u6b21\u83b7\u53d6\u5230\u7684\u7f51\u9875\u5b58\u6210\u5b57\u5178\u683c\u5f0f<\/p>\n<p># jobList =[] #\u628a\u4e0a\u9762\u83b7\u53d6\u5230\u7684\u4fe1\u606f\u653e\u5728\u5217\u8868\u4e2d<\/p>\n<p>#\u4f2a\u88c5\u722c\u53d6\u5934\u90e8\uff0c\u4ee5\u9632\u6b62\u88ab\u7f51\u7ad9\u7981\u6b62<br \/>\nheaders={'Host':'search.51job.com',<br \/>\n'Upgrade-Insecure-Requests':'1',<br \/>\n'User-Agent':'Mozilla\/5.0 (Windows NT 6.1) AppleWebKit\/537.36 (KHTML, like Gecko)\\<br \/>\nChrome\/63.0.3239.132 Safari\/537.36'}<\/p>\n<p>def main():<\/p>\n<p># \u722c\u53d6\u7f51\u9875<\/p>\n<p># url = \"https:\/\/search.51job.com\/list\/090200,000000,0000,00,9,99,\" + newWord + \",2,\" + str(pageNmb) + \".html\"<\/p>\n<p>&nbsp;<\/p>\n<p># html = askUrl(url)<\/p>\n<p># print(html)<\/p>\n<p>datalist = []<\/p>\n<p>for i in range(1,10005): #\u722c\u53d6\u7b2c1\u523010004\u9875\uff0c\u5217\u8868\u4e3a\u7a7a\u65f6\u63d0\u524d\u7ed3\u675f<\/p>\n<p>url = \"https:\/\/search.51job.com\/list\/000000,000000,0000,00,9,99,\"+newWord+\",2,\"+str(i)+\".html\"<\/p>\n<p>pagaLink=getLink(url) #\u722c\u53d6\u5217\u8868\u9875\uff0c\u83b7\u53d6\u8be5\u5217\u8868\u7684\u5168\u90e8\u5c97\u4f4d\u94fe\u63a5<\/p>\n<p>if len(pagaLink) == 0:<\/p>\n<p>break<\/p>\n<p>for jobpage in pagaLink:<\/p>\n<p>data=getaData(jobpage) #\u4e00\u4e2a\u8be6\u60c5\u9875\u7684\u94fe\u63a5<\/p>\n<p>datalist.append(data)<\/p>\n<p>print(datalist)<\/p>\n<p>&nbsp;<\/p>\n<p>#\u6253\u5f0051job1.csv\u6587\u4ef6\uff0c\u8fdb\u884c\u5199\u5165\u64cd\u4f5c<\/p>\n<p>csvFile = open(\"51job1.csv\", 'a', newline='')<\/p>\n<p>writer = csv.writer(csvFile)<\/p>\n<p>writer.writerow(('link','job','salary','company','experience',\\<\/p>\n<p>'education','renshu',))<\/p>\n<p>writer.writerow(datalist)<\/p>\n<p># 
f.write(job_name + \",\"+place+',' + data_time + ',' + money + '\\n')<\/p>\n<p># f.write(','.join(job_list))<\/p>\n<p># datalist = getaData(baseurl)<\/p>\n<p># savapath = \".\\\\51job.xlsx\"<\/p>\n<p># dbpath = \"movie.db\"<\/p>\n<p># \u4fdd\u5b58\u6570\u636e\u5230excel<\/p>\n<p># savaData(datalist,savapath)<\/p>\n<p># \u4fdd\u5b58\u6570\u636e\u5230\u6570\u636e\u5e93<\/p>\n<p># savaData2DB(datalist, dbpath)<\/p>\n<p>#\u5173\u95ed\u5199\u5165\u6587\u4ef6<\/p>\n<p># csvFile.close()<\/p>\n<p>&nbsp;<\/p>\n<p># \u83b7\u53d6\u7f51\u9875\u4fe1\u606f<\/p>\n<p>def getLink(url):<\/p>\n<p>dataList=[]<\/p>\n<p>#\u83b7\u53d6\u5230\u6e90\u7801<\/p>\n<p>html = askUrl(url)<\/p>\n<p>data = str(html)<\/p>\n<p># print(data)<\/p>\n<p>find_info = re.findall(r'window.__SEARCH_RESULT__ = {(.*?)keywords', data)<\/p>\n<p># print(find_info)<\/p>\n<p>#\u62fc\u63a5\u4e0a\u5c11\u4e86\u7684:keywords\":\"\u627e\u5de5\u4f5c,\u6c42\u804c,\u4eba\u624d,\u62db\u8058\"}\u8fd9\u4e00\u90e8\u5206<\/p>\n<p>find_info = '{' + str(find_info[0]) + 'keywords\":\"\u627e\u5de5\u4f5c,\u6c42\u804c,\u4eba\u624d,\u62db\u8058\"}'<\/p>\n<p># print(find_info)<\/p>\n<p># \u5c06\u83b7\u53d6\u5230\u5185\u5bb9\u8f6c\u6362\u4e3ajson\u683c\u5f0f<\/p>\n<p>json_data = json.loads(find_info)<\/p>\n<p># print(json_data)<\/p>\n<p>for i in json_data['engine_search_result']:<\/p>\n<p>dataList.append(i['job_href'])<\/p>\n<p># print(jobList)<\/p>\n<p># print(dataList)<\/p>\n<p>return dataList<\/p>\n<p>&nbsp;<\/p>\n<p>&nbsp;<\/p>\n<p># \u4e8c\u6b21\u83b7\u53d6\u5230\u7684\u94fe\u63a5\u9875\u9762\u89e3\u6790<\/p>\n<p>def getaData(jobpage):<\/p>\n<p>data = []<\/p>\n<p>jobHtml = askUrl(jobpage) #\u83b7\u53d6\u8be6\u60c5\u9875\u9762<\/p>\n<p># print(jobHtml)<\/p>\n<p>soup = BeautifulSoup(jobHtml, \"html.parser\")<\/p>\n<p>&nbsp;<\/p>\n<p># data.append(jobpage)<\/p>\n<p>jnames = soup.select('h1[title]')#\u804c\u4f4d\u540d\u79f0<\/p>\n<p># print(jnames)<\/p>\n<p>jname = jnames[0][\"title\"]<\/p>\n<p># print(jname)<\/p>\n<p># 
data.append(jname)<\/p>\n<p>&nbsp;<\/p>\n<p>moneys = soup.select(\"div.cn strong \") # \u5f85\u9047<\/p>\n<p># print(moneys[0])<\/p>\n<p>if str(moneys[0])==\"&lt;strong&gt;&lt;\/strong&gt;\":<\/p>\n<p>money=\"\u9762\u8c08\"<\/p>\n<p>else:<\/p>\n<p>money = moneys[0].string<\/p>\n<p># data.append(money)<\/p>\n<p>&nbsp;<\/p>\n<p>gnames = soup.select(\".cname &gt; a\") # \u516c\u53f8\u540d\u5b57<\/p>\n<p>gname = gnames[0][\"title\"]<\/p>\n<p># data.append(gname)<\/p>\n<p>&nbsp;<\/p>\n<p>jobMlsit = soup.select(\"p.msg\") # \u5de5\u4f5c\u7ecf\u9a8c<\/p>\n<p>list1 = jobMlsit[0][\"title\"].split(\"|\")<\/p>\n<p>jinyan = list1[1].strip()<\/p>\n<p># data.append(jinyan)<\/p>\n<p>&nbsp;<\/p>\n<p>lieu = list1[-1].strip() # \u5de5\u4f5c\u5730\u70b9<\/p>\n<p># data.append(lieu)<\/p>\n<p># print(lieu)<\/p>\n<p>&nbsp;<\/p>\n<p>xuli = list1[2].strip() # \u5b66\u5386<\/p>\n<p># data.append(xuli)<\/p>\n<p># print(xuli)<\/p>\n<p># renshu = list1[3].strip() # \u62db\u4eba\u6570<\/p>\n<p>&nbsp;<\/p>\n<p># try:<\/p>\n<p>data.append(jobpage+\" \"+jname+\" \"+money+\" \"+gname+\" \"+jinyan+\" \"+xuli)<\/p>\n<p># except:<\/p>\n<p># pass<\/p>\n<p># print(renshu)<\/p>\n<p>&nbsp;<\/p>\n<p>print(data)<\/p>\n<p>return data<\/p>\n<p>&nbsp;<\/p>\n<p>&nbsp;<\/p>\n<p># \u722c\u53d6\u7f51\u9875<\/p>\n<p>def askUrl(url):<\/p>\n<p># \u6a21\u62df\u6d4f\u89c8\u5668\u4fe1\u606f\uff0c\u541151job\u670d\u52a1\u5668\u53d1\u9001\u4fe1\u606f<\/p>\n<p>headers = {<\/p>\n<p>\"User-Agent\": \"Mozilla\/5.0 (Windows NT 10.0; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/83.0.4103.116 Safari\/537.36\",<\/p>\n<p>}<\/p>\n<p># \u7528\u6237\u4ee3\u7406\uff0c\u544a\u8bc9\u670d\u52a1\u5668\uff0c\u6211\u4eec\u662f\u4ec0\u4e48\u7c7b\u578b\u7684\u673a\u5668\uff0c\u6d4f\u89c8\u5668\uff08\u672c\u8d28\u4e0a\u662f\u544a\u8bc9\u670d\u52a1\u5668\u6211\u4eec\u53ef\u4ee5\u63a5\u6536\u4ec0\u4e48\u6c34\u5e73\u7684\u5185\u5bb9\uff09<\/p>\n<p>&nbsp;<\/p>\n<p># \u53d1\u9001\u8bf7\u6c42\u6d88\u606f<\/p>\n<p>request = 
urllib.request.Request(url,headers=headers)<\/p>\n<p>html = \"\"<\/p>\n<p># \u6355\u83b7\u5f02\u5e38<\/p>\n<p>try:<\/p>\n<p># \u53d6\u5f97\u54cd\u5e94<\/p>\n<p>response = urllib.request.urlopen(request)<\/p>\n<p># \u83b7\u53d6\u7f51\u9875\u5185\u5bb9<\/p>\n<p>html = response.read().decode(\"gbk\")<\/p>\n<p># print(html)<\/p>\n<p>except urllib.error.URLError as e:<\/p>\n<p>if hasattr(e,\"code\"):<\/p>\n<p>print(e.code)<\/p>\n<p>if hasattr(e,\"reason\"):<\/p>\n<p>print(e.reason)<\/p>\n<p>return html<\/p>\n<p>&nbsp;<\/p>\n<p>&nbsp;<\/p>\n<p>if __name__ == '__main__':<\/p>\n<p>main()<\/p>\n<p># askUrl(\"https:\/\/jobs.51job.com\/chengdu-gxq\/124561448.html?s=01&amp;t=0\")<\/p>\n<p>print(\"\u722c\u53d6\u5b8c\u6bd5\")<\/p>\n<p>&nbsp;<\/p>\n<p>&nbsp;<\/p>\n","protected":false},"excerpt":{"rendered":"<p>#\u7a0b\u5e8f\u6765\u6e90\uff1a\u6cd3\u6e90\u89c6\u91ce \u535a\u5f08\u6e90 byy3.com\u00a0 #\u7a0b\u5e8f\u6539\u597d\u722c\u53d6\u6570\u636e\u548c\u5b58\u50a8csv \u4e0b\u4e00\u6b65\u89e3\u51b3\u5339\u914d\u6570\u636e \u5de5\u4f5c\u5730\u70b9 
[&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[20],"tags":[352,49],"class_list":["post-545","post","type-post","status-publish","format-standard","hentry","category-python","tag-python"],"_links":{"self":[{"href":"https:\/\/byy3.com\/index.php?rest_route=\/wp\/v2\/posts\/545","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/byy3.com\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/byy3.com\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/byy3.com\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/byy3.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=545"}],"version-history":[{"count":0,"href":"https:\/\/byy3.com\/index.php?rest_route=\/wp\/v2\/posts\/545\/revisions"}],"wp:attachment":[{"href":"https:\/\/byy3.com\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=545"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/byy3.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=545"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/byy3.com\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=545"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}