﻿{"id":862,"date":"2021-01-27T23:07:10","date_gmt":"2021-01-27T15:07:10","guid":{"rendered":"https:\/\/byy3.com\/?p=862"},"modified":"2021-01-27T23:07:10","modified_gmt":"2021-01-27T15:07:10","slug":"%e5%82%bb%e7%93%9c%e5%bc%8f%e6%96%87%e7%ab%a0%e7%88%ac%e8%99%ab-newspaper%e5%ba%93%e7%ae%80%e4%bb%8b","status":"publish","type":"post","link":"https:\/\/byy3.com\/?p=862","title":{"rendered":"\u50bb\u74dc\u5f0f\u6587\u7ae0\u722c\u866b-newspaper\u5e93\u7b80\u4ecb"},"content":{"rendered":"<p>\u4eca\u5929\u6bd4\u8f83\u95f2\uff0c\u6211\u5c31\u6d4f\u89c8\u4e86\u4f1agithub\u4e0a\u6709\u5173python\u722c\u866b\u7684\u9879\u76ee\u3002\u770b\u5230\u4e00\u4e2anewspaper\u5e93\uff0c\u5173\u6ce8\u6570\u633a\u9ad8\u7684\u3002\u4f5c\u8005\u53d7lxml\u7684\u5f3a\u5927\u548crequests\u7684\u7b80\u6d01\uff0c\u5f00\u53d1\u4e86newspaper\u5e93\u3002<\/p>\n<p>requests\u5e93\u7684\u4f5c\u8005\u90fd\u76db\u8d5enewspaper\u5e93\u7684\u725bB\u3002<\/p>\n<pre class=\"hljs coffeescript\"><code><span class=\"hljs-string\">\"Newspaper is an amazing python library for extracting &amp; curating\r\n\r\n articles.\"<\/span> -- tweeted <span class=\"hljs-keyword\">by<\/span> Kenneth Reitz, Author <span class=\"hljs-keyword\">of<\/span> requests<\/code><\/pre>\n<p><strong>\u4e00\u3001newspaper\u7279\u6027<\/strong><\/p>\n<ul>\n<li>\u591a\u8fdb\u7a0b\u6587\u7ae0\u4e0b\u8f7d\u6846\u67b6<\/li>\n<li>\u65b0\u95fb\u94fe\u63a5\u8bc6\u522b<\/li>\n<li>\u53ef\u4ecehtml\u6587\u4ef6\u4e2d\u63d0\u53d6\u6587\u672c\u3001\u56fe\u7247<\/li>\n<li>\u53ef\u6587\u7ae0\u5173\u952e\u8bcd\u63d0\u53d6<\/li>\n<li>\u53ef\u751f\u6210\u6587\u7ae0\u6982\u8981<\/li>\n<li>\u63d0\u53d6\u6587\u7ae0\u4f5c\u8005\u540d<\/li>\n<li>\u8c37\u6b4c\u8d8b\u52bf\u8bcd\u63d0\u53d6<\/li>\n<li>\u652f\u6301\u5341\u6570\u79cd\u8bed\u8a00\uff08\u542b\u4e2d\u6587\uff09<\/li>\n<\/ul>\n<p>\u5176\u5b9e\u4e4b\u524d\u6211\u5199\u8fc7\u4e00\u4e2a\u7c7b\u4f3c\u7684\u5e93\u7684\u4ecb\u7ecd-goose\uff08\u4ec5\u652f\u6301python2\uff09\uff0c\u8ddfnewspaper\u6709\u7c7b\u4f3c\u529f\u80fd\u3002 \u6587\u7ae0\u540d\u300a\u4e0d\u4f1a\u5199\u722c\u866b\u7684\u5feb\u6765goose\u4e00\u4e0b\u300b<\/p>\n<p><strong>\u4e8c\u3001\u5b89\u88c5<\/strong><\/p>\n<pre class=\"hljs nginx\"><code>\r\n<span class=\"hljs-attribute\">pip3<\/span> install newspaper3k<\/code><\/pre>\n<p>\u6ce8\u610f:\u5728python3\u4e2d\u5b89\u88c5\uff0c\u5fc5\u987b\u662fnewspaper3k\u3002 newspaper\u662fpython2\u4e0a\u7684\u5e93\u3002<\/p>\n<p><strong>\u4e09\u3001\u5f00\u59cb\u4ee3\u7801<\/strong><br \/>\n<strong>3.1newspaper\u652f\u6301\u7684\u8bed\u8a00<\/strong><\/p>\n<pre class=\"hljs coffeescript\"><code><span class=\"hljs-keyword\">import<\/span> newspaper\r\n\r\n<span class=\"hljs-built_in\">print<\/span>(newspaper.languages())\r\n\r\nYour available languages are:\r\n\r\ninput code full name\r\n\r\n ar Arabic\r\n\r\n da Danish\r\n\r\n de German\r\n\r\n el Greek\r\n\r\n en English\r\n\r\n es Spanish\r\n\r\n fi Finnish\r\n\r\n fr French\r\n\r\n he Hebrew\r\n\r\n hu Hungarian\r\n\r\n id Indonesian\r\n\r\n it Italian\r\n\r\n ko Korean\r\n\r\n mk Macedonian\r\n\r\n nb Norwegian (Bokm\u00e5l)\r\n\r\n nl Dutch\r\n\r\n <span class=\"hljs-literal\">no<\/span> Norwegian\r\n\r\n pt Portuguese\r\n\r\n ru Russian\r\n\r\n sv Swedish\r\n\r\n tr Turkish\r\n\r\n vi Vietnamese\r\n\r\n zh Chinese<\/code><\/pre>\n<p><strong>3.2 \u6587\u7ae0\u5185\u5bb9\u63d0\u53d6<\/strong><br \/>\n\u63d0\u53d6\u6587\u7ae0\u5185\u5bb9\uff0c\u5982\u4f5c\u8005\u3001\u51fa\u7248\u65e5\u671f\u3001\u6587\u7ae0\u5185\u5bb9\u3001\u56fe\u7247\u94fe\u63a5<\/p>\n<pre class=\"hljs shell\"><code>\r\nfrom newspaper import Article\r\n\r\nurl = 'http:\/\/media.china.com.cn\/cmyw\/2017-06-13\/1067887.html'\r\n\r\narticle = Article(url, language='zh')\r\n<span class=\"hljs-meta\">\r\n#<\/span><span class=\"bash\">\u4e0b\u8f7d\u6587\u7ae0<\/span>\r\n\r\narticle.download()\r\n<span class=\"hljs-meta\">\r\n#<\/span><span class=\"bash\">\u67e5\u770b\u6587\u7ae0\u7684html\u6570\u636e<\/span>\r\n<span class=\"hljs-meta\">\r\n#<\/span><span class=\"bash\"><span class=\"hljs-built_in\">print<\/span>(article.html)<\/span>\r\n<span class=\"hljs-meta\">\r\n#<\/span><span class=\"bash\">\u89e3\u6790\u6587\u7ae0html\u6570\u636e<\/span>\r\n\r\narticle.parse()\r\n<span class=\"hljs-meta\">\r\n#<\/span><span class=\"bash\">\u63d0\u53d6\u5404\u79cd\u6570\u636e\u4fe1\u606f<\/span>\r\n<span class=\"hljs-meta\">\r\n#<\/span><span class=\"bash\">\u4f5c\u8005<\/span>\r\n\r\nprint(article.authors)\r\n<span class=\"hljs-meta\">\r\n#<\/span><span class=\"bash\">\u51fa\u7248\u65e5\u671f<\/span>\r\n\r\nprint(article.publish_date)\r\n<span class=\"hljs-meta\">\r\n#<\/span><span class=\"bash\">\u65b0\u95fb\u5185\u5bb9<\/span>\r\n\r\nprint(article.text)\r\n<span class=\"hljs-meta\">\r\n#<\/span><span class=\"bash\">\u6587\u7ae0\u7684\u9996\u56fe\u94fe\u63a5<\/span>\r\n\r\nprint(article.top_image)<\/code><\/pre>\n<p><strong>3.3 \u81ea\u7136\u8bed\u8a00\u5904\u7406<\/strong><br \/>\n\u7ee7\u7eed3.2\u90e8\u5206\u4ee3\u7801<\/p>\n<pre class=\"hljs shell\"><code><span class=\"hljs-meta\">\r\n#<\/span><span class=\"bash\">nlp\u521d\u59cb\u5316<\/span>\r\n\r\narticle.nlp()\r\n<span class=\"hljs-meta\">\r\n#<\/span><span class=\"bash\">\u63d0\u53d6\u5173\u952e\u8bcd<\/span>\r\n\r\nprint(article.keywords)\r\n<span class=\"hljs-meta\">\r\n#<\/span><span class=\"bash\">\u6587\u7ae0\u6982\u8981<\/span>\r\n\r\nprint(article.summary)\r\n<\/code><\/pre>\n<p><strong>3.4 \u66f4\u7cbe\u7ec6\u7684\u4f7f\u7528\u65b9\u6cd5<\/strong><br \/>\n\u4e0a\u9762\u7684\u65b9\u6cd5\u662f\u9ed8\u8ba4\u7684\u65b9\u6cd5\uff0c\u5982\u679c\u4f60\u786e\u5b9a\u67d0\u7f51\u7ad9\u91c7\u7528\u7684\u5168\u90e8\u662f\u4e00\u79cd\u8bed\u8a00\uff0c\u4f60\u53ef\u4ee5\u4f7f\u7528\u4e0b\u9762\u4ee3\u7801<\/p>\n<pre class=\"hljs coffeescript\"><code>\r\n<span class=\"hljs-comment\">#\u6587\u6863\u4e2d\u4f7f\u7528\u7684\u6848\u4f8b<\/span>\r\n\r\n<span class=\"hljs-keyword\">import<\/span> newspaper\r\n\r\nsina_paper = newspaper.build(<span class=\"hljs-string\">'http:\/\/www.sina.com.cn\/'<\/span>, language=<span class=\"hljs-string\">'zh'<\/span>)\r\n\r\n<span class=\"hljs-keyword\">for<\/span> category <span class=\"hljs-keyword\">in<\/span> sina_paper.category_urls():\r\n\r\n <span class=\"hljs-built_in\">print<\/span>(category)<\/code><\/pre>\n<p>\u8f93\u51fa\u4e86\u65b0\u6d6a\u7f51\u6240\u6709\u680f\u76ee<\/p>\n<pre class=\"hljs cpp\"><code>\r\nhttp:<span class=\"hljs-comment\">\/\/roll.fashion.sina.com.cn<\/span>\r\n\r\nhttp:<span class=\"hljs-comment\">\/\/www.sina.com.cn\/<\/span>\r\n\r\nhttp:<span class=\"hljs-comment\">\/\/hainan.sina.com.cn<\/span>\r\n\r\nhttp:<span class=\"hljs-comment\">\/\/jiangsu.sina.com.cn<\/span>\r\n\r\nhttp:<span class=\"hljs-comment\">\/\/vr.sina.cn<\/span>\r\n\r\nhttp:<span class=\"hljs-comment\">\/\/cq.auto.sina.com.cn<\/span>\r\n\r\nhttp:<span class=\"hljs-comment\">\/\/eladies.sina.com.cn<\/span>\r\n\r\nhttp:<span class=\"hljs-comment\">\/\/chuangye.sina.com.cn<\/span>\r\n\r\nhttp:<span class=\"hljs-comment\">\/\/gx.sina.com.cn<\/span>\r\n\r\nhttp:<span class=\"hljs-comment\">\/\/slide.mil.news.sina.com.cn<\/span>\r\n\r\nhttp:<span class=\"hljs-comment\">\/\/hlj.sina.com.cn<\/span>\r\n\r\nhttp:<span class=\"hljs-comment\">\/\/history.sina.com.cn<\/span>\r\n\r\n\r\nhttp:<span class=\"hljs-comment\">\/\/tech.sina.com.cn\/\/nmg.sina.com.cn<\/span>\r\n\r\nhttp:<span class=\"hljs-comment\">\/\/shiqu.sina.com.cn<\/span>\r\n\r\nhttp:<span class=\"hljs-comment\">\/\/ah.sina.com.cn<\/span>\r\n\r\nhttp:<span class=\"hljs-comment\">\/\/slide.news.sina.com.cn<\/span>\r\n\r\nhttp:<span class=\"hljs-comment\">\/\/chexian.sina.com<\/span>\r\n<\/code><\/pre>\n<p>\u603b\u7ed3\uff0c\u7528\u7740\u6548\u679c\u6ca1\u6709requests\u4f5c\u8005\u5938\u7684\u90a3\u4e48\u68d2\uff0c\u53ef\u80fd\u6211\u627e\u7684\u7f51\u7ad9\u6b63\u597d\u662fnewspaper\u65e0\u6cd5\u5b8c\u7f8e\u5904\u7406\u7684\u7f51\u7ad9\u3002<\/p>\n<p>Tips\uff1a\u8fd9\u7bc7\u6587\u7ae0\u6293\u7684\u662f\u5916\u56fd\u7f51\u7ad9-Twitter\u6bcf\u65e5\u63a8\u8350\u5bfc\u8bfb<\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u4eca\u5929\u6bd4\u8f83\u95f2\uff0c\u6211\u5c31\u6d4f\u89c8\u4e86\u4f1agithub\u4e0a\u6709\u5173python\u722c\u866b\u7684\u9879\u76ee\u3002\u770b\u5230\u4e00\u4e2anewspaper\u5e93\uff0c\u5173\u6ce8\u6570\u633a\u9ad8\u7684\u3002 [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[20],"tags":[405,404,33,378,352],"class_list":["post-862","post","type-post","status-publish","format-standard","hentry","category-python","tag-newspaper","tag-python"],"_links":{"self":[{"href":"https:\/\/byy3.com\/index.php?rest_route=\/wp\/v2\/posts\/862","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/byy3.com\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/byy3.com\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/byy3.com\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/byy3.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=862"}],"version-history":[{"count":0,"href":"https:\/\/byy3.com\/index.php?rest_route=\/wp\/v2\/posts\/862\/revisions"}],"wp:attachment":[{"href":"https:\/\/byy3.com\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=862"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/byy3.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=862"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/byy3.com\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=862"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}