{"id":913,"date":"2024-11-07T13:40:55","date_gmt":"2024-11-07T05:40:55","guid":{"rendered":"https:\/\/fwq.ai\/blog\/913\/"},"modified":"2024-11-07T13:40:55","modified_gmt":"2024-11-07T05:40:55","slug":"%e5%a6%82%e4%bd%95%e4%bd%bf%e7%94%a8scrapy%e5%b0%86%e5%88%97%e8%a1%a8%e9%a1%b5%e5%92%8c%e8%af%a6%e6%83%85%e9%a1%b5%e6%95%b0%e6%8d%ae%e5%90%88%e5%b9%b6%e5%88%b0%e4%b8%80%e4%b8%aaitem%e4%b8%ad%ef%bc%9f","status":"publish","type":"post","link":"https:\/\/fwq.ai\/blog\/913\/","title":{"rendered":"\u5982\u4f55\u4f7f\u7528Scrapy\u5c06\u5217\u8868\u9875\u548c\u8be6\u60c5\u9875\u6570\u636e\u5408\u5e76\u5230\u4e00\u4e2aItem\u4e2d\uff1f"},"content":{"rendered":"<p><img loading=\"lazy\" decoding=\"async\" class=\"alignnone size-full wp-image-991\" src=\"https:\/\/fwq.ai\/blog\/wp-content\/uploads\/2024\/11\/173064009714190.jpg\" width=\"800\" height=\"320\" srcset=\"https:\/\/fwq.ai\/blog\/wp-content\/uploads\/2024\/11\/173064009714190.jpg 800w, https:\/\/fwq.ai\/blog\/wp-content\/uploads\/2024\/11\/173064009714190-300x120.jpg 300w, https:\/\/fwq.ai\/blog\/wp-content\/uploads\/2024\/11\/173064009714190-768x307.jpg 768w, https:\/\/fwq.ai\/blog\/wp-content\/uploads\/2024\/11\/173064009714190-670x268.jpg 670w\" sizes=\"auto, (max-width: 800px) 100vw, 800px\" title=\"\u5982\u4f55\u4f7f\u7528Scrapy\u5c06\u5217\u8868\u9875\u548c\u8be6\u60c5\u9875\u6570\u636e\u5408\u5e76\u5230\u4e00\u4e2aItem\u4e2d\uff1f\u63d2\u56fe\" alt=\"\u5982\u4f55\u4f7f\u7528Scrapy\u5c06\u5217\u8868\u9875\u548c\u8be6\u60c5\u9875\u6570\u636e\u5408\u5e76\u5230\u4e00\u4e2aItem\u4e2d\uff1f\u63d2\u56fe\" 
\/><\/p>\n<p><strong>Scrapy\u5982\u4f55\u5c06\u5217\u8868\u9875\u548c\u8be6\u60c5\u9875\u5185\u5bb9\u5408\u5e76\u5230\u4e00\u4e2aItem\u4e2d<\/strong><\/p>\n<p>\u5728\u4f7f\u7528Scrapy\u6293\u53d6\u6570\u636e\u65f6\uff0c\u7ecf\u5e38\u4f1a\u9047\u5230\u9700\u8981\u4ece\u5217\u8868\u9875\u548c\u8be6\u60c5\u9875\u4e2d\u6293\u53d6\u5185\u5bb9\u5e76\u5b58\u50a8\u5728\u4e00\u8d77\u7684\u60c5\u51b5\u3002\u4f8b\u5982\uff0c\u4ece\u767e\u5ea6\u641c\u7d22\u9875\u9762\uff08\u5217\u8868\u9875\uff09\u83b7\u53d6\u6807\u9898\u3001\u65f6\u95f4\u3001URL\u7b49\u4fe1\u606f\uff0c\u7136\u540e\u901a\u8fc7URL\u94fe\u63a5\u5230\u8be6\u60c5\u9875\u8fdb\u4e00\u6b65\u83b7\u53d6\u5185\u5bb9\u3002<\/p>\n<p>\u5982\u679c\u6309\u7167\u4f20\u7edf\u7684\u601d\u7ef4\uff0c\u6211\u4eec\u53ef\u80fd\u4f1a\u5c06\u5217\u8868\u9875\u548c\u8be6\u60c5\u9875\u7684\u6293\u53d6\u903b\u8f91\u5206\u522b\u5199\u5728\u4e24\u4e2a\u4e0d\u540c\u7684\u51fd\u6570\u4e2d\uff0c\u5982\u4e0b\u6240\u793a\uff1a<\/p>\n<pre>def parse(self, response):\n    \"\"\"\u83b7\u53d6\u5217\u8868\u9875\u7684\u6807\u9898\u3001\u65f6\u95f4\u3001url\u5e76\u56de\u8c03parse_item\"\"\"\n\ndef parse_item(self, response):\n    \"\"\"\u83b7\u53d6\u8be6\u60c5\u9875\u7684\u5185\u5bb9\"\"\"<\/pre>\n<p>\u8fd9\u6837\u505a\u7684\u95ee\u9898\u662f\uff0c\u5217\u8868\u9875\u548c\u8be6\u60c5\u9875\u7684\u6293\u53d6\u903b\u8f91\u5206\u79bb\uff0c\u65e0\u6cd5\u5c06\u4e24\u90e8\u5206\u7684\u4fe1\u606f\u5408\u5e76\u5230\u540c\u4e00\u4e2aItem\u4e2d\u3002<\/p>\n<p>\u4e3a\u4e86\u89e3\u51b3\u8fd9\u4e2a\u95ee\u9898\uff0cScrapy\u63d0\u4f9b\u4e86meta\u53c2\u6570\uff0c\u5b83\u53ef\u4ee5\u5c06\u4fe1\u606f\u4ece\u4e00\u4e2a\u8bf7\u6c42\u4f20\u9012\u5230\u53e6\u4e00\u4e2a\u8bf7\u6c42\u3002\u5177\u4f53\u5b9e\u73b0\u65b9\u6cd5\u5982\u4e0b\uff1a<\/p>\n<pre>def parse(self, response):\n    \"\"\"\u83b7\u53d6\u5217\u8868\u9875\u7684\u6807\u9898\u3001\u65f6\u95f4\u3001URL\u5e76\u56de\u8c03parse_item\"\"\"\n    for item in 
response.css('\u641c\u7d22\u7ed3\u679c\u5217\u8868\u5143\u7d20CSS\u9009\u62e9\u5668'):\n        title = item.css('\u6807\u9898CSS\u9009\u62e9\u5668').get()\n        time = item.css('\u65f6\u95f4CSS\u9009\u62e9\u5668').get()\n        url = item.css('URL CSS\u9009\u62e9\u5668').get()\n\n        # \u5c06\u5217\u8868\u9875\u4fe1\u606f\u5b58\u50a8\u5728meta\u4e2d\n        meta = {'title': title, 'time': time, 'url': url}\n\n        # \u56de\u8c03parse_item\u5e76\u4f20\u9012meta\n        yield Request(url, callback=self.parse_item, meta=meta)\n\ndef parse_item(self, response):\n    \"\"\"\u83b7\u53d6\u8be6\u60c5\u9875\u7684\u5185\u5bb9\"\"\"\n    content = response.css('\u5185\u5bb9CSS\u9009\u62e9\u5668').get()\n\n    # \u4ecemeta\u4e2d\u83b7\u53d6\u5217\u8868\u9875\u4fe1\u606f\n    title = response.meta['title']\n    time = response.meta['time']\n    url = response.meta['url']\n\n    # \u5c06\u5217\u8868\u9875\u548c\u8be6\u60c5\u9875\u4fe1\u606f\u5408\u5e76\u5230\u4e00\u4e2aItem\u4e2d\n    item = Item()\n    item['title'] = title\n    item['time'] = time\n    item['url'] = url\n    item['content'] = content\n\n    yield item<\/pre>\n<p>\u901a\u8fc7\u8fd9\u79cd\u65b9\u5f0f\uff0c\u6211\u4eec\u53ef\u4ee5\u5c06\u5217\u8868\u9875\u548c\u8be6\u60c5\u9875\u6293\u53d6\u7684\u6807\u9898\u3001\u65f6\u95f4\u3001URL\u3001\u5185\u5bb9\u7b49\u4fe1\u606f\u5408\u5e76\u5230\u4e00\u4e2aItem\u4e2d\uff0c\u4ece\u800c\u5b9e\u73b0\u5bf9\u5b8c\u6574\u6570\u636e\u7684\u6293\u53d6\u3002<\/p>\n<p>\u4ee5\u4e0a\u5c31\u662f\u201c\u5982\u4f55\u4f7f\u7528Scrapy\u5c06\u5217\u8868\u9875\u548c\u8be6\u60c5\u9875\u6570\u636e\u5408\u5e76\u5230\u4e00\u4e2aItem\u4e2d\uff1f\u201d\u7684\u8be6\u7ec6\u5185\u5bb9\uff0c\u66f4\u591a\u8bf7\u5173\u6ce8\u7c73\u4e91\u5176\u5b83\u76f8\u5173\u6587\u7ae0\uff01<\/p>\n","protected":false},"excerpt":{"rendered":"<p>scrapy\u5982\u4f55\u5c06\u5217\u8868\u9875\u548c\u8be6\u60c5\u9875\u5185\u5bb9\u5408\u5e76\u5230\u4e00\u4e2aitem\u4e2d 
\u5728\u4f7f\u7528scrapy\u6293\u53d6\u6570\u636e\u65f6\uff0c\u7ecf\u5e38\u4f1a\u9047\u5230\u9700\u8981\u4ece\u5217\u8868\u9875\u548c\u8be6\u60c5\u9875\u4e2d\u6293\u53d6\u5185\u5bb9\u5e76\u5b58\u50a8\u5728\u4e00\u8d77\u7684\u60c5\u51b5\u3002\u4f8b\u5982\uff0c\u4ece\u767e\u5ea6\u641c\u7d22\u9875\u9762\uff08\u5217\u8868\u9875\uff09\u83b7\u53d6\u6807\u9898\u3001\u65f6\u95f4\u3001url\u7b49\u4fe1\u606f\uff0c\u7136\u540e\u901a\u8fc7url\u94fe\u63a5\u5230\u8be6\u60c5\u9875\u8fdb\u4e00\u6b65\u83b7\u53d6\u5185\u5bb9\u3002 \u5982\u679c\u6309\u7167\u4f20\u7edf\u7684\u601d\u7ef4\uff0c\u6211\u4eec\u53ef\u80fd\u4f1a\u5c06\u5217\u8868\u9875\u548c\u8be6\u60c5\u9875\u7684\u6293\u53d6\u903b\u8f91\u5206\u522b\u5199\u5728\u4e24\u4e2a\u4e0d\u540c\u7684\u51fd\u6570\u4e2d\uff0c\u5982\u4e0b\u6240\u793a\uff1a def parse(self, response): &#8220;&#8221;&#8221;\u83b7\u53d6\u5217\u8868\u9875\u7684\u6807\u9898\u3001\u65f6\u95f4\u3001url\u5e76\u56de\u8c03parse_item&#8221;&#8221;&#8221; def parse_item(self, response): &#8220;&#8221;&#8221;\u83b7\u53d6\u8be6\u60c5\u9875\u7684\u5185\u5bb9&#8221;&#8221;&#8221; \u767b\u5f55\u540e\u590d\u5236 \u8fd9\u6837\u505a\u7684\u95ee\u9898\u662f\uff0c\u5217\u8868\u9875\u548c\u8be6\u60c5\u9875\u7684\u6293\u53d6\u903b\u8f91\u5206\u79bb\uff0c\u65e0\u6cd5\u5c06\u4e24\u90e8\u5206\u7684\u4fe1\u606f\u5408\u5e76\u5230\u540c\u4e00\u4e2aitem\u4e2d\u3002 \u4e3a\u4e86\u89e3\u51b3\u8fd9\u4e2a\u95ee\u9898\uff0cscrapy\u63d0\u4f9b\u4e86meta\u53c2\u6570\uff0c\u5b83\u53ef\u4ee5\u5c06\u4fe1\u606f\u4ece\u4e00\u4e2a\u8bf7\u6c42\u4f20\u9012\u5230\u53e6\u4e00\u4e2a\u8bf7\u6c42\u3002\u5177\u4f53\u5b9e\u73b0\u65b9\u6cd5\u5982\u4e0b\uff1a def parse(self, response): &#8220;&#8221;&#8221;\u83b7\u53d6\u5217\u8868\u9875\u7684\u6807\u9898\u3001\u65f6\u95f4\u3001URL\u5e76\u56de\u8c03parse_item&#8221;&#8221;&#8221; for item in response.css(&#8216;\u641c\u7d22\u7ed3\u679c\u5217\u8868\u5143\u7d20CSS\u9009\u62e9\u5668&#8217;): title = item.css(&#8216;\u6807\u9898CSS\u9009\u62e9\u5668&#8217;).get() 
time = item.css(&#8216;\u65f6\u95f4CSS\u9009\u62e9\u5668&#8217;).get() url = item.css(&#8216;URL CSS\u9009\u62e9\u5668&#8217;).get() # \u5c06\u5217\u8868\u9875\u4fe1\u606f\u5b58\u50a8\u5728meta\u4e2d meta = {&#8216;title&#8217;: title, &#8216;time&#8217;: time, &#8216;url&#8217;: url} # \u56de\u8c03parse_item\u5e76\u4f20\u9012meta yield Request(url, callback=self.parse_item, meta=meta) def parse_item(self, response): &#8220;&#8221;&#8221;\u83b7\u53d6\u8be6\u60c5\u9875\u7684\u5185\u5bb9&#8221;&#8221;&#8221; content = response.css(&#8216;\u5185\u5bb9CSS\u9009\u62e9\u5668&#8217;).get() [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"closed","ping_status":"","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[16],"tags":[],"class_list":["post-913","post","type-post","status-publish","format-standard","hentry","category-16"],"_links":{"self":[{"href":"https:\/\/fwq.ai\/blog\/wp-json\/wp\/v2\/posts\/913","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/fwq.ai\/blog\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/fwq.ai\/blog\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/fwq.ai\/blog\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/fwq.ai\/blog\/wp-json\/wp\/v2\/comments?post=913"}],"version-history":[{"count":0,"href":"https:\/\/fwq.ai\/blog\/wp-json\/wp\/v2\/posts\/913\/revisions"}],"wp:attachment":[{"href":"https:\/\/fwq.ai\/blog\/wp-json\/wp\/v2\/media?parent=913"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/fwq.ai\/blog\/wp-json\/wp\/v2\/categories?post=913"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/fwq.ai\/blog\/wp-json\/wp\/v2\/tags?post=913"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}