# -*- coding:utf-8 -*-
# Author:Sure Feng

import json

import requests
from lxml import etree


class QiubaiSpider(object):
    """Scrape posts from the qiushibaike.com "8hr" listing pages.

    Each listing page is downloaded, every post block under the
    ``content-left`` container is turned into a dict, and the dicts are
    appended as indented JSON to ``qiubai.txt``.
    """

    # Seconds to wait for the server before giving up on a request, so a
    # stalled connection cannot hang the whole crawl.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # URL template; ``{}`` is filled with the page number.
        self.tempt_url = "https://www.qiushibaike.com/8hr/page/{}/"
        # Browser-like User-Agent sent with every request.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
        }

    def parse_url(self, url):
        """Send a GET request to *url* and return the decoded response body.

        Args:
            url: Absolute URL of the page to download.

        Returns:
            The response body decoded as UTF-8 text.

        Raises:
            requests.exceptions.RequestException: On connection errors or
                when the server does not answer within ``REQUEST_TIMEOUT``.
            UnicodeDecodeError: If the body is not valid UTF-8.
        """
        # ``headers`` must be passed by keyword: the second positional
        # parameter of ``requests.get`` is ``params`` (query string), so a
        # positional dict would never reach the HTTP headers.
        respond = requests.get(
            url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        return respond.content.decode()

    @staticmethod
    def _first(values):
        """Return the first element of *values*, or None when it is empty."""
        return values[0] if len(values) > 0 else None

    def get_content(self, html_str, num):
        """Extract one dict per post from a listing page.

        Args:
            html_str: HTML source of the listing page.
            num: Page number, stored under the ``"page"`` key of every item.

        Returns:
            A list of dicts with the keys ``page``, ``content`` (list of text
            fragments with newlines removed), ``author_gender``, ``age``,
            ``content_img``, ``author_img`` and ``stats_vote``. Fields whose
            XPath matches nothing are None (``content`` is an empty list).
        """
        html = etree.HTML(html_str)
        # One <div> per post; all further XPaths are relative to it.
        div_list = html.xpath("//div[@id='content-left']/div")
        content_list = []
        for div in div_list:
            item = {}
            item["page"] = num

            fragments = div.xpath(".//div[@class='content']/span/text()")
            item["content"] = [i.replace("\n", "") for i in fragments]

            # The gender is encoded in the last CSS class of the badge,
            # with an "Icon" suffix that is stripped here.
            gender_class = self._first(
                div.xpath(".//div[contains(@class, 'articleGender')]/@class")
            )
            item["author_gender"] = (
                gender_class.split(" ")[-1].replace("Icon", "")
                if gender_class is not None
                else None
            )

            # The text of that same badge element is stored as the age.
            item["age"] = self._first(
                div.xpath(".//div[contains(@class, 'articleGender')]/text()")
            )

            # Image URLs are prefixed with the scheme here; note the colon
            # in "https:" so the result is a usable URL.
            content_img = self._first(
                div.xpath(".//div[@class='thumb']/a/img/@src")
            )
            item["content_img"] = (
                "https:" + content_img if content_img is not None else None
            )

            author_img = self._first(
                div.xpath(".//div[@class='author clearfix']//img/@src")
            )
            item["author_img"] = (
                "https:" + author_img if author_img is not None else None
            )

            item["stats_vote"] = self._first(
                div.xpath(".//span[@class='stats-vote']/i/text()")
            )
            content_list.append(item)
        return content_list

    def save_conten(self, content_list):
        """Append every item of *content_list* to ``qiubai.txt`` as JSON.

        Each item is written as an indented JSON document followed by a
        newline; non-ASCII characters are written unescaped.

        Args:
            content_list: Iterable of JSON-serialisable items.
        """
        with open("qiubai.txt", "a", encoding="utf-8") as f:
            for content in content_list:
                f.write(json.dumps(content, ensure_ascii=False, indent=4))
                f.write("\n")
        print("保存成功")

    def run(self):
        """Crawl listing pages 1 through 13: download, extract, save."""
        for num in range(1, 14):
            url = self.tempt_url.format(num)
            # Send the request and get the response.
            html_str = self.parse_url(url)
            # Extract the data.
            content_list = self.get_content(html_str, num)
            # Save.
            self.save_conten(content_list)


if __name__ == '__main__':
    qiubai_spider = QiubaiSpider()
    qiubai_spider.run()