scrapy多个page的爬取
import scrapy
from bossPro.items import BossproItem


class BossSpider(scrapy.Spider):
    """Scrape job listings from zhipin.com across several result pages.

    The first response comes from ``start_urls``. ``parse`` then schedules
    further pages itself by filling the ``url`` template with an
    incrementing page number.
    """

    name = 'boss'
    # allowed_domains = ['www.xxx.com']
    start_urls = [
        'https://www.zhipin.com/job_detail/?query=python%E7%88%AC%E8%99%AB&scity=101010100&industry=&position=']

    # Template for follow-up pages; ``%d`` receives the page number.
    # NOTE(review): the ``ka`` parameter stays ``page-2`` for every page --
    # confirm the site does not depend on it.
    url = 'https://www.zhipin.com/c101010100/?query=python爬虫&page=%d&ka=page-2'
    # Counter used to build the next page URL; starts at 1.
    page = 1

    def parse(self, response):
        """Yield one item per job entry, then request the next page.

        Args:
            response: The downloaded listing page.

        Yields:
            ``BossproItem`` objects (handed to the item pipeline) followed by
            at most one ``scrapy.Request`` for the next listing page, which
            is parsed by this same method.
        """
        for li in response.xpath('//div[@class="job-list"]/ul/li'):
            # Wrap the parsed fields in an item; any field whose XPath
            # matches nothing is stored as None by extract_first().
            item = BossproItem()
            item['job_name'] = li.xpath(
                './/div[@class="info-primary"]/h3/a/div/text()').extract_first()
            item['salary'] = li.xpath(
                './/div[@class="info-primary"]/h3/a/span/text()').extract_first()
            item['company'] = li.xpath(
                './/div[@class="company-text"]/h3/a/text()').extract_first()
            # Submit the item to the pipeline.
            yield item

        # Checked once per response, after all items, so every page schedules
        # exactly one follow-up. Starting from page = 1 this requests pages
        # 2, 3 and 4 and then stops.
        if self.page <= 3:
            print('if 执行!!!')
            self.page += 1
            new_url = self.url % self.page
            print(new_url)
            # Send the follow-up request manually.
            yield scrapy.Request(url=new_url, callback=self.parse)
scrapy post请求
import scrapy
from scrapy1.items import Scrapy1Item


class MyspiderSpider(scrapy.Spider):
    """Submit form data to each start URL and emit the response text as an item."""

    name = 'qiubai'
    # allowed_domains = ['www.baidu.com']
    start_urls = ['https://fanyi.baidu.com/sug']
    # Form fields sent with every request.
    data = {'kw': 'cat'}

    def start_requests(self):
        """Yield one ``scrapy.FormRequest`` per start URL, handled by ``parse``."""
        yield from (
            scrapy.FormRequest(url=target, formdata=self.data, callback=self.parse)
            for target in self.start_urls
        )

    def parse(self, response):
        """Yield a ``Scrapy1Item`` titled ``'cat'`` holding the response text."""
        yield Scrapy1Item(title='cat', content=response.text)
scrapy通过爬到的URL继续发请求爬页面
import scrapy
from scrapy1.items import Scrapy1Item


class MyspiderSpider(scrapy.Spider):
    """Collect titles from a listing page, then follow each link for details.

    ``parse`` reads the listing and issues one request per entry;
    ``get_detail`` completes the item carried in the request's ``meta``.
    """

    name = 'qiubai'
    # allowed_domains = ['www.baidu.com']
    start_urls = ['https://www.4567tv.tv/frim/index1.html']

    def get_detail(self, response):
        """Fill in ``content`` on the item passed via ``meta`` and yield it.

        Args:
            response: The detail page; ``response.meta['item']`` must hold
                the partially filled item created in ``parse``.

        Yields:
            The same item with ``content`` set (None when the XPath matches
            nothing).
        """
        item = response.meta['item']
        item['content'] = response.xpath(
            '/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()'
        ).extract_first()
        yield item

    def parse(self, response):
        """Yield one detail-page request per listing entry.

        Args:
            response: The listing page.

        Yields:
            ``scrapy.Request`` objects whose callback is ``get_detail`` and
            whose ``meta['item']`` carries an item with ``title`` set.
        """
        li_list = response.xpath('//li[@class="col-md-6 col-sm-4 col-xs-3"]')
        for li in li_list:
            name = li.xpath('./div/a/@title').extract_first()
            href = li.xpath('./div/a/@href').extract_first()
            if href is None:
                # extract_first() returns None when the entry has no link;
                # concatenating it to the host would raise TypeError and
                # abort the rest of the page.
                self.logger.warning('Skipping entry without a detail link: %r', name)
                continue

            item = Scrapy1Item()
            item['title'] = name
            # The item travels with the request so get_detail can finish it.
            yield scrapy.Request(
                url='https://www.4567tv.tv' + href,
                callback=self.get_detail,
                meta={'item': item},
            )