# -*- coding: utf-8 -*-
import re
import pdb
import os

from pyquery import PyQuery as py
article = []     # scraped jokes as (id, text) pairs
visitedUrl = []  # URLs already crawled
spiderUrl = []   # URLs queued for crawling
def spider(url):
    global article
    global visitedUrl
    global spiderUrl

    visitedUrl.append(url)
    headers = {
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Connection': 'keep-alive',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }
    # Fetch and parse the page, then collect (id, text) for every joke on it
    d = py(url, headers=headers, encoding='utf-8')
    d('.article').each(
        lambda i, e: article.append((py(e).attr('id'), py(e).find('.content').text()))
    )
    # Next page to crawl: the link following the current page in the pagination bar
    nextUrl = d(".pagination .current").parents('li').nextAll('li a:first').attr('href')
    if nextUrl and nextUrl != r'/hot/':
        nextUrl = re.match(r'https?://(\w\.?)+', url).group() + nextUrl
        spiderUrl.append(nextUrl)
    # pdb.set_trace()
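# A minimal alternative sketch for the fetch step, assuming the `requests`
# library is available (it is not used elsewhere in this script): some pyquery
# versions forward the `headers`/`encoding` keyword arguments to the underlying
# HTTP call differently, so fetching the HTML explicitly and handing the text
# to PyQuery is a bit more predictable.
#
#   import requests
#   resp = requests.get(url, headers=headers, timeout=10)
#   resp.encoding = 'utf-8'
#   d = py(resp.text)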
if __name__ == '__main__':
    spiderUrl.append('http://www.qiushibaike.com/')
    while spiderUrl:
        spider(spiderUrl.pop())

    # Write out the list of crawled URLs
    path = os.path.join(os.path.abspath('.'), 'visited.txt')
    with open(path, 'w', encoding='utf-8') as file:
        for index in range(len(visitedUrl)):
            file.write('[' + str(index + 1) + ']:' + visitedUrl[index] + '\n')

    # Write out the scraped jokes
    path = os.path.join(os.path.abspath('.'), 'article.txt')
    with open(path, 'w', encoding='utf-8') as file:
        for index in range(len(article)):
            file.write('[' + str(index + 1) + ']\n' + article[index][0] + '\n' + article[index][1] + '\n')

    print('Finished!')
    print('Crawled ' + str(len(visitedUrl)) + ' URLs and collected ' + str(len(article)) + ' jokes in total.')
    print('For details, see visited.txt and article.txt in the current directory.')