Basic crawlers
Basics
xpath
Some browsers let you copy an element's XPath directly from the right-click (inspect) menu
https://www.jianshu.com/p/85a3004b5c06
# Common queries
xpath('//*') # every node
xpath('/*') # every direct child node
# Other usages
xpath('div[@class="xxxx"]')
xpath('div/@href')
xpath('div[1]/li[2]') # indexing starts at 1, not 0, and so on
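The calls above assume an element or tree object that exposes .xpath(); a minimal sketch with lxml (the HTML string is just a placeholder):
from lxml import etree

html = '<div class="xxxx"><ul><li>a</li><li>b</li></ul></div>'  # placeholder HTML
tree = etree.HTML(html)  # parse into an element tree
print(tree.xpath('//*'))  # every node in the document
print(tree.xpath('//div[@class="xxxx"]/ul/li[2]/text()'))  # text of the second li (1-based index)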
Request headers
https://github.com/wistbean/learn_python3_spider
https://www.cnblogs.com/gswang/p/7475494.html
# define the headers
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
}
GET and POST
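This section has no snippet of its own; a minimal sketch of the two request types with requests (httpbin.org is only a placeholder test server):
import requests

# GET: parameters travel in the query string
resp = requests.get('https://httpbin.org/get', params={'q': 'keyword'})
print(resp.status_code, resp.url)

# POST: data travels in the request body (use json= for a JSON body)
resp = requests.post('https://httpbin.org/post', data={'user': 'test'})
print(resp.json())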
Packages used
requests
re
BeautifulSoup (bs4)
threading
xpath
Example
应用宝 (Tencent's app store) crawler
# 应用宝 crawler
import requests
from bs4 import BeautifulSoup

def request_page(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
    except requests.RequestException:
        return None

i = 1  # placeholder category id; the snippet assumes i comes from an outer loop over category ids
baseurl = 'https://sj.qq.com/myapp/category.htm?orgame=1&categoryId={}'.format(i)
html = request_page(baseurl)
soup = BeautifulSoup(html, 'lxml')
app_list = soup.find(class_='app-list clearfix').find_all('li')  # renamed to avoid shadowing the built-in list
num = 0
for item in app_list:
    url = item.find(class_='app-info clearfix').find(class_='com-install-btn').get('ex_url')
    # name = item.find(class_='app-info clearfix').find(class_='app-info-desc').find('a').string
    name = item.find(class_='app-info clearfix').find(class_='app-info-desc').find(class_='com-install-btn').get('apk')
    print('Downloading: %s\nPage link: %s\n' % (name, url))
    file_path = 'D:\\app_demo\\apk\\' + str(i) + '\\' + name + '.apk'
    with open(file_path, 'wb') as f:
        # apk = requests.get(url, headers=header(url)).content
        apk = requests.get(url).content
        f.write(apk)
Notes
Garbled text when crawling (encoding issues)
resp = requests.get(url)
resp.encoding = resp.apparent_encoding
html = resp.text
requests get 403
Make the crawler mimic a browser by setting the headers:
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/51.0.2704.63 Safari/537.36'}
requests.get(url, headers = headers)
selenium
Simulating page clicks
# keyboard shortcut
browser.find_element_by_tag_name('html').send_keys(Keys.SHIFT, Keys.CONTROL, Keys.ALT, 'y') # target the whole page via the html element, then send the key combination
Example
import subprocess
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
sn_date=b'210235C2W3F18700225020111201'
#sn_date=input().encode('utf-8')
s = subprocess.Popen('.\\testcpp2.exe', stderr=subprocess.PIPE, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
s.stdin.write(sn_date)
s.stdin.flush()
stdoutinfo, stderrinfo = s.communicate()
#print(stderrinfo,stdoutinfo)
result = ''.join(["", stdoutinfo.decode('utf-8')])
print(result)
browser = webdriver.Firefox(executable_path="D:\\Tools\\geckodriver-v0.29.1-win64\\geckodriver.exe")  # Selenium 3: pass the geckodriver path via executable_path; the first positional argument is the profile, not the driver path
browser.get("http://192.168.50.13")
sleep(2)
browser.switch_to.frame(browser.find_element_by_xpath('/html[1]/body[1]/iframe[1]'))
browser.find_element_by_id("password").send_keys(result)
browser.find_element_by_class_name("custom-btn-center").click()
sleep(2)
browser.find_element_by_xpath('/html/body/div[13]/div/div[1]/a').click()
browser.find_element_by_xpath('//*[@id="keyTip"]').click()
sleep(2)
browser.find_element_by_tag_name('html').send_keys(Keys.SHIFT, Keys.CONTROL, Keys.ALT, 'y')
sleep(2)
browser.find_element_by_xpath('/html/body/div[4]/div[2]/div[3]/div[1]/div[1]/div[28]/a').click()
sleep(2)
browser.find_element_by_xpath('//*[@id="telnetOpen"]').click()
# headless mode: simulate the clicks without showing a browser window
from selenium.webdriver.firefox.options import Options
options = Options()
options.add_argument('--headless')
browser = webdriver.Firefox(executable_path="D:\\Tools\\geckodriver-v0.29.1-win64\\geckodriver.exe", options=options)  # same executable_path fix as above
Notes
If an element cannot be matched by id: right-click, copy its XPath, and use find_element_by_xpath()
If the id cannot be located yet: sleep a few seconds so the page finishes loading before calling send_keys
Elements hidden with attributes such as hidden can be clicked through JavaScript (selenium cannot change hidden); see the sketch after this list
Switch into the correct frame before locating by xpath (outside the frame even a full xpath will not find the element)
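A minimal sketch of the last two points; the iframe xpath and the element id are copied from the example above and should be treated as placeholders for your own page:
from selenium import webdriver

browser = webdriver.Firefox()
browser.get('http://192.168.50.13')

# switch into the iframe first, otherwise even a full xpath cannot locate the element
browser.switch_to.frame(browser.find_element_by_xpath('/html[1]/body[1]/iframe[1]'))

# click a hidden element through JavaScript instead of element.click()
hidden_elt = browser.find_element_by_id('telnetOpen')
browser.execute_script('arguments[0].click();', hidden_elt)

# switch back to the main document when done
browser.switch_to.default_content()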
Timeout and retry
https://www.jianshu.com/p/fdfb5591536c
# timeout = (connect timeout, read timeout) in seconds
requests.get(url, timeout=(10, 120))
# retry up to 3 times on failure
i = 0
while i < 3:
    try:
        data = get_url(url)
        if data:
            break
        else:
            i += 1
    except:
        i += 1
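Alternatively, a requests Session can retry automatically when you mount an HTTPAdapter with a urllib3 Retry policy; a sketch (the retry counts and URL are only examples):
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
retries = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
adapter = HTTPAdapter(max_retries=retries)
session.mount('http://', adapter)
session.mount('https://', adapter)

resp = session.get('https://example.com', timeout=(10, 120))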
Multiprocess crawlers
Multithreaded crawlers
Async crawlers
yield / generators (iterators)
Too lazy for now; will sort these out the next time a project needs them… (a small threaded sketch is kept below as a starting point)
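A minimal multithreaded fetch sketch with concurrent.futures; the URL list is a placeholder and this is only a starting point, not project code:
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

urls = ['https://example.com/page/{}'.format(n) for n in range(1, 6)]  # placeholder URLs

def fetch(url):
    resp = requests.get(url, timeout=10)
    resp.encoding = resp.apparent_encoding
    return url, resp.status_code

# the work is network-bound, so threads help despite the GIL
with ThreadPoolExecutor(max_workers=5) as pool:
    futures = [pool.submit(fetch, u) for u in urls]
    for future in as_completed(futures):
        url, status = future.result()
        print(url, status)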
Scrapy framework
Create a new Scrapy project
scrapy startproject tutorial
# resulting directory layout
tutorial/
    scrapy.cfg            # deploy configuration file
    tutorial/             # project's Python module, you'll import your code from here
        __init__.py
        items.py          # project items definition file
        middlewares.py    # project middlewares file
        pipelines.py      # project pipelines file
        settings.py       # project settings file
        spiders/          # a directory where you'll later put your spiders
            __init__.py
Create the spider under tutorial/tutorial/spiders:
# create a class that inherits from scrapy.Spider
class BaiDuSpider(scrapy.Spider):
    # identifies the spider; it must be unique within the project, different spiders cannot share the same name
    name = 'BaiDuSpider'
Or use:
scrapy genspider baidu www.baidu.com
which auto-generates:
import scrapy

class BaiduSpider(scrapy.Spider):
    name = 'baidu'
    allowed_domains = ['www.baidu.com']
    start_urls = ['http://www.baidu.com/']

    def parse(self, response):
        pass
Run multiple spiders
# main.py
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
# build a CrawlerProcess from the project settings
process = CrawlerProcess(get_project_settings())
# add the spiders to run
process.crawl('project1')
process.crawl('project2')
process.crawl('project3')
# run them
process.start()
Run all spiders
# main.py
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy.spiderloader import SpiderLoader
# build a CrawlerProcess from the project settings
process = CrawlerProcess(get_project_settings())
# get a SpiderLoader so we can list every spider name in the project
spider_loader = SpiderLoader(get_project_settings())
# queue every spider
for spidername in spider_loader.list():
    process.crawl(spidername)
# run them
process.start()
settings.py configuration
USER_AGENT: commented out by default, but very important; without it the crawler is easily recognised as a bot, and even a simple Mozilla/5.0 string helps
ROBOTSTXT_OBEY: whether to obey robots.txt; defaults to True and usually needs to be set to False, otherwise many pages cannot be crawled
CONCURRENT_REQUESTS: maximum concurrency, i.e. how many requests the crawler keeps in flight at the same time
DOWNLOAD_DELAY: download delay in seconds, which throttles the crawl rate; tune it for your project, neither too fast nor too slow (the commented example in the generated settings is 3 seconds, 1 second is a good trade-off, and a fraction of a second is fine when there are many files to fetch)
COOKIES_ENABLED: whether to keep cookies during the crawl, a very handy switch; note that Scrapy actually enables it by default, the generated settings only contain a commented-out COOKIES_ENABLED = False
DEFAULT_REQUEST_HEADERS: default request headers; the USER_AGENT above is really just one of these headers, so set them according to the content you are crawling
ITEM_PIPELINES: item pipelines; 300 is the priority, and the lower the number the earlier the pipeline runs
The log level and the log file path can also be set in settings.py
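A sketch of the corresponding settings (the values are only examples):
# settings.py
LOG_LEVEL = 'INFO'            # DEBUG / INFO / WARNING / ERROR / CRITICAL
LOG_FILE = 'logs/spider.log'  # write logs to a file instead of the console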
# split() is required to turn the command string into a list: ['scrapy', 'crawl', 'baidu']
from scrapy import cmdline
cmdline.execute('scrapy crawl baidu'.split())
scrapy crawl <spider name> -o *.json
For JSON output, add the following to settings.py to set the export encoding, otherwise non-ASCII text comes out garbled:
FEED_EXPORT_ENCODING='utf-8'
from scrapy import cmdline
cmdline.execute('scrapy crawl baidu -o baidu.csv'.split())
Scrapy timeout: CLOSESPIDER_TIMEOUT closes the spider automatically after the given number of seconds
custom_settings = {
"CLOSESPIDER_TIMEOUT": 3600
}
pipelines.py
Once an Item has been collected in a Spider, it is passed to the Item Pipeline; the pipeline components process the Item in the order they are defined.
Downloading
You can subclass the classes under scrapy/pipelines (files, media, images) and override their methods to implement downloads:
import logging
import os

import scrapy
from scrapy.pipelines.files import FilesPipeline, FileException
from scrapy.utils.request import referer_str

logger = logging.getLogger(__name__)


class APKDownloadPipeline(FilesPipeline):
    # hook called by FilesPipeline for each item; the original named this get_apk_requests,
    # but only get_media_requests is invoked by the framework
    def get_media_requests(self, item, info):
        if 'app_os' in item.keys() and ('os' in item['app_os'] or 'OS' in item['app_os']):
            pass
        else:
            # FILES_URLS_FIELD holds a list of URLs, so yield one Request per URL
            for file_url in item.get(self.FILES_URLS_FIELD, []):
                yield scrapy.Request(file_url, meta=dict(item))

    def file_path(self, request, response=None, info=None, *, item=None):
        filename = item['app_name'] + item['version'] + '.apk'
        filename = filename.replace(' ', '').replace(':', "_")
        if not os.path.exists("download_apk/{}".format(item['name'])):
            os.makedirs("download_apk/{}".format(item['name']))
        return '{}/{}'.format(item['name'], filename)

    def media_downloaded(self, response, request, info, *, item=None):
        referer = referer_str(request)
        if response.status != 200:
            logger.warning(
                'File (code: %(status)s): Error downloading file from '
                '%(request)s referred in <%(referer)s>',
                {'status': response.status,
                 'request': request, 'referer': referer},
                extra={'spider': info.spider}
            )
            raise FileException('download-error')
        if not response.body:
            logger.warning(
                'File (empty-content): Empty file from %(request)s referred '
                'in <%(referer)s>: no-content',
                {'request': request, 'referer': referer},
                extra={'spider': info.spider}
            )
            raise FileException('empty-content')
        status = 'cached' if 'cached' in response.flags else 'downloaded'
        logger.debug(
            'File (%(status)s): Downloaded file from %(request)s referred in '
            '<%(referer)s>',
            {'status': status, 'request': request, 'referer': referer},
            extra={'spider': info.spider}
        )
        self.inc_stats(info.spider, status)
        try:
            path = self.file_path(request, response=response, info=info, item=item)
            checksum = self.file_downloaded(response, request, info, item=item)
        except FileException as exc:
            logger.warning(
                'File (error): Error processing file from %(request)s '
                'referred in <%(referer)s>: %(errormsg)s',
                {'request': request, 'referer': referer, 'errormsg': str(exc)},
                extra={'spider': info.spider}, exc_info=True
            )
            raise
        except Exception as exc:
            logger.error(
                'File (unknown-error): Error processing file from %(request)s '
                'referred in <%(referer)s>',
                {'request': request, 'referer': referer},
                exc_info=True, extra={'spider': info.spider}
            )
            raise FileException(str(exc))
        return {'url': request.url, 'path': path, 'md5': checksum, 'status': status}
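For a FilesPipeline subclass to actually run, it still has to be registered in settings.py together with a storage directory; a sketch (the module path 'BaiDuSpider.pipelines' is an assumption based on the project layout shown later):
# settings.py
ITEM_PIPELINES = {
    'BaiDuSpider.pipelines.APKDownloadPipeline': 300,  # assumed module path
}
FILES_STORE = 'download_apk'   # base directory that file_path() results are relative to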
Other
Getting values out of an xpath selection:
extract(): returns a list of strings; even a single match comes back as a list like ['ABC'].
extract_first(): returns a single string, the first element of that list (None if nothing matched).
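A quick illustration inside a Scrapy callback (the xpath is a placeholder):
def parse(self, response):
    titles = response.xpath('//h1/text()').extract()        # e.g. ['ABC'] even for a single match
    first = response.xpath('//h1/text()').extract_first()   # e.g. 'ABC', or None if nothing matched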
Case study
Crawler for Baidu's APK search page
# from import_models import *
import scrapy
from lxml import etree
from BaiDuSpider.items import BaiduspiderItem


class BaiDuSpider(scrapy.Spider):
    name = 'BaiDuSpider'
    search_url = u'http://shouji.baidu.com/s?wd='
    keyword = u'京东'
    start_urls = [search_url + keyword]
    pre_url = u'https://shouji.baidu.com'
    start_url = u'http://shouji.baidu.com/s?wd=京东'
    post_url = u'#page'
    page = 1

    def parse(self, response, **kwargs):
        all_elts = response.xpath('//ul[@class="app-list"]/li')
        for elt in all_elts:
            item = BaiduspiderItem()  # note: the item must be created inside the loop, otherwise every iteration shares the same values
            # elt = etree.HTML(elt)
            item['app_name'] = elt.xpath('div/div[@class="info"]/div[1]/a/text()').extract_first().replace('\n', '').replace(' ', '')
            item['app_detail_url'] = self.pre_url + elt.xpath('div/div[@class="info"]/div[1]/a/@href').extract_first()
            item['download_count'] = elt.xpath('div/div[@class="info"]/div[3]/em/span/text()').extract_first()
            item['apk_download_url'] = elt.xpath('div/div[@class="little-install"]/a/@data_url').extract_first()
            item['version'] = elt.xpath('div/div[@class="little-install"]/a/@data_versionname').extract_first()
            app_detail_url = self.pre_url + elt.xpath('div/div[@class="info"]/div[1]/a/@href').extract_first()
            yield scrapy.Request(url=app_detail_url, callback=self.parse_detail, meta={'item': item})  # pass the item along via meta
        if self.page < 11:
            print('next page')
            self.page += 1
            next_page_url = self.start_url + self.post_url + '{}'.format(self.page)
            yield scrapy.Request(url=next_page_url, callback=self.parse)

    def parse_detail(self, response):
        item = response.meta['item']  # pick up the item passed via meta
        item['categories'] = response.xpath('//div[@class="nav"]/span[5]/a/text()').extract_first()
        item['description'] = response.xpath('//div[@class="brief-long"]/p/text()').extract_first()
        yield item
# items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class BaiduspiderItem(scrapy.Item):
    app_name = scrapy.Field()
    app_detail_url = scrapy.Field()
    download_count = scrapy.Field()
    apk_download_url = scrapy.Field()
    categories = scrapy.Field()
    version = scrapy.Field()
    description = scrapy.Field()
# settings.py
# pipelines.py only takes effect after it is enabled in settings.py
# just uncomment ITEM_PIPELINES
ITEM_PIPELINES = {
    'BaiDuSpider.pipelines.BaiduspiderPipeline': 300,
}
# pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
import json
from itemadapter import ItemAdapter


class BaiduspiderPipeline(object):
    def __init__(self):
        self.fp = open('./apk_info.json', 'w+', encoding='utf8')

    def process_item(self, item, spider):
        print('storing item')
        d = dict(item)
        content = json.dumps(d, ensure_ascii=False)
        self.fp.write(content + '\n')
        return item

    def close_spider(self, spider):
        self.fp.close()
Common problems
check_hostname requires server_hostname
urllib3 1.26 changed its core architecture; pinning urllib3 1.25 works around the error:
pip install --upgrade pip
pip install urllib3==1.25.11
Setting a proxy
proxy = {
    'https': 'https://127.0.0.1:7890',  # port of the local VPN/proxy client
    'http': 'http://127.0.0.1:7890'
}
response = requests.get(url, proxies=proxy)
Garbled response data
# original request header: Accept-Encoding = "gzip, deflate, br"
Accept-Encoding = "gzip, deflate"
# drop br so the server does not reply with Brotli-compressed content that the client may fail to decode
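A sketch of applying the trimmed header with requests (the URL is a placeholder):
import requests

headers = {'Accept-Encoding': 'gzip, deflate'}  # no br
resp = requests.get('https://example.com', headers=headers)
resp.encoding = resp.apparent_encoding
print(resp.text[:200])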