Contents

Basic crawlers

Fundamentals

xpath


In some browsers you can right-click an element and copy its XPath directly.

https://www.jianshu.com/p/85a3004b5c06

# Common checks
xpath('//*')  # all nodes
xpath('/*')   # all direct children of the root
# Common usages
xpath('div[@class="xxxx"]')
xpath('div/@href')
xpath('div[1]/li[2]')  # indices start at 1, not 0, and so on
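
A minimal runnable sketch of evaluating these expressions with lxml (the HTML snippet and class names are made up for illustration):

from lxml import etree

html = etree.HTML('<div class="app"><a href="/x">first</a><a href="/y">second</a></div>')
print(html.xpath('//*'))                           # every node in the document
print(html.xpath('//div[@class="app"]/a/@href'))   # ['/x', '/y']
print(html.xpath('//div/a[1]/text()'))             # ['first'] -- indices start at 1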

Request headers

https://github.com/wistbean/learn_python3_spider

https://www.cnblogs.com/gswang/p/7475494.html

# Define the headers
headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    }

GET and POST

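A minimal requests sketch of the two methods (the httpbin.org endpoints are used purely for illustration):

import requests

# GET: parameters travel in the query string
r = requests.get('https://httpbin.org/get', params={'q': 'keyword'})
print(r.status_code, r.json()['args'])

# POST: parameters travel in the request body
r = requests.post('https://httpbin.org/post', data={'name': 'value'})
print(r.status_code, r.json()['form'])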

Packages used (typical imports are sketched after the list)

requests

re

BeautifulSoup (bs4)

threading

xpath
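
Typical imports for the packages above, as a sketch (bs4 and lxml are the pip package names behind BeautifulSoup and XPath support):

import re
import threading

import requests
from bs4 import BeautifulSoup
from lxml import etree   # XPath support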

Examples

应用宝 (Tencent MyApp) crawler

# 应用宝 crawler
import requests
from bs4 import BeautifulSoup

def request_page(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
    except requests.RequestException:
        return None

# i is the sj.qq.com category id; loop over whichever category ids you want to crawl
for i in category_ids:
    baseurl = 'https://sj.qq.com/myapp/category.htm?orgame=1&categoryId={}'.format(i)
    html = request_page(baseurl)
    soup = BeautifulSoup(html, 'lxml')
    app_list = soup.find(class_='app-list clearfix').find_all('li')
    for item in app_list:
        url = item.find(class_='app-info clearfix').find(class_='com-install-btn').get('ex_url')
        # name = item.find(class_='app-info clearfix').find(class_='app-info-desc').find('a').string
        name = item.find(class_='app-info clearfix').find(class_='app-info-desc').find(class_='com-install-btn').get('apk')
        print('Downloading: %s\nPage link: %s\n' % (name, url))
        file_path = 'D:\\app_demo\\apk\\' + str(i) + '\\' + name + '.apk'
        with open(file_path, 'wb') as f:
            # apk = requests.get(url, headers=header(url)).content
            apk = requests.get(url).content
            f.write(apk)

Things to watch out for

Garbled text when crawling

resp = requests.get(url)
resp.encoding = resp.apparent_encoding
html = resp.text

requests.get returns 403

Make the crawler mimic a browser by setting the request headers:

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                        'Chrome/51.0.2704.63 Safari/537.36'}
requests.get(url, headers = headers)

selenium

Simulating clicks in the browser


# Keyboard shortcut
browser.find_element_by_tag_name('html').send_keys(Keys.SHIFT, Keys.CONTROL, Keys.ALT, 'y')      # locate the <html> element so the shortcut goes to the page as a whole

Example

import subprocess
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Feed the device SN to an external tool and capture its output,
# which is then used as the login password below
sn_date = b'210235C2W3F18700225020111201'
# sn_date = input().encode('utf-8')
s = subprocess.Popen('.\\testcpp2.exe', stderr=subprocess.PIPE, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
s.stdin.write(sn_date)
s.stdin.flush()
stdoutinfo, stderrinfo = s.communicate()
# print(stderrinfo, stdoutinfo)
result = ''.join(["", stdoutinfo.decode('utf-8')])
print(result)

# executable_path should point at the geckodriver binary itself
browser = webdriver.Firefox(executable_path="D:\\Tools\\geckodriver-v0.29.1-win64")
browser.get("http://192.168.50.13")
sleep(2)
# The login form sits inside an iframe; switch into it before locating elements
browser.switch_to.frame(browser.find_element_by_xpath('/html[1]/body[1]/iframe[1]'))
browser.find_element_by_id("password").send_keys(result)
browser.find_element_by_class_name("custom-btn-center").click()
sleep(2)
browser.find_element_by_xpath('/html/body/div[13]/div/div[1]/a').click()
browser.find_element_by_xpath('//*[@id="keyTip"]').click()
sleep(2)
# Send the Shift+Ctrl+Alt+Y shortcut to the page as a whole
browser.find_element_by_tag_name('html').send_keys(Keys.SHIFT, Keys.CONTROL, Keys.ALT, 'y')
sleep(2)
browser.find_element_by_xpath('/html/body/div[4]/div[2]/div[3]/div[1]/div[1]/div[28]/a').click()
sleep(2)
browser.find_element_by_xpath('//*[@id="telnetOpen"]').click()

# Headless mode: run the same clicks without opening a browser window
from selenium.webdriver.firefox.options import Options
options = Options()
options.add_argument('--headless')
browser = webdriver.Firefox(executable_path="D:\\Tools\\geckodriver-v0.29.1-win64", options=options)

Things to watch out for

If an element can't be matched by its id: right-click it, copy the XPath, and use find_element_by_xpath().

If the element with that id can't be located yet: sleep a few seconds so the resources finish loading before calling send_keys.

Elements hidden via attributes such as hidden can be clicked with JavaScript (Selenium can't change the hidden state itself); see the sketch after this list.

Switch into the correct frame before locating by XPath (if the frame isn't selected, even a full XPath won't find the element).
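
A minimal sketch of the JavaScript-click workaround, assuming browser is an existing WebDriver instance and the locator is a placeholder:

# Locate the hidden element, then click it via JavaScript instead of element.click()
elem = browser.find_element_by_xpath('//*[@id="someHiddenButton"]')   # placeholder locator
browser.execute_script("arguments[0].click();", elem)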

Timeouts and retries

https://www.jianshu.com/p/fdfb5591536c

# Timeout: (connect timeout, read timeout) in seconds
requests.get(url, timeout=(10, 120))
# Retry up to 3 times on failure
i = 0
while i < 3:
    try:
        data = get_url(url)   # get_url is whatever request wrapper you use
        if data:
            break
        else:
            i += 1
    except:
        i += 1


Multi-process crawlers

Multi-threaded crawlers

Asynchronous crawlers

yield / generators (iterators)

Too lazy for now; I'll organize these the next time a project needs them…

The Scrapy framework

Creating a new Scrapy project

scrapy startproject tutorial

# The generated directory layout
tutorial/
    scrapy.cfg            # deploy configuration file

    tutorial/             # project's Python module, you'll import your code from here
        __init__.py

        items.py          # project items definition file

        middlewares.py    # project middlewares file

        pipelines.py      # project pipelines file

        settings.py       # project settings file

        spiders/          # a directory where you'll later put your spiders
            __init__.py

Create the spider under tutorial/tutorial/spiders:

# Define a class that subclasses scrapy.Spider
class BaiDuSpider(scrapy.Spider):
    # Identifies the spider; it must be unique within the project, so two spiders can't share a name
    name = 'BaiDuSpider'

Or generate it with:

scrapy genspider baidu www.baidu.com

which auto-generates:

import scrapy

class BaiduSpider(scrapy.Spider):
    name = 'baidu'
    allowed_domains = ['www.baidu.com']
    start_urls = ['http://www.baidu.com/']

    def parse(self, response):
        pass

Running multiple spiders

# main.py
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Build a CrawlerProcess from the project settings
process = CrawlerProcess(get_project_settings())

# Queue the spiders to run (by spider name)
process.crawl('project1')
process.crawl('project2')
process.crawl('project3')

# Run them
process.start()

Running every spider in the project

# main.py
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy.spiderloader import SpiderLoader

# Build a CrawlerProcess from the project settings
process = CrawlerProcess(get_project_settings())

# Use a SpiderLoader to enumerate every spider name in the project
spider_loader = SpiderLoader(get_project_settings())

# Queue them all
for spidername in spider_loader.list():
    process.crawl(spidername)

# Run them
process.start()

settings.py configuration

USER_AGENT: commented out by default, but very important; without it the request is easily identified as coming from a script rather than a browser, so set at least a simple Mozilla/5.0.

ROBOTSTXT_OBEY: whether to obey robots.txt; it defaults to True and usually needs to be changed to False, otherwise many pages can't be crawled.

CONCURRENT_REQUESTS: the maximum concurrency, i.e. how many requests Scrapy issues at the same time.

DOWNLOAD_DELAY: the download delay in seconds, which controls how fast the crawl runs; tune it per project, neither too fast nor too slow. The commented-out template value is 3 seconds (fetch one page, then pause 3 seconds); 1 second is usually a good trade-off, and a fraction of a second works when there are many files to fetch.

COOKIES_ENABLED: whether cookies are kept during the crawl; with it enabled, cookies picked up along the way are recorded and reused, which is a very handy switch.

DEFAULT_REQUEST_HEADERS: the default request headers; the USER_AGENT above actually ends up in the request headers too, and you can tailor these headers to whatever you are crawling.

ITEM_PIPELINES: the item pipelines; 300 is the priority, and the lower the number, the higher the priority.


settings.py is also where the log level and the path where logs are stored are configured; a short settings.py sketch covering these options follows.
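
A minimal settings.py sketch for the options above (the values are illustrative, not recommendations for any particular site, and the pipeline path is a placeholder):

# settings.py (excerpt)
USER_AGENT = 'Mozilla/5.0'          # identify as a browser
ROBOTSTXT_OBEY = False              # don't skip pages disallowed by robots.txt
CONCURRENT_REQUESTS = 16            # maximum number of concurrent requests
DOWNLOAD_DELAY = 1                  # seconds to wait between requests
COOKIES_ENABLED = True              # keep cookies across requests
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}
ITEM_PIPELINES = {
    'tutorial.pipelines.TutorialPipeline': 300,   # lower number = higher priority
}
LOG_LEVEL = 'INFO'                  # log verbosity
LOG_FILE = 'scrapy.log'             # where the log is written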

# split() is required: it turns the command string into the list ['scrapy', 'crawl', 'baidu']
from scrapy import cmdline
cmdline.execute('scrapy crawl baidu'.split())

scrapy crawl <spider-name> -o <output>.json

For JSON output, add the following to settings.py to set the export encoding, otherwise the file will be garbled:

FEED_EXPORT_ENCODING='utf-8'

from scrapy import cmdline

cmdline.execute('scrapy crawl baidu -o baidu.csv'.split())
Scrapy run-time limit (set custom_settings inside the spider class):

custom_settings = {
    "CLOSESPIDER_TIMEOUT": 3600   # close the spider automatically after 3600 seconds
}

pipelines.py

After an Item is collected in a Spider, it is passed to the Item Pipeline; the pipeline components process the Item in the order they are defined.

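A minimal sketch of the pipeline interface (ExamplePipeline is a hypothetical name; a full working pipeline appears in the case study below):

class ExamplePipeline:
    def open_spider(self, spider):
        # called once when the spider starts (open files, connections, ...)
        pass

    def process_item(self, item, spider):
        # called for every item, in ITEM_PIPELINES priority order
        return item   # return the item so later pipelines also see it

    def close_spider(self, spider):
        # called once when the spider finishes
        pass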

Downloading files

You can subclass the classes under scrapy/pipelines (files, media, images) and override their methods to implement downloading.

import logging
import os

import scrapy
from scrapy.pipelines.files import FilesPipeline, FileException
from scrapy.utils.request import referer_str

logger = logging.getLogger(__name__)

class APKDownloadPipeline(FilesPipeline):
    # FilesPipeline calls get_media_requests() to decide what to download
    def get_media_requests(self, item, info):
        if 'app_os' in item.keys() and ('os' in item['app_os'] or 'OS' in item['app_os']):
            pass
        else:
            # FILES_URLS_FIELD is expected to hold a list of URLs
            for file_url in item.get(self.FILES_URLS_FIELD, []):
                yield scrapy.Request(file_url, meta=item)

    def file_path(self, request, response=None, info=None, *, item=None):
        filename = item['app_name'] + item['version'] + '.apk'
        filename = filename.replace(' ', '').replace(':', "_")
        if not os.path.exists("download_apk/{}".format(item['name'])):
            os.makedirs("download_apk/{}".format(item['name']))
        return '{}/{}'.format(item['name'], filename)

    def media_downloaded(self, response, request, info, *, item=None):
        referer = referer_str(request)

        if response.status != 200:
            logger.warning(
                'File (code: %(status)s): Error downloading file from '
                '%(request)s referred in <%(referer)s>',
                {'status': response.status,
                 'request': request, 'referer': referer},
                extra={'spider': info.spider}
            )
            raise FileException('download-error')

        if not response.body:
            logger.warning(
                'File (empty-content): Empty file from %(request)s referred '
                'in <%(referer)s>: no-content',
                {'request': request, 'referer': referer},
                extra={'spider': info.spider}
            )
            raise FileException('empty-content')

        status = 'cached' if 'cached' in response.flags else 'downloaded'
        logger.debug(
            'File (%(status)s): Downloaded file from %(request)s referred in '
            '<%(referer)s>',
            {'status': status, 'request': request, 'referer': referer},
            extra={'spider': info.spider}
        )
        self.inc_stats(info.spider, status)

        try:
            path = self.file_path(request, response=response, info=info, item=item)
            checksum = self.file_downloaded(response, request, info, item=item)
        except FileException as exc:
            logger.warning(
                'File (error): Error processing file from %(request)s '
                'referred in <%(referer)s>: %(errormsg)s',
                {'request': request, 'referer': referer, 'errormsg': str(exc)},
                extra={'spider': info.spider}, exc_info=True
            )
            raise
        except Exception as exc:
            logger.error(
                'File (unknown-error): Error processing file from %(request)s '
                'referred in <%(referer)s>',
                {'request': request, 'referer': referer},
                exc_info=True, extra={'spider': info.spider}
            )
            raise FileException(str(exc))

        return {'url': request.url, 'path': path, 'md5': checksum, 'status': status}

Miscellaneous

Getting values out of XPath selectors:

extract(): returns a list that may contain several strings; even with a single match it still returns a list such as ['ABC'].

extract_first(): returns a single string, the first element of that list.
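
A quick illustration inside a Scrapy parse() callback (the selector is a placeholder):

titles = response.xpath('//a/text()').extract()        # e.g. ['ABC', 'DEF']
first = response.xpath('//a/text()').extract_first()   # e.g. 'ABC', or None when nothing matches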

Case study

A crawler for Baidu's APK search page

# from import_models import *
import scrapy
from lxml import etree

from BaiDuSpider.items import BaiduspiderItem

class BaiDuSpider(scrapy.Spider):
    name = 'BaiDuSpider'
    search_url = u'http://shouji.baidu.com/s?wd='
    keyword = u'京东'
    start_urls = [search_url + keyword]
    pre_url = u'https://shouji.baidu.com'
    start_url = u'http://shouji.baidu.com/s?wd=京东'
    post_url = u'#page'
    page = 1

    def parse(self, response, **kwargs):
        all_elts = response.xpath('//ul[@class="app-list"]/li')
        for elt in all_elts:
            item = BaiduspiderItem()    # Note: create the item inside the loop; if it's created outside, every yielded item ends up with the same values
            # elt = etree.HTML(elt)
            item['app_name'] = elt.xpath('div/div[@class="info"]/div[1]/a/text()').extract_first().replace('\n', '').replace(' ', '')
            item['app_detail_url'] = self.pre_url + elt.xpath('div/div[@class="info"]/div[1]/a/@href').extract_first()
            item['download_count'] = elt.xpath('div/div[@class="info"]/div[3]/em/span/text()').extract_first()
            item['apk_download_url'] = elt.xpath('div/div[@class="little-install"]/a/@data_url').extract_first()
            item['version'] = elt.xpath('div/div[@class="little-install"]/a/@data_versionname').extract_first()
            app_detail_url = self.pre_url + elt.xpath('div/div[@class="info"]/div[1]/a/@href').extract_first()
            yield scrapy.Request(url=app_detail_url, callback=self.parse_detail, meta={'item': item})   # pass the item along via meta

        if self.page < 11:
            print('next page')
            self.page += 1
            next_page_url = self.start_url + self.post_url + '{}'.format(self.page)
            yield scrapy.Request(url=next_page_url, callback=self.parse)

    def parse_detail(self, response):
        item = response.meta['item']    # retrieve the item passed via meta
        item['categories'] = response.xpath('//div[@class="nav"]/span[5]/a/text()').extract_first()
        item['description'] = response.xpath('//div[@class="brief-long"]/p/text()').extract_first()
        yield item
# items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy

class BaiduspiderItem(scrapy.Item):
    app_name = scrapy.Field()
    app_detail_url = scrapy.Field()
    download_count = scrapy.Field()
    apk_download_url = scrapy.Field()
    categories = scrapy.Field()
    version = scrapy.Field()
    description = scrapy.Field()
# settings.py
# settings.py must be configured before pipelines.py takes effect
# just uncomment ITEM_PIPELINES
ITEM_PIPELINES = {
   'BaiDuSpider.pipelines.BaiduspiderPipeline': 300,
}
# pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
import json

from itemadapter import ItemAdapter

class BaiduspiderPipeline(object):
    def __init__(self):
        self.fp = open('./apk_info.json', 'w+', encoding='utf8')

    def process_item(self, item, spider):
        print('Saving item')
        d = dict(item)
        content = json.dumps(d, ensure_ascii=False)
        self.fp.write(content + '\n')
        return item

    def close_spider(self, spider):
        self.fp.close()

Scrapy framework

Scrapy tutorial

xpath extract

Crawling second-level sub-pages

Crawling second-level sub-pages while also extracting data from the first-level page

How to use pipelines

Common problems

check_hostname requires server_hostname

urllib3 1.26 changed its core architecture; downgrading to urllib3 1.25 resolves the problem:

pip install --upgrade pip
pip install urllib3==1.25.11

Setting a proxy

proxy = {
    'https': 'https://127.0.0.1:7890',  # local port exposed by your proxy/VPN client
    'http': 'http://127.0.0.1:7890'
}
response = requests.get(url, proxies=proxy)

Garbled response data

# Drop "br" from the Accept-Encoding request header: if the server replies with Brotli
# and no Brotli decoder is installed, the response body comes back garbled.
headers = {
    # 'Accept-Encoding': 'gzip, deflate, br'
    'Accept-Encoding': 'gzip, deflate'
}