Content outline

Basic usage

value not in rule.id_pairs[key]   # membership test: True if value is absent from the container

# conditional expression: <expr_if_true> if <condition> else <expr_if_false>
print(a) if a == 2 else print('-2')

# set <-> list conversion
>>> a = set("1")
>>> a
{'1'}            # Python 3 repr; Python 2 prints set(['1'])
>>> list(a)
['1']

int()

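A few common int() conversions for reference (a quick sketch; the values in the comments are standard CPython behavior):

int('42')         # 42: parse a decimal string
int('ff', 16)     # 255: parse with base 16
int(3.9)          # 3: truncates toward zero
# int('12abc')    # would raise ValueError: the whole string must be a valid number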

any()

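A quick sketch of any(): it returns True as soon as one element of the iterable is truthy, and False for an empty iterable:

any([0, '', None])      # False: no truthy element
any([0, 'x', None])     # True: 'x' is truthy
any([])                 # False: empty iterable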

Passing arguments to main

import sys

def main(argv):
    # argv[0] is the script name; argv[1:] are the command-line arguments
    print(argv[1])
    print(argv[2])
    print(argv[3])

if __name__ == "__main__":
    main(sys.argv)

Referencing a module-level variable inside a function

x = 1

def func():
    global x    # rebind the module-level x instead of creating a new local variable
    x = 2

func()
print(x)    # 2

Regular expressions

https://docs.python.org/zh-cn/3/library/re.html

https://www.runoob.com/python/python-reg-expressions.html

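For quick reference, a small sketch of the most commonly used re functions (the values in the comments are standard behavior):

import re

m = re.match(r'(\d+)-(\d+)', '2021-10')     # match only at the start of the string
if m:
    print(m.group(1), m.group(2))           # '2021' '10'

re.search(r'\d+', 'abc 123')                # first match anywhere in the string
re.findall(r'\d+', '1 a 22 b 333')          # ['1', '22', '333']
re.sub(r'\s+', ' ', 'a   b\tc')             # 'a b c'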

Extracting URLs

import re

# use a raw string so the backslashes are not treated as escape sequences
reg = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
url = re.findall(reg, data)

package:subprocess

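A minimal subprocess sketch (the commands shown are only examples):

import subprocess

# Python 3.7+: run a command and capture its output
result = subprocess.run(['ls', '-l'], capture_output=True, text=True)
print(result.returncode)
print(result.stdout)

# raises CalledProcessError automatically if the command fails
out = subprocess.check_output(['echo', 'hello'], text=True)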

package:logging

python logging
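
A minimal logging setup sketch (the format string is just an illustration):

import logging

logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s %(levelname)s %(name)s: %(message)s',
)
logger = logging.getLogger(__name__)

logger.debug('debug message')
logger.info('info message')
logger.warning('warning message')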

package:optparse (OptionParser)

https://blog.csdn.net/lwnylslwnyls/article/details/8199454

Official documentation

Example (an excerpt from a tool's CLI parsing; OptionParser comes from optparse, and Path, logger, androconf, __version__ and main are defined elsewhere in the tool):

usage = "Usage: %prog (-i example.apk | -f /folder/) [-t time] [-r report_folder] \n\t[-a avd_name] [-s screenshot_folder] [-v]"
parser = OptionParser(usage)
parser.add_option("-f", '--folder', dest='folder', help='the folder containing dozens of APK files to be analyzed', action='store', type='string')
parser.add_option("-i", '--input', dest='input', help='path of the APK file to be analyzed', action='store', type='string')
parser.add_option("-t", '--timeout', dest='timeout', help='maximum analysis time limit in minutes', action='store', type='int')
parser.add_option("-r", '--report', dest='report', help='folder where the output report is stored', action='store', type='string')
parser.add_option("-a", '--avd', dest='avd', help='the chosen Android Virtual Device (AVD) name', action='store', type='string')
parser.add_option("-s", '--screenshot', dest='screenshot', help='path where the output screenshots are stored', action='store', type='string')
parser.add_option("-v", '--version', dest='version', help='show version information and exit', action='store_true', default=False)

(options, _) = parser.parse_args()

if options.version:
    # only show version
    logger.info("Static Analyzer: " + __version__)
    logger.info("Android Guard: " + androconf.ANDROGUARD_VERSION)
    sys.exit(0)

if options.input:
    # single apk analysis first
    options.input = Path(options.input)
    logger.debug("The provided APK file is: %s" % options.input.absolute())
elif options.folder:
    # multiple apks analysis second
    options.folder = Path(options.folder)
    if not options.folder.is_dir():
        logger.error("Please specify the folder option as a folder path")
        sys.exit(-1)
    logger.debug("The provided APK folder is: %s" % options.folder.absolute())
else:
    # user must provide option "-i" or "-f"
    parser.print_help()
    sys.exit(-1)

if options.report:
    options.report = Path(options.report)
    # print(options.report)
    if not options.report.exists():
        options.report.mkdir(parents=True)
    logger.debug("The specified report output folder is: %s" % options.report.absolute())
else:
    options.report = Path('report/')
    if not options.report.exists():
        options.report.mkdir(parents=True)

if options.screenshot:
    options.screenshot = Path(options.screenshot)
    if not options.screenshot.exists():
        options.screenshot.mkdir(parents=True)
    logger.debug("The specified screenshot output path is: %s" % options.screenshot.absolute())

# TODO: the remaining CLI args still need to be handled
main(options, _)

Creating a folder

import os

if not os.path.exists(path):
    os.makedirs(path)     # creates every intermediate sub-folder in the path
    # os.mkdir(path)      # creates a single folder only (its parent must already exist)
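
The same thing with pathlib (the style used in the OptionParser example above); exist_ok avoids the explicit exists() check:

from pathlib import Path
Path(path).mkdir(parents=True, exist_ok=True)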

Moving a file

import shutil
shutil.move('./file.txt', '/data/file.txt')

Deleting all files under a folder

import os
import shutil

# os.remove(path) deletes a single file, where path is the file's path, e.g.:
# os.remove(r"E:\code\practice\data\1.py")      # delete a file
# os.rmdir(r"E:\code\practice\data\2")          # delete a folder (empty folders only)
# shutil.rmtree(r"E:\code\practice\data\2")     # delete a folder and everything inside it

def del_file(path_data):
    # os.listdir(path_data) returns the names of everything directly under the directory
    for i in os.listdir(path_data):
        file_data = os.path.join(path_data, i)   # full path of each entry
        if os.path.isfile(file_data):            # if it is a file, delete it
            os.remove(file_data)
        else:                                    # if it is a folder, recurse into it
            del_file(file_data)

path_data = r"E:\code\practice\data"
del_file(path_data)

https://blog.csdn.net/weixin_42341608/article/details/90481677

Computing hashes (MD5)

https://www.cnblogs.com/xiaodekaixin/p/11203857.html

import hashlib

# string: md5() needs bytes, so encode the string first
md5hash = hashlib.md5(s.encode('utf-8')).hexdigest()

# file
with open(file_path, 'rb') as fp:
    data = fp.read()
file_md5 = hashlib.md5(data).hexdigest()
# script that computes hashes for a single file, or for every file in a chosen folder
import hashlib
import json
import os

apk_md5 = {}

if __name__ == '__main__':
    mode = input("input file or dir: ")
    if mode == 'file':
        file_path = input("input file path: ")
        file_name = os.path.basename(file_path)
        with open(file_path, 'rb') as fp:
            data = fp.read()
        print(file_name)
        file_md5 = hashlib.md5(data).hexdigest()
        print(file_md5 + '\n')
        apk_md5[file_name] = file_md5
    elif mode == 'dir':
        dir_path = input("input dir path with \\: ")
        num = 0
        for file_name in os.listdir(dir_path):
            num += 1
            file_path = dir_path + file_name
            with open(file_path, 'rb') as fp:
                data = fp.read()
            print(file_name)
            file_md5 = hashlib.md5(data).hexdigest()
            print(file_md5 + '\n')
            apk_md5[file_name] = file_md5

        # write the collected hashes once, after the loop has processed every file
        json_path = dir_path + 'hash.json'
        with open(json_path, 'w') as fp:
            json.dump(apk_md5, fp, indent=4)
        print("total: %s" % num)

package:os

# os
import os
file_name = os.path.basename(file_path)   # file name without the directory part
dir_path = os.path.dirname(file_path)     # directory part of the path
# iterate over the entries directly under a directory
for file_name in os.listdir(dir_path):
    ...   # process each entry here

def file_path(file_dir):
    for root, dirs, files in os.walk(file_dir):
        print(root, end=' ')    # current directory path
        print(dirs, end=' ')    # all sub-directories directly under it
        print(files)            # all non-directory files directly under it

https://www.cnblogs.com/Zhanxueyou/p/6610053.html
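
To turn the names yielded by os.walk or os.listdir into usable paths, os.path.join is the usual tool (a small sketch; file_dir is the directory to walk, as above):

import os

for root, dirs, files in os.walk(file_dir):
    for name in files:
        full_path = os.path.join(root, name)   # directory part + file name
        print(full_path)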

json dump

# dump a Python object to a JSON file
import json
with open(json_path, 'w') as fp:
    json.dump(apk_md5, fp, indent=4)

With indent=4 the dumped file is pretty-printed JSON, one key-value pair per line.

https://www.cnblogs.com/shapeL/p/9037670.html

json load

import json

# read a dict from a JSON file (json.load takes a file object, not a path)
with open('1.json') as fp:
    js_dict = json.load(fp)

# parse a JSON string into a dict
js_dict = json.loads(json_str)

Generators and iterators

yield


import sys

def fibonacci(n):  # generator function: Fibonacci sequence
    a, b, counter = 0, 1, 0
    while True:
        if counter > n:
            return
        yield a
        a, b = b, a + b
        counter += 1
f = fibonacci(10)  # f is an iterator, produced by calling the generator function

while True:
    try:
        print(next(f), end=" ")
    except StopIteration:
        sys.exit()

https://www.runoob.com/python3/python3-iterator-generator.html

import itertools as its

words = "1234567890QWERTYUIOPASDFGHJKLZXCVBNM"

# every length-5 arrangement of the characters (with repetition), in order
r = its.product(words, repeat=5)
for i in r:
    print(list(i))

import itertools as its

words = {'1', '2', '3'}
r = its.product(words, repeat=5)
for i in r:
    # join each tuple of characters back into a single string
    print(''.join(i))

telnet

https://www.cnblogs.com/russellluo/archive/2012/02/11/2346501.html


print(tn.read_all()) has to be placed after the telnet session is closed.

telnetlib
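
A minimal telnetlib sketch based on the pattern above (host, port and credentials are placeholders; note that telnetlib was removed from the standard library in Python 3.13):

import telnetlib

tn = telnetlib.Telnet('192.0.2.10', 23, timeout=10)   # placeholder host/port
tn.read_until(b'login: ')
tn.write(b'user\n')
tn.read_until(b'Password: ')
tn.write(b'secret\n')
tn.write(b'exit\n')            # ask the remote side to end the session
print(tn.read_all())           # read_all() returns once the connection is closed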

The __init__.py file

https://www.cnblogs.com/tp1226/p/8453854.html


Writing to a file

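A minimal sketch of writing text to a file ('out.txt' is just an example name):

with open('out.txt', 'w', encoding='utf-8') as fp:
    fp.write('first line\n')
    fp.writelines(['second line\n', 'third line\n'])

# append instead of overwrite
with open('out.txt', 'a', encoding='utf-8') as fp:
    fp.write('appended line\n')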

Decompressing with gzip

import gzip

f_name = file_name.replace(".gz", "")
# read the decompressed content and write it to a new file
with gzip.GzipFile(file_name) as g_file, open(f_name, "wb") as out_file:
    out_file.write(g_file.read())

Dictionaries

lists['20111201'][sn_date[:-8]] = getPasswd(sn_date)   # assigning a value into a nested dict


# one key mapped to many values: initialize the value as an empty list, then append
if tmp not in dic:
    dic[tmp] = []
dic[tmp].append(sn_date)
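
The same one-key-to-many-values pattern can be written with collections.defaultdict, which creates the empty list automatically:

from collections import defaultdict

dic = defaultdict(list)
dic[tmp].append(sn_date)    # no need to check "tmp not in dic" first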

Checking whether a key exists

if '1' in dicts:    # "in dicts" is equivalent to "in dicts.keys()"
    ...

Deduplicating a list

list(set(array))
# set uses hashing and __eq__, much faster than a nested-loop scan; note the original order is not preserved

Working with virtual environments

virtualenv --python=/usr/bin/python venv_name
# activate
source venv_name/bin/activate
# deactivate
deactivate

Checking how a string ends

s.endswith(('1', '2'))  # True if s ends with '1' or '2'; returns a bool

Python bytecode

>>> import dis
>>> def work():
...     return '1'
...
>>> dis.dis(work)

https://docs.python.org/zh-cn/3/library/dis.html

File open modes

https://blog.csdn.net/ztf312/article/details/47259805

String prefixes: u, r, b

https://blog.csdn.net/u010496169/article/details/70045895

u/U: a unicode string (the default in Python 3)

r/R: a raw string whose backslashes are not treated as escapes

b: a bytes literal
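
A small illustration of the three prefixes:

u'中文'            # unicode string (the default string type in Python 3)
r'C:\new\dir'      # raw string: \n stays as backslash + n, not a newline
b'\x68\x69'        # bytes literal, equal to b'hi'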

File read(), readline(), readlines()

https://www.jianshu.com/p/a672f39287c4

read([size]) reads up to size characters/bytes from the current position; without size it reads to the end of the file, and it returns a string object.

readline(), as the name suggests, reads one line per call, so it uses little memory and suits large files; it also returns a string object.

readlines() reads all remaining lines into a list, one line per element, which can consume a lot of memory on large files.
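
A short sketch of the three methods ('data.txt' is a hypothetical file):

with open('data.txt', 'r', encoding='utf-8') as fp:
    head = fp.read(10)       # at most 10 characters from the current position
    line = fp.readline()     # the next line, including its trailing '\n'
    rest = fp.readlines()    # all remaining lines as a list of strings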

try,except,else/finally


https://www.fujieace.com/python/except-exception-ex-invalid-syntax.html

The Python 2 form "except Exception, ex" is a SyntaxError in Python 3; use "except Exception as ex" instead.

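A minimal sketch of the four clauses, using the Python 3 "as" syntax:

try:
    value = int('42')
except ValueError as ex:      # Python 3 syntax for catching and naming the exception
    print('parse failed:', ex)
else:                         # runs only when no exception was raised
    print('parsed:', value)
finally:                      # always runs, exception or not
    print('done')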

reload

python2:

reload(sys)

python3:

import importlib
importlib.reload(sys)

Calling reload(sys) directly in Python 3 raises a NameError, because reload() is no longer a builtin.

lambda

def g(x):
    return x+1
# equivalent to
g = lambda x: x+1
# e.g. (in Python 3, filter returns an iterator, so wrap it in list() to see the values)
print(list(filter(lambda x: x % 3 == 0, foo)))

filter

filter(function, iterable)

# e.g.:
def is_odd(n):
    return n % 2 == 1
newlist = list(filter(is_odd, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))
# [1, 3, 5, 7, 9]  (in Python 3 filter returns an iterator, so list() is needed to see the values)
# or
newlist = list(filter(lambda n: n % 2 == 1, newlist))

sys.path.insert()

import sys
sys.path.insert(0, "./xxxxx_dir")
# 0 is the position: the inserted path is searched before the existing entries

__init__.py

If a directory contains this file, the directory is recognized as a package (a "module package").

If the directory moduledir has an __init__.py, importing the package (for example with import moduledir) is enough to load the module defined by that __init__.py.
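
A hypothetical layout to illustrate (moduledir and helper are made-up names):

# moduledir/
#     __init__.py      # this file makes moduledir importable as a package
#     helper.py
#
# from code that has moduledir's parent directory on sys.path:
# import moduledir               # runs moduledir/__init__.py
# from moduledir import helper   # imports moduledir/helper.py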

Modules under the project root cannot be found

PyCharm automatically puts the project root on the import path, but when the script is run from the command line or a terminal, modules imported from the project root may not be found. The fix:

import sys
sys.path.insert(0, "../")
# inserts the path into sys.path at position 0 so it is searched first; sys.path.append also works

scrapy: a web crawling framework

Creating a new Scrapy project

scrapy startproject tutorial

# the project layout after creation
tutorial/
    scrapy.cfg            # deploy configuration file

    tutorial/             # project's Python module, you'll import your code from here
        __init__.py

        items.py          # project items definition file

        middlewares.py    # project middlewares file

        pipelines.py      # project pipelines file

        settings.py       # project settings file

        spiders/          # a directory where you'll later put your spiders
            __init__.py

Create a spider under tutorial/tutorial/spiders

# create a class that inherits from scrapy's Spider class
class BaiDuSpider(scrapy.Spider):
    # identifies the spider; the name must be unique within the project,
    # so different spiders cannot share the same name
    name = 'BaiDuSpider'

Or:

scrapy genspider baidu www.baidu.com

which automatically generates:

import scrapy

class BaiduSpider(scrapy.Spider):
    name = 'baidu'
    allowed_domains = ['www.baidu.com']
    start_urls = ['http://www.baidu.com/']

    def parse(self, response):
        pass

settings.py configuration

USER_AGENT: commented out by default, but very important; without it the request is easily recognized as coming from a script. Simply writing something like Mozilla/5.0 is enough.

ROBOTSTXT_OBEY: whether to obey robots.txt. The default is True and it usually needs to be changed to False, otherwise many pages cannot be crawled.

CONCURRENT_REQUESTS: the maximum number of concurrent requests, i.e. how many crawl requests may run at the same time.

DOWNLOAD_DELAY: the download delay in seconds, which controls how fast the spider crawls. Tune it for your project, neither too fast nor too slow; the default is 3 seconds (crawl one page, then wait 3 seconds). 1 second is a good trade-off, and a fraction of a second is fine when there are many files to fetch.

COOKIES_ENABLED: whether to keep cookies. Disabled by default; when enabled, cookies are recorded during crawling, which is a very useful option.

DEFAULT_REQUEST_HEADERS: the default request headers. The USER_AGENT above is actually sent as one of these headers, and they can be adjusted to match the content being crawled.

ITEM_PIPELINES: the item pipelines; 300 is the priority, and the lower the number, the higher the priority.


The log level and the path where logs are stored can also be configured in settings.py.
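
For example (a sketch; the values are arbitrary), in settings.py:

LOG_LEVEL = 'WARNING'         # only log warnings and errors
LOG_FILE = 'logs/scrapy.log'  # write the log to a file instead of the console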

# split() is required here to turn the command string into a list: ['scrapy', 'crawl', 'baidu']
from scrapy import cmdline
cmdline.execute('scrapy crawl baidu'.split())

scrapy crawl <spider_name> -o <output>.json

For JSON output, add the following to settings.py to set the export encoding, otherwise non-ASCII characters end up garbled:

FEED_EXPORT_ENCODING='utf-8'

from scrapy import cmdline

cmdline.execute('scrapy crawl baidu -o baidu.csv'.split())

pipelines.py

The full pipeline used in the case study below opens a file in __init__, writes each item as a JSON line in process_item, and closes the file in close_spider; remember to enable it through ITEM_PIPELINES in settings.py.

Miscellaneous

Getting values with XPath:

extract(): returns a list; even when there is only one matching string, the result still looks like ['ABC'].

extract_first(): returns a single string, the first element of that list.

Case study

A spider for the Baidu APK search page

# from import_models import *
import scrapy
from lxml import etree

from BaiDuSpider.items import BaiduspiderItem

class BaiDuSpider(scrapy.Spider):
    name = 'BaiDuSpider'
    search_url = u'http://shouji.baidu.com/s?wd='
    keyword = u'京东'
    start_urls = [search_url + keyword]
    pre_url = u'https://shouji.baidu.com'
    start_url = u'http://shouji.baidu.com/s?wd=京东'
    post_url = u'#page'
    page = 1

    def parse(self, response, **kwargs):
        all_elts = response.xpath('//ul[@class="app-list"]/li')
        for elt in all_elts:
            item = BaiduspiderItem()    # note: create the item inside the loop, otherwise every yielded item ends up with the same values
            # elt = etree.HTML(elt)
            item['app_name'] = elt.xpath('div/div[@class="info"]/div[1]/a/text()').extract_first().replace('\n', '').replace(' ', '')
            item['app_detail_url'] = self.pre_url + elt.xpath('div/div[@class="info"]/div[1]/a/@href').extract_first()
            item['download_count'] = elt.xpath('div/div[@class="info"]/div[3]/em/span/text()').extract_first()
            item['apk_download_url'] = elt.xpath('div/div[@class="little-install"]/a/@data_url').extract_first()
            item['version'] = elt.xpath('div/div[@class="little-install"]/a/@data_versionname').extract_first()
            app_detail_url = self.pre_url + elt.xpath('div/div[@class="info"]/div[1]/a/@href').extract_first()
            yield scrapy.Request(url=app_detail_url, callback=self.parse_detail, meta={'item': item})   # pass the item along via meta

        if self.page < 11:
            print('next page')
            self.page += 1
            next_page_url = self.start_url + self.post_url + '{}'.format(self.page)
            yield scrapy.Request(url=next_page_url, callback=self.parse)

    def parse_detail(self, response):
        item = response.meta['item']    # receive the item passed via meta
        item['categories'] = response.xpath('//div[@class="nav"]/span[5]/a/text()').extract_first()
        item['description'] = response.xpath('//div[@class="brief-long"]/p/text()').extract_first()
        yield item
# items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy

class BaiduspiderItem(scrapy.Item):
    app_name = scrapy.Field()
    app_detail_url = scrapy.Field()
    download_count = scrapy.Field()
    apk_download_url = scrapy.Field()
    categories = scrapy.Field()
    version = scrapy.Field()
    description = scrapy.Field()
# settings.py
# settings.py must be configured before pipelines.py takes effect
# just uncomment ITEM_PIPELINES
ITEM_PIPELINES = {
   'BaiDuSpider.pipelines.BaiduspiderPipeline': 300,
}
# pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
import json

from itemadapter import ItemAdapter

class BaiduspiderPipeline(object):
    def __init__(self):
        self.fp = open('./apk_info.json', 'w+', encoding='utf8')

    def process_item(self, item, spider):
        print('saving item')
        d = dict(item)
        content = json.dumps(d, ensure_ascii=False)
        self.fp.write(content + '\n')
        return item

    def close_spider(self, spider):
        self.fp.close()

scrapy framework

scrapy tutorial

xpath extract()

crawling second-level detail pages

crawling second-level detail pages while also extracting data from the first-level page

how to use pipelines

unittest: unit testing framework