Python学习笔记(常用扩展)

Python 常用扩展库的使用笔记,包括 MySQL 连接池、日志、Excel 读写、HTML/XPath 解析、urllib 等。

python

数据库连接池

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84

import pymysql

from twisted.enterprise import adbapi

from pymysql import cursors



# 连接池方式保存数据库

class JianshuTwistedPipeline(object):
    """Pipeline that writes scraped articles to the database through a Twisted adbapi pool."""

    def __init__(self):
        """Create the adbapi connection pool using the pymysql driver."""
        # Connection settings are passed straight to the driver as keyword arguments
        self.dbpool = adbapi.ConnectionPool(
            'pymysql',
            host='127.0.0.1',
            port=3306,
            user='root',
            password='',
            database='python',
            charset='utf8',
            cursorclass=cursors.DictCursor,
        )

    def insert_item(self, cursor, item):
        """Insert one article row into the ``jianshu`` table.

        Args:
            cursor: Object exposing ``execute(sql, params)``; supplied by the
                pool when this method is run via ``runInteraction``.
            item: Mapping that provides the four article fields read below.

        Raises:
            KeyError: If ``item`` lacks one of the required fields.
        """
        sql = """INSERT INTO jianshu(article_id,article_title,article_content,origin_url) VALUES (%s, %s, %s, %s)"""

        # Collect the values in the same order as the column list in the statement
        columns = ('article_id', 'article_title', 'article_content', 'origin_url')
        values = tuple(item[name] for name in columns)

        # Parameterized execution: the driver fills in the placeholders
        cursor.execute(sql, values)

    def handle_error(self, error, item, spider):
        """Print a banner followed by the error passed to the errback."""
        banner = '**********error**********'
        print(banner)
        print(error)

    def process_item(self, item, spider):
        """Schedule the insert on the pool and hand the item back unchanged.

        The object returned by ``runInteraction`` only gets an errback
        attached; this method does not wait on it before returning.
        """
        pending = self.dbpool.runInteraction(self.insert_item, item)

        # Route failures of the scheduled insert to handle_error
        pending.addErrback(self.handle_error, item, spider)

        return item

日志

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20



import logging



# Fetch the logger named 'jianshuspider' and store it on the instance.
# NOTE(review): `self` is not defined in this snippet — presumably these
# lines belong inside a method of a class; confirm.
self.logger = logging.getLogger('jianshuspider')



# Handler that writes log records to the file log.txt
handle = logging.FileHandler('log.txt')



# Attach the file handler to the logger
self.logger.addHandler(handle)



# NOTE(review): Logger.info() expects a message argument and none is passed
# here — looks like a placeholder for `info('some message')`; confirm.
# NOTE(review): no level is set on the logger in this snippet; INFO records
# may be filtered out under the default configuration — confirm.
self.logger.info()

pyexcel

1
2
3
4
5
6

import pyexcel



# Save `data` to the Excel file 'xxx.xls'.
# NOTE(review): `data` is not defined in this snippet. The pyexcel docs show
# save_as() being called with keyword arguments (e.g. array=...,
# dest_file_name=...) — confirm that this positional form is accepted.
pyexcel.save_as(data, 'xxx.xls')

读入 HTML 文件后,可以用 XPath 进行查询

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28

# Import the parser (the original `import lxml import etree` was a syntax error)
from lxml import etree

# Read the whole file into a string; the context manager closes the handle
with open('xxx.html') as f:
    text = f.read()

# Parse the text as HTML
selector = etree.HTML(text)

# Run an XPath query against the parsed document
# ('****' is a placeholder; substitute a real XPath expression)
selector.xpath('****')

urllib库

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14

from urllib import request



# r = request.urlopen('https://www.amazon.com/product-reviews/B07211W6X2?sortBy=recent&filterByStar=three_star')



# Open the URL as a context manager so the response is closed once it has
# been read, instead of being left open
with request.urlopen('https://www.baidu.com') as r:
    # Print the whole response body
    print(r.read())
  • read() 读取全部

  • readline() 读取一行

  • readlines() 读取所有行,以列表形式返回(每行是列表中的一个元素)

  • getcode() 获取状态码

urlretrieve

urlretrieve(url, filename):下载指定 url 的内容,并保存到本地文件 filename

url编码解码

urlencode 对数据编码

parse_qs 对查询字符串解码,返回字典(每个键对应的值是一个列表)

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26

from urllib import parse



# Sample query parameters
params = dict(name='王二小', age=19)

# Encode the mapping into a query string
res = parse.urlencode(params)
print(res)

# Decode the query string back into a dict; each value comes back as a list
res = parse.parse_qs(res)
print(res)

url拆分

  • parse.urlparse

  • parse.urlsplit

urlparse 与 urlsplit 的区别是:urlparse 的解析结果中多一个 params 属性,urlsplit 的结果中则没有

parsel

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14

import parsel



# Fetch the page with the given proxies and headers.
# NOTE(review): `requests`, `url`, `proxy` and `headers` are not defined or
# imported in this snippet — presumably provided by surrounding code; confirm.
response = requests.get(url, proxies=proxy, headers=headers)



# Build a parsel Selector from the response text
selector = parsel.Selector(response.text)



# NOTE(review): xpath() is called without a query here — looks like a
# placeholder for a real XPath expression; confirm.
selector.xpath()

celery

1
2
3
4
5
6
7
8

# Start a Celery worker for app.celeries.listing_celery (-A) with log level
# info (-l) and the eventlet pool (-P)
celery -A app.celeries.listing_celery worker -l info -P eventlet



# Run in the background on CentOS
nohup celery -A app.celeries.listing_celery worker -l info &

gunicorn

1
2
3
4
5
6

# Serve the `app` object from module `api`, bound to 0.0.0.0:5000 (-b)
gunicorn -b 0.0.0.0:5000 api:app



# Same app with 4 workers (-w), 16 threads (--threads), the gevent worker
# class (-k), a timeout of 1000 (-t), an access log at ./log/gun.log, and
# daemon mode (-D)
# NOTE(review): --threads may have no effect with the gevent worker class —
# confirm against the gunicorn settings documentation
gunicorn -w 4 -b 0.0.0.0:5000 --threads 16 -k gevent -t 1000 --access-logfile ./log/gun.log -D api:app