Python常用扩展使用笔记,包括mysql连接池、Excel、日志等。
数据库连接池
import pymysql
from twisted.enterprise import adbapi
from pymysql import cursors
# 连接池方式保存数据库
class JianshuTwistedPipeline(object):
    """Item pipeline that stores items in MySQL through a Twisted connection pool.

    Exposes the ``process_item(item, spider)`` interface (presumably used as a
    Scrapy item pipeline -- confirm against the project settings). Each item
    is inserted into the ``jianshu`` table via ``adbapi.ConnectionPool``.
    """

    def __init__(self, dbparams=None):
        """Create the connection pool.

        Args:
            dbparams: Optional mapping of pymysql connection keyword
                arguments. Any key given here overrides the matching default
                below; omitted keys keep their default value.
        """
        params = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': '',
            'database': 'python',
            'charset': 'utf8',
            'cursorclass': cursors.DictCursor,
        }
        if dbparams:
            params.update(dbparams)
        # Build the connection pool on top of the pymysql driver.
        self.dbpool = adbapi.ConnectionPool('pymysql', **params)

    def insert_item(self, cursor, item):
        """Insert one item into the ``jianshu`` table.

        Args:
            cursor: Object with an ``execute(sql, args)`` method; supplied by
                ``runInteraction`` when called from ``process_item``.
            item: Mapping with the keys ``article_id``, ``article_title``,
                ``article_content`` and ``origin_url``.
        """
        sql = """INSERT INTO jianshu(article_id,article_title,article_content,origin_url) VALUES (%s, %s, %s, %s)"""
        # Values are passed separately so the driver does the quoting.
        cursor.execute(
            sql,
            (
                item['article_id'],
                item['article_title'],
                item['article_content'],
                item['origin_url'],
            ),
        )

    def handle_error(self, error, item, spider):
        """Errback for a failed insert: print a banner followed by the error.

        ``item`` and ``spider`` are accepted because ``process_item``
        registers them with the errback, but they are not used here.
        """
        print('**********error**********')
        print(error)

    def process_item(self, item, spider):
        """Schedule the insert for ``item`` and hand the item back.

        The insert is submitted through ``dbpool.runInteraction``; this
        method does not wait on the returned deferred, it only attaches
        ``handle_error`` as the errback.

        Returns:
            The same ``item`` object that was passed in.
        """
        deferred = self.dbpool.runInteraction(self.insert_item, item)
        # Report failures through handle_error instead of leaving them unhandled.
        deferred.addErrback(self.handle_error, item, spider)
        return item
日志
import logging

# NOTE: this fragment assigns to ``self`` and is therefore meant to be placed
# inside a method of the class that owns the logger.
self.logger = logging.getLogger('jianshuspider')
# Without an explicit level the logger falls back to the root logger's
# default (WARNING) and info() records would be discarded.
self.logger.setLevel(logging.INFO)
# Send the records to log.txt.
handle = logging.FileHandler('log.txt')
self.logger.addHandler(handle)
# info() requires the message as its first argument.
self.logger.info('message to log')
pyexcel
import pyexcel

# save_as() is driven by keyword arguments: the source data and the
# destination file must both be named explicitly.
# NOTE(review): ``array=`` assumes ``data`` is a list of rows; use
# ``records=`` / ``adict=`` instead if it is a list of dicts or a dict -- confirm.
pyexcel.save_as(array=data, dest_file_name='xxx.xls')
读入HTML后用xpath操作
# Import the HTML parser.
from lxml import etree

# Read the whole file into a string; the with-block closes the file afterwards.
with open('xxx.html') as f:
    text = f.read()
# Parse the content as HTML.
selector = etree.HTML(text)
# '****' is a placeholder: replace it with the real XPath expression.
selector.xpath('****')
urllib库
from urllib import request

# r = request.urlopen('https://www.amazon.com/product-reviews/B07211W6X2?sortBy=recent&filterByStar=three_star')
# The response is used as a context manager so the connection is closed
# once the body has been read.
with request.urlopen('https://www.baidu.com') as r:
    print(r.read())
read():读取全部内容
readline():读取一行
readlines():读取所有行,每行作为列表的一个元素
getcode():获取状态码
urlretrieve
urlretrieve(url, filename)
指定url并保存到本地
url编码解码
urlencode
对数据编码
parse_qs
对数据解码
from urllib import parse

# Sample query parameters, including a non-ASCII value.
params = dict(name='王二小', age=19)

# Encode the mapping into a percent-encoded query string and show it.
query_string = parse.urlencode(params)
print(query_string)

# Decode the query string back into a dict of lists and show it.
res = parse.parse_qs(query_string)
print(res)
url拆分
parse.urlparse
parse.urlsplit
urlparse与urlsplit的区别是:urlparse的解析结果中多一个params属性,urlsplit的解析结果中则没有
parsel
import parsel
import requests

# url, proxy and headers are expected to be defined by the surrounding code.
response = requests.get(url, proxies=proxy, headers=headers)
# Build a selector over the response body.
selector = parsel.Selector(response.text)
# xpath() requires a query string; '//title/text()' is only an example --
# replace it with the expression you need.
selector.xpath('//title/text()')
celery
# Start a worker for the app.celeries.listing_celery application with
# log level "info" (-l) and the eventlet pool (-P).
celery -A app.celeries.listing_celery worker -l info -P eventlet
# Run in the background on CentOS
nohup celery -A app.celeries.listing_celery worker -l info &
gunicorn
# Serve the `app` object from the `api` module, bound (-b) to 0.0.0.0:5000.
gunicorn -b 0.0.0.0:5000 api:app
# Same application with 4 workers (-w), the gevent worker class (-k), a
# 1000-second timeout (-t), an access log at ./log/gun.log, run as a daemon (-D).
# NOTE(review): the Gunicorn docs say --threads only affects the gthread
# worker type -- confirm whether it has any effect together with -k gevent.
gunicorn -w 4 -b 0.0.0.0:5000 --threads 16 -k gevent -t 1000 --access-logfile ./log/gun.log -D api:app