抓取北邮人论坛招聘版及跳槽版每日帖数
互联网寒冬下的暗中观察
前两周趋势图
模拟登录
# Browser-like HTTP request headers shared by the crawler's requests.
headers = {
    # Content negotiation.
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, compress',
    'Accept-Language': 'en-us;q=0.5,en;q=0.3',
    # Connection handling.
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    # Marks the request as an AJAX (XMLHttpRequest) call.
    'X-Requested-With': 'XMLHttpRequest',
    # Desktop Chrome user agent string.
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'
    ),
}
def login():
    """Log in to the BYR forum and return the session used for the login.

    Posts the account id and password as form data to the forum's AJAX
    login endpoint, sending the module-level ``headers`` with the request.

    Returns:
        requests.Session: the session the login request was sent on, so
        that later requests can reuse whatever state it accumulated.

    Raises:
        requests.exceptions.RequestException: if the request cannot be
            completed, times out, or the server answers with an HTTP
            error status.
    """
    url = 'https://bbs.byr.cn/user/ajax_login.json'
    login_data = {'id': '****', 'passwd': '******'}
    session = requests.Session()
    # Without a timeout requests may block indefinitely on a stalled
    # connection, which would hang an unattended crawler run.
    response = session.post(url, data=login_data, headers=headers, timeout=10)
    # Fail here on HTTP-level errors instead of handing back a session
    # that was never logged in.
    # NOTE(review): this only checks the HTTP status; the JSON body may
    # still report a rejected login -- confirm the response schema.
    response.raise_for_status()
    return session
解析页面
使用BeautifulSoup快速定位html标签地址
def parse_page(content):
    """Extract the user and post counters from a board page.

    The counters are read from the text of the first
    ``<span class="n-left">`` element: the number after "共有", the number
    after the following "最高" (both terminated by "人"), and the number
    after the following "帖数" (terminated by the next "<").

    Args:
        content: HTML markup of the page, handed to BeautifulSoup.

    Returns:
        tuple: ``(cur_user_num, max_user_num, post_num)`` as ints.

    Raises:
        IndexError: if the page has no ``<span class="n-left">`` element.
        ValueError: if an expected marker is missing or the text between
            two markers is not an integer.
    """
    soup = BeautifulSoup(content, features="lxml")
    spans = soup.find_all('span', {'class': 'n-left'})
    if not spans:
        raise IndexError('no <span class="n-left"> element found in page')
    text = str(spans[0])

    # Each search resumes where the previous counter ended, so the markers
    # must appear in this order inside the span.
    cur_user_num, pos = _int_between(text, "共有", "人", 0)
    max_user_num, pos = _int_between(text, "最高", "人", pos)
    post_num, pos = _int_between(text, "帖数", "<", pos)
    return cur_user_num, max_user_num, post_num


def _int_between(text, prefix, suffix, start):
    """Return (int found between prefix and suffix, index of suffix).

    The search for ``prefix`` begins at ``start``. Raises ValueError when
    either marker is missing, instead of slicing with a -1 index.
    """
    prefix_idx = text.find(prefix, start)
    if prefix_idx == -1:
        raise ValueError('marker %r not found in %r' % (prefix, text))
    value_start = prefix_idx + len(prefix)
    suffix_idx = text.find(suffix, value_start)
    if suffix_idx == -1:
        raise ValueError('marker %r not found after %r in %r'
                         % (suffix, prefix, text))
    return int(text[value_start:suffix_idx]), suffix_idx
存储至mysql
#-*- coding: utf-8 -*-
"""
File Name: hire_info_db.py
Author: ce39906
Mail: ce39906@163.com
Created Time: 2019-04-13 07:46:29
"""
from db import DB
class HireInfo:
    """Plain value holder for one board's statistics on one date.

    The five constructor arguments are stored unchanged as attributes of
    the same names; no validation or conversion is performed.
    """

    def __init__(self, board, date, cur_user_num, max_user_num, cur_post_num):
        # Which board the numbers belong to, and when they were taken.
        self.board, self.date = board, date
        # User counters.
        self.cur_user_num, self.max_user_num = cur_user_num, max_user_num
        # Post counter.
        self.cur_post_num = cur_post_num
class HireInfoDB(DB):
    """Query helper for the ``hire_info`` table, built on the ``DB`` base.

    Queries are run through ``self.execute``, which this class does not
    define (presumably inherited from ``DB``).

    NOTE(review): both queries are assembled by %-interpolating values
    straight into the SQL text, so only trusted values may be passed in.
    Switching to parameterized queries depends on what ``DB.execute``
    accepts -- confirm before changing.
    """

    def __init__(self):
        DB.__init__(self)
        # Name of the table every query in this class targets.
        self.table = 'hire_info'

    def insert(self, hire_info):
        """Insert one record into the table.

        ``hire_info`` must expose ``board``, ``date``, ``cur_user_num``,
        ``max_user_num`` and ``cur_post_num``; the three counters are
        formatted with ``%d`` and therefore have to be numeric.
        """
        columns = "(board,date,cur_user_num,max_user_num,cur_post_num)"
        template = "insert into %s" + columns + "values('%s','%s',%d, %d, %d)"
        values = (
            self.table,
            hire_info.board,
            hire_info.date,
            hire_info.cur_user_num,
            hire_info.max_user_num,
            hire_info.cur_post_num,
        )
        self.execute(template % values)

    def select_by_board(self, board, days):
        """Return the rows of ``board`` dated at most ``days`` days before now.

        The result is whatever ``self.execute`` returns for the query.
        """
        template = (
            "select * from %s "
            "where board = '%s' and to_days(now()) - to_days(date) <= %d "
        )
        return self.execute(template % (self.table, board, days))
绘制折线图
def plot(data, board):
    """Draw two line series for one board and save them as ``<board>.png``.

    Args:
        data: sequence of rows; items 2, 3 and 5 of every row are used as
            the date, the ``cur_user_num`` value and the ``cur_post_num``
            value (presumably the column order of the ``hire_info`` table
            -- confirm against the query that produces the rows).
        board: used as the chart title and as the output file name stem.
    """
    frame = pd.DataFrame({
        'dates': [str(row[2]) for row in data],
        'cur_user_num': [row[3] for row in data],
        'cur_post_num': [row[5] for row in data],
    })

    # Clear the current axes first, then draw both series onto them.
    plt.cla()
    axes = plt.gca()

    # (column, color, marker, legend label, extra keyword arguments);
    # only the first series carries the chart title.
    series = (
        ('cur_user_num', 'red', 'o', 'online_user_num', {'title': board}),
        ('cur_post_num', 'green', 'x', 'post_num', {}),
    )
    for column, color, marker, label, extra in series:
        frame.plot(kind='line',
                   x='dates',
                   y=column,
                   color=color,
                   marker=marker,
                   label=label,
                   ax=axes,
                   **extra)

    plt.savefig(board + '.png')
上传至腾讯云
#-*- coding: utf-8 -*-
"""
File Name: cos_client.py
Author: ce39906
Mail: ce39906@163.com
Created Time: 2019-04-12 15:54:31
"""
from qcloud_cos import CosConfig
from qcloud_cos import CosS3Client
import sys
import logging
class TencentCloudClient:
    """Small wrapper around ``CosS3Client`` for uploading local files.

    The credentials, region and bucket name are fixed inside the class.
    """

    def __init__(self):
        cos_config = CosConfig(
            Region='ap-beijing',
            SecretId='***************************',
            SecretKey='*****************',
            Token=None,
            Scheme='https',
        )
        self.client = CosS3Client(cos_config)
        # Not used inside this class; presumably the public base URL under
        # which uploaded objects can be reached -- confirm with callers.
        self.url_prefix = 'https://myblog-********.cos.ap-beijing.myqcloud.com/'

    def upload_file(self, filename):
        """Upload the local file ``filename`` to the bucket.

        The same ``filename`` string is used as the local path and as the
        object key. MD5 is disabled for the upload.
        """
        self.client.put_object_from_local_file(
            Bucket='myblog-*******',
            LocalFilePath=filename,
            Key=filename,
            EnableMD5=False,
        )
完整代码
https://github.com/ce39906/self-practices/tree/master/pycode/crawl_byr_forum