互联网寒冬下的暗中观察

前两周趋势图

pic
pic

模拟登录

# Request headers sent with the login POST; they mimic a desktop Chrome
# browser issuing an XMLHttpRequest.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, compress',
    'Accept-Language': 'en-us;q=0.5,en;q=0.3',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36',
}

def login(timeout=10):
    """Post the login form to the BYR forum and return the session used.

    Args:
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The ``requests.Session`` the login request was sent on, so that any
        cookies set by the server are reused by later requests.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    url = 'https://bbs.byr.cn/user/ajax_login.json'
    login_data = {'id': '****', 'passwd': '******'}
    session = requests.Session()
    # NOTE(review): the response body is not inspected, so a rejected login
    # is not detected here — confirm whether callers need that check.
    session.post(url, data=login_data, headers=headers, timeout=timeout)
    return session

解析页面

使用 BeautifulSoup 快速定位 HTML 标签，并从中提取当前在线人数、最高在线人数和帖数

def _extract_between(text, prefix, suffix, start=0):
    """Return the text between ``prefix`` and the next ``suffix``.

    The search for ``prefix`` begins at index ``start``.

    Returns:
        A ``(value, suffix_idx)`` tuple, where ``suffix_idx`` is the index of
        the terminating ``suffix`` so the caller can continue from there.

    Raises:
        ValueError: If ``prefix`` or ``suffix`` cannot be found.
    """
    prefix_idx = text.find(prefix, start)
    if prefix_idx == -1:
        raise ValueError("marker %r not found in page summary" % prefix)
    value_start = prefix_idx + len(prefix)
    suffix_idx = text.find(suffix, value_start)
    if suffix_idx == -1:
        raise ValueError("terminator %r not found after %r" % (suffix, prefix))
    return text[value_start:suffix_idx], suffix_idx


def parse_page(content):
    """Extract the user and post counters from a board page.

    The counters are read from the first ``<span class="n-left">`` element:
    the number between "共有" and "人", the number between "最高" and "人",
    and the number between "帖数" and the next "<". The markers are searched
    in that order, each one starting where the previous one ended.

    Args:
        content: HTML of the page, as accepted by ``BeautifulSoup``.

    Returns:
        A tuple ``(cur_user_num, max_user_num, post_num)`` of ints.

    Raises:
        IndexError: If the page has no ``<span class="n-left">`` element.
        ValueError: If a marker is missing or the extracted text is not an
            integer.
    """
    soup = BeautifulSoup(content, features="lxml")
    spans = soup.find_all('span', {'class': 'n-left'})
    if not spans:
        raise IndexError('no <span class="n-left"> element found in page')
    # Work on the serialized tag so the closing "<" can terminate the last field.
    text = str(spans[0])

    cur_user_num, pos = _extract_between(text, "共有", "人")
    max_user_num, pos = _extract_between(text, "最高", "人", pos)
    post_num, _ = _extract_between(text, "帖数", "<", pos)

    return int(cur_user_num), int(max_user_num), int(post_num)

存储至 MySQL

#-*- coding: utf-8 -*-
"""
File Name: hire_info_db.py
Author: ce39906
Mail: ce39906@163.com
Created Time: 2019-04-13 07:46:29
"""
from db import DB

class HireInfo:
    """Plain record holding one row destined for the ``hire_info`` table.

    The five constructor arguments are stored unchanged as attributes of the
    same names.
    """

    def __init__(self, board, date, cur_user_num, max_user_num, cur_post_num):
        (self.board,
         self.date,
         self.cur_user_num,
         self.max_user_num,
         self.cur_post_num) = (board, date, cur_user_num, max_user_num, cur_post_num)


class HireInfoDB(DB):
    """Read/write helper for the ``hire_info`` table, built on ``DB``.

    NOTE(review): both queries are assembled with ``%`` string formatting, so
    values are not escaped. If ``board`` or ``date`` can ever come from
    untrusted input, switch to parameterized queries — whether ``DB.execute``
    supports bound parameters needs to be confirmed.
    """

    def __init__(self):
        DB.__init__(self)
        self.table = 'hire_info'

    def insert(self, hire_info):
        """Insert one record.

        ``hire_info`` must expose ``board``, ``date``, ``cur_user_num``,
        ``max_user_num`` and ``cur_post_num``; the three counters are
        formatted with ``%d``.
        """
        template = ("insert into %s"
                    "(board,date,cur_user_num,max_user_num,cur_post_num)"
                    "values('%s','%s',%d, %d, %d)")
        values = (
            self.table,
            hire_info.board,
            hire_info.date,
            hire_info.cur_user_num,
            hire_info.max_user_num,
            hire_info.cur_post_num,
        )
        self.execute(template % values)

    def select_by_board(self, board, days):
        """Return whatever ``execute`` yields for rows of ``board`` whose
        ``date`` is at most ``days`` days before now."""
        template = ("select * from %s "
                    "where board = '%s' and to_days(now()) - to_days(date) <= %d ")
        return self.execute(template % (self.table, board, days))

绘制折线图

def plot(data, board):
    """Draw the user-count and post-count lines for a board and save a PNG.

    Args:
        data: Sequence of rows; index 2 is used as the date (converted with
            ``str``), index 3 as the user count and index 5 as the post count.
        board: Used as the chart title and as the output file stem; the image
            is written to ``board + '.png'``.
    """
    frame = pd.DataFrame({
        'dates': [str(row[2]) for row in data],
        'cur_user_num': [row[3] for row in data],
        'cur_post_num': [row[5] for row in data],
    })

    # Clear the current axes and draw both series onto it.
    plt.cla()
    axes = plt.gca()

    series_styles = (
        dict(title=board, y='cur_user_num', color='red', marker='o',
             label='online_user_num'),
        dict(y='cur_post_num', color='green', marker='x', label='post_num'),
    )
    for style in series_styles:
        frame.plot(kind='line', x='dates', ax=axes, **style)

    plt.savefig(board + '.png')

上传至腾讯云

#-*- coding: utf-8 -*-
"""
File Name: cos_client.py
Author: ce39906
Mail: ce39906@163.com
Created Time: 2019-04-12 15:54:31
"""
from qcloud_cos import CosConfig
from qcloud_cos import CosS3Client
import sys
import logging

class TencentCloudClient:
    """Thin wrapper around ``CosS3Client`` for uploading local files."""

    def __init__(self):
        # NOTE(review): the credentials are literals in source; consider
        # loading them from configuration or the environment instead.
        settings = dict(
            Region='ap-beijing',
            SecretId='***************************',
            SecretKey='*****************',
            Token=None,
            Scheme='https',
        )
        self.client = CosS3Client(CosConfig(**settings))
        self.url_prefix = 'https://myblog-********.cos.ap-beijing.myqcloud.com/'

    def upload_file(self, filename):
        """Upload the local file ``filename``, using the same string as the
        object key, with MD5 disabled."""
        request = {
            'Bucket': 'myblog-*******',
            'LocalFilePath': filename,
            'Key': filename,
            'EnableMD5': False,
        }
        self.client.put_object_from_local_file(**request)

完整代码

https://github.com/ce39906/self-practices/tree/master/pycode/crawl_byr_forum