Experiment: Scraping the Body Text of the Novel 白夜行

Requirements:

  • Scrape the chapter URLs and push them into a Redis list named url_queue (a minimal sketch of this queue pattern follows the list)
  • Fetch each chapter's content and save it to MongoDB
  • Parse the pages with XPath or BeautifulSoup (bs4)
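
The heart of the first requirement is a FIFO queue built on a Redis list:
producers rpush URLs onto the tail and consumers lpop them off the head. A
minimal, self-contained sketch of that round trip, assuming a local Redis on
the default port (the example URLs are placeholders):

import redis

r = redis.StrictRedis(decode_responses=True)
r.rpush('url_queue', 'https://example.com/ch1', 'https://example.com/ch2')
print(r.lpop('url_queue'))   # 'https://example.com/ch1' -- FIFO order
print(r.llen('url_queue'))   # 1 item left in the queue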

Design

import time
import redis
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient

# Configure MongoDB
mongo_client = MongoClient()
database = mongo_client['baiyexing']
collection = database['content']

# Configure Redis; decode_responses=True makes lpop return str rather than
# bytes, so popped URLs can be passed straight to requests
redis_client = redis.StrictRedis(decode_responses=True)

# content_list buffers scraped titles and chapters until they are written
# to MongoDB in one batch; the URL queue itself lives entirely in Redis
content_list = []


# Issue a GET request and return the decoded HTML (timeout guards against
# a hung connection)
def query(url):
    return requests.get(url, timeout=10).content.decode()


# Push a chapter URL onto the url_queue list in Redis
def add_content_url(url):
    redis_client.rpush('url_queue', url)


# Pop the next URL from the front of the queue (returns None when empty)
def pop_content_url():
    return redis_client.lpop('url_queue')


# Write all buffered chapters to MongoDB in one batch; insert_many raises
# on an empty list, so skip the write when nothing was scraped
def save_to_db():
    if content_list:
        collection.insert_many(content_list)


# Scrape the table of contents and push every chapter URL into the Redis queue
def get_content_url():
    page = query('https://www.qisuu.la/du/24/24704/')
    soup = BeautifulSoup(page, 'lxml')
    toc_links = soup.find_all(class_='pc_list')[1].find_all('a')
    for link in toc_links:
        add_content_url('https://www.qisuu.la/du/24/24704/' + link['href'])


# Fetch one chapter page and extract its title and body text
def get_content(url):
    source = query(url)
    page = BeautifulSoup(source, 'lxml')
    title = page.find('h1').get_text()
    chapter = page.find(id='content1').get_text(strip=True)
    content_list.append({'title': title, 'chapter': chapter})


# Entry point: build the queue, drain it one URL at a time, then persist
def run():
    start = time.time()
    get_content_url()
    while redis_client.llen('url_queue') > 0:
        url = pop_content_url()
        get_content(url)
    save_to_db()
    end = time.time()
    print(f'Elapsed: {end - start:.2f}s')


if __name__ == '__main__':
    run()
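
The requirements allow XPath as an alternative to bs4. As a rough sketch of
that route, the chapter step above could be rewritten with lxml; the //h1 and
//*[@id="content1"] selectors mirror the bs4 lookups and assume the pages keep
the same structure (get_content_xpath is a hypothetical name, not part of the
design above):

from lxml import html as lxml_html

# XPath variant of get_content; assumes the same page layout as the bs4 version
def get_content_xpath(url):
    tree = lxml_html.fromstring(query(url))
    title = tree.xpath('//h1/text()')[0]
    # join the text nodes inside the #content1 element
    chapter = ''.join(tree.xpath('//*[@id="content1"]//text()')).strip()
    content_list.append({'title': title, 'chapter': chapter})

After a run, both stores can be sanity-checked from the same clients; a quick
check, assuming the default local instances configured above:

# The queue should be fully drained, and MongoDB should hold one
# document per scraped chapter.
print(redis_client.llen('url_queue'))      # expect 0
print(collection.count_documents({}))      # expect the chapter count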