7 Python Web Scraping Mini-Cases Explained (with Source Code), Part 1

This article walks through 7 small Python web scraping cases: scraping Douban movies, Qiushibaike jokes, Baidu Tieba threads, Zhihu answers, Weibo posts, Taobao products, and JD.com products. Each case comes with complete, commented source code so readers can study and experiment with it.

1. Scraping Douban Movies

This case scrapes the Douban Movie Top 250 list, collecting each film's title, rating, director, cast, and one-line description. It uses the requests library to send the HTTP request, BeautifulSoup to parse the HTML, and the re module to extract fields from the text. The code below fetches a single page of the list; a pagination sketch follows it.

import requests
from bs4 import BeautifulSoup
import re

def get_html(url):
    # Send a browser-like User-Agent so the site does not reject the request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    return response.text

def get_movies(html):
    soup = BeautifulSoup(html, 'html.parser')
    movies = []
    # Each film on the Top 250 list sits in a <div class="item"> block
    for item in soup.find_all('div', class_='item'):
        movie = {}
        hd = item.find('div', class_='hd')
        movie['title'] = hd.find('span', class_='title').get_text()
        movie['score'] = item.find('span', class_='rating_num').get_text()
        # The director/cast line contains a <br>, so .string would be None;
        # take the full text and pull the fields out with regexes instead
        info = item.find('div', class_='bd').p.get_text()
        director = re.findall(r'导演:\s*(.*?)\s*主演', info)
        actors = re.findall(r'主演:\s*(.*?)\n', info)
        movie['director'] = director[0] if director else ''
        movie['actors'] = actors[0] if actors else ''
        # A few films have no one-line quote, so guard against a missing tag
        inq = item.find('span', class_='inq')
        movie['desc'] = inq.get_text() if inq else ''
        movies.append(movie)
    return movies

if __name__ == '__main__':
    url = 'https://movie.douban.com/top250'
    html = get_html(url)
    movies = get_movies(html)
    for movie in movies:
        print(movie)
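
The Top 250 list spans ten pages of 25 films each, paged by the start query parameter, while the code above fetches only the first page. A minimal pagination sketch, assuming the markup is identical on every page:

import time

# Walk all ten pages: the start parameter advances by 25 per page
all_movies = []
for start in range(0, 250, 25):
    page_url = f'https://movie.douban.com/top250?start={start}'
    all_movies.extend(get_movies(get_html(page_url)))
    time.sleep(1)  # pause between requests to stay polite
print(len(all_movies))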

2. Scraping Qiushibaike

This case scrapes jokes from Qiushibaike, collecting each joke's text and vote count. It uses requests to fetch the page and a single regular expression to extract both fields from the raw HTML, which is fast but brittle: any change to the page markup silently breaks the pattern.

import requests
import re

def get_html(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    return response.text

def get_jokes(html):
    # re.S lets .*? span newlines; the two groups capture the joke text
    # and the vote count from their fixed positions in the markup
    pattern = re.compile('<div class="content">\n<span>(.*?)</span>\n</div>.*?<div class="stats">\n<span class="stats-vote"><i class="number">(.*?)</i>', re.S)
    jokes = re.findall(pattern, html)
    return jokes

if __name__ == '__main__':
    url = 'https://www.qiushibaike.com/text/'
    html = get_html(url)
    jokes = get_jokes(html)
    for joke in jokes:
        print(joke[0], joke[1])
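
The captured joke text can still contain <br/> tags and leftover whitespace, and the text channel pages by URL path. A small cleanup-and-pagination sketch, assuming a /text/page/N/ URL scheme:

def clean_joke(fragment):
    # Turn <br/> line breaks back into newlines and trim the whitespace
    return re.sub(r'<br\s*/?>', '\n', fragment).strip()

for page in range(1, 4):  # assumed URL scheme: /text/page/N/
    page_url = f'https://www.qiushibaike.com/text/page/{page}/'
    for content, votes in get_jokes(get_html(page_url)):
        print(clean_joke(content), votes)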

3. Scraping Baidu Tieba

This case scrapes the thread list of a Baidu Tieba forum, collecting each thread's title, author, and reply count. It uses requests to fetch the page and a regular expression to extract the fields; a paging sketch follows the code.

import requests
import re

def get_html(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    return response.text

def get_posts(html):
    # The five groups capture: thread id, title attribute, link text,
    # author span (raw HTML, may contain an <a> tag), and reply count
    pattern = re.compile(r'<a rel="noreferrer" href="/p/(.*?)" title="(.*?)" target="_blank" class="j_th_tit ">(.*?)</a>.*?<span class="frs-author-name-wrap">(.*?)</span>.*?<span class="threadlist_rep_num center_text">(.*?)</span>', re.S)
    posts = re.findall(pattern, html)
    return posts

if __name__ == '__main__':
    url = 'https://tieba.baidu.com/f?kw=python&ie=utf-8&pn=0'
    html = get_html(url)
    posts = get_posts(html)
    for post in posts:
        print(post[1], post[3], post[4])
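
Tieba pages its thread list with the pn parameter in steps of 50, and the author field above is captured as raw HTML. A sketch that walks a few pages and strips the leftover tags, assuming that paging scheme holds:

import time

def strip_tags(fragment):
    # Crude tag removal for fields the regex captured as raw HTML
    return re.sub(r'<[^>]+>', '', fragment).strip()

for pn in range(0, 150, 50):  # pn advances by 50 per page
    page_url = f'https://tieba.baidu.com/f?kw=python&ie=utf-8&pn={pn}'
    for post in get_posts(get_html(page_url)):
        print(post[1], strip_tags(post[3]), post[4])
    time.sleep(1)  # pause between requests to stay polite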

4. Scraping Zhihu

This case scrapes a Zhihu question page, collecting the question's title and description plus the content and author of each answer. It uses requests to fetch the page and BeautifulSoup to parse the HTML. Zhihu usually blocks anonymous requests, so in practice you need to send logged-in cookies; a sketch of that follows the code.

import requests
from bs4 import BeautifulSoup

def get_html(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    return response.text

def get_question(html):
    soup = BeautifulSoup(html, 'html.parser')
    question = {}
    # Both nodes contain nested tags, for which .string returns None,
    # so use get_text() instead
    question['title'] = soup.find('h1', class_='QuestionHeader-title').get_text()
    question['desc'] = soup.find('div', class_='QuestionHeader-detail').get_text()
    return question

def get_answers(html):
    soup = BeautifulSoup(html, 'html.parser')
    answers = []
    # Only the answers present in the initial HTML show up here; the rest
    # are loaded by JavaScript as the page scrolls
    for div in soup.find_all('div', class_='List-item'):
        answer = {}
        answer['content'] = div.find('div', class_='RichContent-inner').get_text()
        answer['author'] = div.find('span', class_='UserLink AuthorInfo-name').get_text()
        answers.append(answer)
    return answers

if __name__ == '__main__':
    url = 'https://www.zhihu.com/question/37787176'
    html = get_html(url)
    question = get_question(html)
    answers = get_answers(html)
    print(question)
    for answer in answers:
        print(answer)
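
Anonymous requests to Zhihu are often redirected to a login wall, in which case the selectors above find nothing. A minimal sketch of forwarding a logged-in session cookie copied from your browser (the cookie string is a placeholder, not a real value):

def get_html_with_cookie(url, cookie):
    # Reuse the browser's logged-in session by forwarding its Cookie header
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Cookie': cookie,  # copy the real value from the browser's DevTools
    }
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    return response.text

html = get_html_with_cookie('https://www.zhihu.com/question/37787176', 'z_c0=...; _xsrf=...')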

5. Scraping Weibo

This case scrapes a Weibo user's page, collecting the user's screen name and bio plus the text, repost count, and comment count of each post. Because Weibo builds its pages with JavaScript (and gates most profiles behind login), it uses Selenium to drive a real browser, then BeautifulSoup and regular expressions to extract fields from the rendered HTML.

from selenium import webdriver
from bs4 import BeautifulSoup
import re
import time

def get_html(url):
    # Weibo renders its pages with JavaScript, so drive a real browser
    # and give the scripts a moment to finish before reading the DOM
    driver = webdriver.Chrome()
    driver.get(url)
    time.sleep(5)  # crude wait; see the explicit-wait sketch below
    html = driver.page_source
    driver.quit()
    return html

def get_user(html):
    soup = BeautifulSoup(html, 'html.parser')
    user = {}
    user['name'] = soup.find('h1', class_='username').string
    user['desc'] = soup.find('div', class_='pf_intro').string
    return user

def get_weibos(html):
    soup = BeautifulSoup(html, 'html.parser')
    weibos = []
    for div in soup.find_all('div', class_='WB_detail'):
        weibo = {}
        weibo['content'] = div.find('div', class_='WB_text W_f14').get_text().strip()
        # Repost/comment counts appear as 转发[N] / 评论[N] in the markup;
        # guard the lookups since some posts may lack them
        reposts = re.findall(r'转发\[(.*?)\]', str(div))
        comments = re.findall(r'评论\[(.*?)\]', str(div))
        weibo['reposts'] = reposts[0] if reposts else '0'
        weibo['comments'] = comments[0] if comments else '0'
        weibos.append(weibo)
    return weibos

if __name__ == '__main__':
    url = 'https://weibo.com/u/2830678474'
    html = get_html(url)
    user = get_user(html)
    weibos = get_weibos(html)
    print(user)
    for weibo in weibos:
        print(weibo)
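
The fixed time.sleep above is fragile: too short and the feed has not rendered yet, too long and every fetch is slow. Selenium's explicit waits block until a given element actually appears; a sketch of the same fetch with WebDriverWait, assuming WB_detail marks the rendered posts:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def get_html_waiting(url, timeout=15):
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # Block until at least one post container is present in the DOM
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'WB_detail')))
        return driver.page_source
    finally:
        driver.quit()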

6. Scraping Taobao Products

This case scrapes Taobao search results, collecting each product's title, price, monthly sales, and shop name. Taobao renders its results with JavaScript and guards them with aggressive bot detection, so the case uses Selenium to drive a real browser, then BeautifulSoup and a regular expression to extract fields from the rendered HTML.

from selenium import webdriver
from bs4 import BeautifulSoup
import re
import time

def get_html(url):
    driver = webdriver.Chrome()
    driver.get(url)
    time.sleep(3)  # let the JavaScript-rendered results appear
    html = driver.page_source
    driver.quit()
    return html

def get_products(html):
    soup = BeautifulSoup(html, 'html.parser')
    products = []
    # CSS selectors match tags that carry both classes even when the class
    # attribute holds extra classes that an exact class_ string would miss
    for div in soup.select('div.item.J_MouserOnverReq'):
        product = {}
        product['title'] = div.find('a', class_='J_ClickStat').get_text(strip=True)
        product['price'] = div.select_one('div.g_price strong').get_text()
        # Guard the sales lookup since some listings omit the field
        sales = re.findall(r'月销量:(.*?)笔', str(div))
        product['sales'] = sales[0] if sales else ''
        product['shop'] = div.select_one('a.shopname').get_text(strip=True)
        products.append(product)
    return products

if __name__ == '__main__':
    url = 'https://s.taobao.com/search?q=python&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20211022&ie=utf8'
    html = get_html(url)
    products = get_products(html)
    for product in products:
        print(product)
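
Taobao's result grid partly loads as you scroll, and fresh browser sessions frequently hit a login or captcha wall. A hedged sketch that scrolls down in steps before reading the page source:

import time
from selenium import webdriver

def get_html_scrolled(url, pause=2):
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        time.sleep(pause)
        # Scroll in quarters so lazily loaded items render before we read the DOM
        for fraction in (0.25, 0.5, 0.75, 1.0):
            driver.execute_script(
                'window.scrollTo(0, document.body.scrollHeight * arguments[0]);',
                fraction)
            time.sleep(pause)
        return driver.page_source
    finally:
        driver.quit()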

7. Scraping JD Products

This case scrapes JD.com search results, collecting each product's title, price, comment count, and shop name. JD lazily renders the lower half of each results page, so the case uses Selenium to drive a real browser and scroll the page, then BeautifulSoup to extract fields from the rendered HTML.

from selenium import webdriver
from bs4 import BeautifulSoup
import time

def get_html(url):
    # JD lazily renders the lower half of each results page, so scroll
    # to the bottom and wait before reading the page source
    driver = webdriver.Chrome()
    driver.get(url)
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    time.sleep(3)
    html = driver.page_source
    driver.quit()
    return html

def get_products(html):
    soup = BeautifulSoup(html, 'html.parser')
    products = []
    # Each result sits in an <li class="gl-item"> block
    for li in soup.find_all('li', class_='gl-item'):
        product = {}
        product['title'] = li.find('div', class_='p-name').a.em.get_text(strip=True)
        product['price'] = li.find('div', class_='p-price').i.get_text()
        product['comments'] = li.find('div', class_='p-commit').a.get_text()
        # Self-operated listings may lack a shop link, so guard the lookup
        shop = li.find('div', class_='p-shop').find('a')
        product['shop'] = shop.get_text() if shop else ''
        products.append(product)
    return products

if __name__ == '__main__':
    url = 'https://search.jd.com/Search?keyword=python&enc=utf-8&wq=python&pvid=7d7d7d7d7d7d4d7e9d7d7d7d7d7d4d7e9'
    html = get_html(url)
    products = get_products(html)
    for product in products:
        print(product)
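
To reuse the scraped items you will usually want to persist them. A minimal sketch that writes the JD results to a CSV file with the standard library (utf-8-sig keeps the Chinese text readable when the file is opened in Excel):

import csv

def save_products(products, path='jd_products.csv'):
    # One row per product, with the dict keys as the header row
    if not products:
        return
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.DictWriter(f, fieldnames=list(products[0].keys()))
        writer.writeheader()
        writer.writerows(products)

save_products(products)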

Those are the 7 small Python scraping cases covered in this article; each provides complete, commented source code that readers can modify and extend to fit their own needs. All of these sites revise their markup and anti-scraping measures over time, so treat the selectors and patterns here as starting points rather than guarantees.