7 Small Python Web Scraper Examples Explained (with Source Code), Part 1
This article walks through 7 small Python web scraper examples: Douban movies, Qiushibaike jokes, Baidu Tieba posts, Zhihu questions and answers, Weibo posts, Taobao products, and JD.com products. Each example comes with complete, commented source code that readers can study and adapt.
1. Scraping Douban Movies
This example scrapes the Douban Top 250 movie list and collects each movie's title, rating, director, leading actors, and one-line blurb. It uses the requests library to send HTTP requests, BeautifulSoup to parse the HTML, and the re module to pull the director and actors out of the text with regular expressions.
import requests
from bs4 import BeautifulSoup
import re

def get_html(url):
    # Fetch the page with a browser-like User-Agent so the request is not rejected
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    return response.text

def get_movies(html):
    # Parse the list page; the selectors below depend on the page markup at the
    # time of writing and may need adjusting if Douban changes its HTML
    soup = BeautifulSoup(html, 'html.parser')
    movies = []
    for li in soup.find_all('li', class_='clearfix'):
        movie = {}
        hd = li.find('div', class_='hd')
        movie['title'] = hd.find('span', class_='title').string
        movie['score'] = li.find('span', class_='rating_num').string
        # Director and actors share one <p> inside div.bd; extract them with regex
        bd_text = li.find('div', class_='bd').p.get_text()
        movie['director'] = re.findall(r'导演: (.*?)\n', bd_text)[0]
        movie['actors'] = re.findall(r'主演: (.*?)\n', bd_text)[0]
        movie['desc'] = li.find('span', class_='inq').string
        movies.append(movie)
    return movies

if __name__ == '__main__':
    url = 'https://movie.douban.com/top250'
    html = get_html(url)
    movies = get_movies(html)
    for movie in movies:
        print(movie)
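The script above only fetches the first page, which lists 25 movies. Below is a minimal sketch of paging through the whole Top 250 by reusing get_html and get_movies; the start=0,25,...,225 query parameter and the 25-items-per-page layout are assumptions based on Douban's usual URL scheme.

import time

def get_all_movies():
    # Sketch only: walk the 10 list pages; adjust the step if the page layout differs
    all_movies = []
    for start in range(0, 250, 25):
        page_url = f'https://movie.douban.com/top250?start={start}'
        all_movies.extend(get_movies(get_html(page_url)))
        time.sleep(1)  # be polite: pause between requests
    return all_movies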
2. Scraping Qiushibaike Jokes
This example scrapes text jokes from Qiushibaike, extracting each joke's content and upvote count. It uses requests to fetch the page and a regular expression to pull both fields out of the raw HTML.
import requests
import re

def get_html(url):
    # Fetch the page with a browser-like User-Agent
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    return response.text

def get_jokes(html):
    # Each joke's text sits in div.content > span and its upvote count in div.stats;
    # re.S lets '.' match newlines so one pattern can span both blocks
    pattern = re.compile(r'<div class="content">\n<span>(.*?)</span>\n</div>.*?<div class="stats">\n<span class="stats-vote"><i class="number">(.*?)</i>', re.S)
    jokes = re.findall(pattern, html)
    return jokes

if __name__ == '__main__':
    url = 'https://www.qiushibaike.com/text/'
    html = get_html(url)
    jokes = get_jokes(html)
    for joke in jokes:
        print(joke[0], joke[1])
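The same get_html helper appears in several of these examples. A slightly more defensive variant, offered here as a sketch rather than part of the original case, adds a request timeout and surfaces HTTP errors instead of silently returning an error page:

import requests

def get_html(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    # timeout avoids hanging forever; raise_for_status turns 4xx/5xx responses into exceptions
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    response.encoding = 'utf-8'
    return response.text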
3. Scraping Baidu Tieba
This example scrapes a Baidu Tieba forum list and extracts each thread's title, author, and reply count. It uses requests to fetch the page and a regular expression to pull the fields out of the HTML.
import requests
import re

def get_html(url):
    # Fetch the page with a browser-like User-Agent
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    return response.text

def get_posts(html):
    # Capture the thread id, title (attribute and link text), author block, and reply count;
    # re.S lets '.' span the newlines between tags
    pattern = re.compile(r'<a rel="noreferrer" href="/p/(.*?)" title="(.*?)" target="_blank" class="j_th_tit ">(.*?)</a>.*?<span class="frs-author-name-wrap">(.*?)</span>.*?<span class="threadlist_rep_num center_text">(.*?)</span>', re.S)
    posts = re.findall(pattern, html)
    return posts

if __name__ == '__main__':
    url = 'https://tieba.baidu.com/f?kw=python&ie=utf-8&pn=0'
    html = get_html(url)
    posts = get_posts(html)
    for post in posts:
        # post = (thread id, title, link text, author block, reply count)
        print(post[1], post[3], post[4])
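The URL above only covers the first page of the thread list. Below is a small sketch of walking several pages by reusing get_html and get_posts; the assumption that the pn parameter advances by 50 per page is based on how Tieba list URLs commonly look and may need adjusting.

def get_many_posts(keyword, pages=3):
    # Sketch only: crawl the first few list pages of one forum
    all_posts = []
    for page in range(pages):
        page_url = f'https://tieba.baidu.com/f?kw={keyword}&ie=utf-8&pn={page * 50}'
        all_posts.extend(get_posts(get_html(page_url)))
    return all_posts

# Example: get_many_posts('python') gathers roughly the first three pages of threads.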
4. Scraping Zhihu
This example scrapes a Zhihu question page and extracts the question title, the question description, and each answer's text and author. It uses requests to fetch the page and BeautifulSoup to parse the HTML.
import requests
from bs4 import BeautifulSoup

def get_html(url):
    # Fetch the page with a browser-like User-Agent
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    return response.text

def get_question(html):
    # The question title and description live in fixed header elements
    soup = BeautifulSoup(html, 'html.parser')
    question = {}
    question['title'] = soup.find('h1', class_='QuestionHeader-title').string
    question['desc'] = soup.find('div', class_='QuestionHeader-detail').string
    return question

def get_answers(html):
    # Each answer is a div.List-item containing the answer body and the author link
    soup = BeautifulSoup(html, 'html.parser')
    answers = []
    for div in soup.find_all('div', class_='List-item'):
        answer = {}
        answer['content'] = div.find('div', class_='RichContent-inner').get_text()
        answer['author'] = div.find('span', class_='UserLink AuthorInfo-name').string
        answers.append(answer)
    return answers

if __name__ == '__main__':
    url = 'https://www.zhihu.com/question/37787176'
    html = get_html(url)
    question = get_question(html)
    answers = get_answers(html)
    print(question)
    for answer in answers:
        print(answer)
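Zhihu loads most answers with JavaScript and often expects a logged-in session, so a plain requests call may return only a shell page with few or no List-item blocks. A hedged sketch of reusing the selenium approach from the next example (Weibo), rendering the page in a real browser before handing it to get_question and get_answers, could look like this:

from selenium import webdriver
import time

def get_rendered_html(url):
    # Sketch only: open the question in Chrome so dynamically loaded answers
    # end up in page_source; the 3-second wait is an arbitrary guess
    driver = webdriver.Chrome()
    driver.get(url)
    time.sleep(3)
    html = driver.page_source
    driver.quit()
    return html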
5. Scraping Weibo
This example scrapes a Weibo user's profile page, extracting the user's nickname and bio along with each post's text, repost count, and comment count. Because Weibo renders its pages with JavaScript, the scraper drives a real browser with selenium, parses the rendered HTML with BeautifulSoup, and pulls the counts out with regular expressions.
from selenium import webdriver
from bs4 import BeautifulSoup
import re

def get_html(url):
    # Open the page in a real Chrome browser so JavaScript-rendered content is included
    driver = webdriver.Chrome()
    driver.get(url)
    html = driver.page_source
    driver.quit()
    return html

def get_user(html):
    # Nickname and bio come from the profile header
    soup = BeautifulSoup(html, 'html.parser')
    user = {}
    user['name'] = soup.find('h1', class_='username').string
    user['desc'] = soup.find('div', class_='pf_intro').string
    return user

def get_weibos(html):
    # Each post sits in a div.WB_detail; the repost and comment counts are
    # pulled out of the raw block with regular expressions
    soup = BeautifulSoup(html, 'html.parser')
    weibos = []
    for div in soup.find_all('div', class_='WB_detail'):
        weibo = {}
        weibo['content'] = div.find('div', class_='WB_text W_f14').get_text()
        weibo['reposts'] = re.findall(r'转发\[(.*?)\]', str(div))[0]
        weibo['comments'] = re.findall(r'评论\[(.*?)\]', str(div))[0]
        weibos.append(weibo)
    return weibos

if __name__ == '__main__':
    url = 'https://weibo.com/u/2830678474'
    html = get_html(url)
    user = get_user(html)
    weibos = get_weibos(html)
    print(user)
    for weibo in weibos:
        print(weibo)
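Weibo only renders the first screen of posts up front; more appear as the page scrolls. Here is a minimal sketch that scrolls the profile a few times before capturing page_source; it is a variant of get_html above, and the scroll count and delay are arbitrary guesses.

from selenium import webdriver
import time

def get_html_scrolled(url, scrolls=3):
    # Sketch only: scroll so more posts are rendered before the HTML is captured
    driver = webdriver.Chrome()
    driver.get(url)
    for _ in range(scrolls):
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        time.sleep(2)  # give newly loaded posts time to render
    html = driver.page_source
    driver.quit()
    return html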
6. Scraping Taobao Products
This example scrapes Taobao search results, extracting each product's title, price, monthly sales, and shop name. It drives Chrome with selenium to get the fully rendered page, parses it with BeautifulSoup, and extracts the sales figure with a regular expression.
from selenium import webdriver
from bs4 import BeautifulSoup
import re

def get_html(url):
    # Open the search page in a real Chrome browser so JavaScript-rendered results are included
    driver = webdriver.Chrome()
    driver.get(url)
    html = driver.page_source
    driver.quit()
    return html

def get_products(html):
    # Each result is a div.item; title, price, and shop come from fixed sub-elements,
    # while the monthly sales figure is pulled out with a regular expression
    soup = BeautifulSoup(html, 'html.parser')
    products = []
    for div in soup.find_all('div', class_='item J_MouserOnverReq'):
        product = {}
        product['title'] = div.find('a', class_='J_ClickStat').get_text().strip()
        product['price'] = div.find('div', class_='price g_price g_price-highlight').strong.string
        product['sales'] = re.findall(r'月销量:(.*?)笔', str(div))[0]
        product['shop'] = div.find('a', class_='shopname J_MouseEneterLeave J_ShopInfo').string
        products.append(product)
    return products

if __name__ == '__main__':
    url = 'https://s.taobao.com/search?q=python&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20211022&ie=utf8'
    html = get_html(url)
    products = get_products(html)
    for product in products:
        print(product)
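Taobao renders its search results with JavaScript (and may redirect unauthenticated visitors to a login page), so reading page_source immediately after driver.get can return an incomplete page. Below is a sketch using selenium's explicit waits, with the same div.item element the parser above looks for serving as the readiness marker:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def get_html_waited(url):
    # Sketch only: wait up to 10 seconds for at least one result item to appear
    driver = webdriver.Chrome()
    driver.get(url)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'div.item.J_MouserOnverReq')))
    html = driver.page_source
    driver.quit()
    return html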
7. Scraping JD.com Products
This example scrapes JD.com search results, extracting each product's title, price, comment count, and shop name. It drives Chrome with selenium to get the fully rendered page and parses it with BeautifulSoup.
from selenium import webdriver
from bs4 import BeautifulSoup

def get_html(url):
    # Open the search page in a real Chrome browser so JavaScript-rendered results are included
    driver = webdriver.Chrome()
    driver.get(url)
    html = driver.page_source
    driver.quit()
    return html

def get_products(html):
    # Each result is an li.gl-item; the fields live in fixed p-name/p-price/p-commit/p-shop blocks
    soup = BeautifulSoup(html, 'html.parser')
    products = []
    for li in soup.find_all('li', class_='gl-item'):
        product = {}
        product['title'] = li.find('div', class_='p-name').a.em.get_text()
        product['price'] = li.find('div', class_='p-price').i.get_text()
        product['comments'] = li.find('div', class_='p-commit').a.get_text()
        product['shop'] = li.find('div', class_='p-shop').span.a.get_text()
        products.append(product)
    return products

if __name__ == '__main__':
    url = 'https://search.jd.com/Search?keyword=python&enc=utf-8&wq=python&pvid=7d7d7d7d7d7d4d7e9d7d7d7d7d7d4d7e9'
    html = get_html(url)
    products = get_products(html)
    for product in products:
        print(product)
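Printing dictionaries is fine for a quick check, but the results are easier to reuse if they are written to a file. Here is a short sketch, using only the standard library, that saves the product list produced by get_products() to a CSV file:

import csv

def save_to_csv(products, filename='products.csv'):
    # Sketch only: write the list of product dicts to a CSV file
    if not products:
        return
    with open(filename, 'w', newline='', encoding='utf-8-sig') as f:
        # utf-8-sig adds a BOM so spreadsheet software detects the encoding
        writer = csv.DictWriter(f, fieldnames=list(products[0].keys()))
        writer.writeheader()   # column names: title, price, comments, shop
        writer.writerows(products)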
Those are the 7 small Python scraper examples covered in this article. Each comes with complete, commented source code, and readers are free to modify and extend it to fit their own needs.