1、requests库
pip install requests
Successfully installed certifi-2022.12.7 charset-normalizer-3.1.0 idna-3.4 requests-2.29.0 urllib3-1.26.15
2、伪装浏览器
import requests
# Pretend to be a browser by sending a browser-like User-Agent header.
# NOTE(review): the User-Agent value below is abbreviated ("....");
# replace it with a complete browser User-Agent string before running.
headers = {
    "User-Agent": "Mozilla/5.0 (Linux; Andro...."
}
# Pass an explicit timeout so a stalled connection cannot hang the script.
response = requests.get(
    "https://movie.douban.com/top250",
    headers=headers,
    timeout=10,
)
# Raise on a 4xx/5xx answer instead of printing an error page as if it were
# the real content.
response.raise_for_status()
print(response.text)
3、bs4库
pip install bs4
Successfully installed beautifulsoup4-4.12.2 bs4-0.0.1 soupsieve-2.4.1
import requests
from bs4 import BeautifulSoup
# Pretend to be a browser by sending a browser-like User-Agent header.
# NOTE(review): the User-Agent value below is abbreviated ("...");
# replace it with a complete browser User-Agent string before running.
headers = {
    "User-Agent": "Mozilla/5.0 (Linux; Andro..."
}
# Automatic paging: request the list with start=0, 25, ..., 225
# (presumably 25 entries per page -- confirm against the site).
for start_num in range(0, 250, 25):
    # Pass an explicit timeout so a stalled connection cannot hang the loop.
    response = requests.get(
        f"https://movie.douban.com/top250?start={start_num}",
        headers=headers,
        timeout=10,
    )
    # Stop on a 4xx/5xx answer rather than parsing an error page, which
    # would silently produce no titles.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    # find_all is the current name; findAll is only kept as a legacy alias.
    for title in soup.find_all("span", attrs={"class": "title"}):
        title_string = title.string
        # .string is None when the tag does not wrap exactly one text node;
        # skip such tags instead of failing on the "in" test below.
        if title_string is None:
            continue
        # Spans whose text contains "/" carry the original-language title;
        # print only the others.
        if "/" not in title_string:
            print(title_string)