Python爬取豆瓣电影top250排行榜
Python爬取豆瓣电影top250排行榜示例代码,用的parsel和re两个模块,代码如下:
import csv
import re

import parsel
import requests
# Scrape the Douban Top 250 movie chart and store one CSV row per movie.
#
# Each movie is parsed from its own <li> element, so a movie that lacks an
# optional field (for example the one-line tagline) yields an empty cell
# instead of shifting every following row or raising IndexError. The output
# file is opened once for the whole run rather than once per row.

OUTPUT_PATH = "豆瓣top250.csv"
PAGE_COUNT = 10          # number of chart pages requested
PAGE_SIZE = 25           # step of the ``start`` query parameter between pages
REQUEST_TIMEOUT = 10     # seconds; keeps a stalled connection from hanging the run

CSV_HEADER = ['片名', '类型', '评价人数', "上映时间", '导演_演员', '国家', '英文名', '简介']

# Mind the capitalisation of the header names.
# NOTE(review): the Cookie below looks like a logged-in browser session
# (it carries a ``dbcl2`` value). Hard-coding it in source is a credential
# leak; consider loading it from the environment instead.
headers = {
    'Cookie': 'll="118192"; bid=SxMSLUjm454; __utma=30149280.231185692.1663748575.1663748575.1663748575.1; __utmc=30149280; __utmz=30149280.1663748575.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmt=1; _pk_ref.100001.4cf6=["","",1663748581,"https://www.douban.com/"]; _pk_ses.100001.4cf6=*; __utmc=223695111; ap_v=0,6.0; __gads=ID=614eff214af342d2-221efc6e45d700a2:T=1663748581:RT=1663748581:S=ALNI_MY0JTwsKMOM9E6Uz_e8b88JW-wE9g; __gpi=UID=000009d31ea23134:T=1663748581:RT=1663748581:S=ALNI_MZ4KfeKbaWWs0Aeu0t5jqh2RD0IsA; Hm_lvt_16a14f3002af32bf3a75dfe352478639=1663748600; Hm_lpvt_16a14f3002af32bf3a75dfe352478639=1663748600; _vwo_uuid_v2=D913112FAEA958ABF7FCF7279209CA382|d77575243d8ec24302f391aa8e5672ff; __utma=223695111.399175606.1663748581.1663748581.1663748653.2; __utmb=223695111.0.10.1663748653; __utmz=223695111.1663748653.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmb=30149280.2.10.1663748575; dbcl2="262975336:xPwIu1zP+eU"; ck=AvuC; push_noty_num=0; push_doumail_num=0; _pk_id.100001.4cf6=28fbd6e9a044e121.1663748581.1.1663748823.1663748581.',
    'Referer': 'https://www.baidu.com/link?url=P6mLfMtLSzXHxZYitwSc9UDnuTlARc-CJk-15rb3SfSKZlZQcjj-36ER1uqKcs1bl0s-eI6n1Onsaydsdu9zc_&wd=&eqid=b5b5e8680002085b00000005632acac7',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
}


def split_info(info):
    """Split a "year / country / genre" string into a 3-tuple.

    The split is taken from the right, so a year field that itself contains
    "/" stays in one piece. Surrounding whitespace (including non-breaking
    spaces) is stripped from every part, and missing trailing parts are
    returned as empty strings.

    Args:
        info: Raw text such as "1994 / 美国 / 犯罪 剧情".

    Returns:
        A ``(year, country, genre)`` tuple of strings.
    """
    parts = [part.strip() for part in info.strip().rsplit('/', 2)]
    parts += [''] * (3 - len(parts))
    return tuple(parts)


def parse_movies(html):
    """Extract one CSV row per movie from the HTML of a chart page.

    Args:
        html: Text of one chart page.

    Returns:
        A list of rows, each ordered like ``CSV_HEADER``.
    """
    selector = parsel.Selector(html)
    rows = []
    for item in selector.xpath('//*[@id="content"]/div/div[1]/ol/li'):

        def text_of(query, node=item):
            # An absent node becomes '' so one missing field cannot break the row.
            return node.xpath(query).get(default='').strip()

        title = text_of('div/div[2]/div[1]/a/span[1]/text()')
        english_name = text_of('div/div[2]/div[1]/a/span[2]/text()')
        # The first text node of p[1] is read as the director/actor line and
        # the second as the year/country/genre line.
        director = text_of('div/div[2]/div[2]/p[1]/text()[1]')
        year, country, genre = split_info(text_of('div/div[2]/div[2]/p[1]/text()[2]'))
        judge_num = text_of('div/div[2]/div[2]/div/span[4]/text()')
        introduction = text_of('div/div[2]/div[2]/p[2]/span/text()')
        rows.append([title, genre, judge_num, year, director, country,
                     english_name, introduction])
    return rows


def main():
    """Download every chart page and write all movies to ``OUTPUT_PATH``."""
    with open(OUTPUT_PATH, mode="w", encoding="utf_8_sig", newline='') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(CSV_HEADER)
        for page in range(PAGE_COUNT):
            url = f'https://movie.douban.com/top250?start={PAGE_SIZE * page}&filter='
            response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
            # Fail on an HTTP error instead of writing rows parsed from an error page.
            response.raise_for_status()
            csv_writer.writerows(parse_movies(response.text))


if __name__ == "__main__":
    main()
免责声明,若由于商用引起版权纠纷,一切责任均由使用者承担。
1. 本站所有资源来源于用户上传和网络,如有侵权请邮件联系站长!
2. 分享目的仅供大家学习和交流,您必须在下载后24小时内删除!
3. 不得使用于非法商业用途,不得违反国家法律。否则后果自负!
4. 本站提供的源码、模板、插件等等其他资源,都不包含技术服务,请大家谅解!
5. 如有链接无法下载、失效或广告,请联系管理员处理!
6. 本站资源售价只是赞助,收取费用仅维持本站的日常运营所需!
7. 如遇到加密压缩包,默认解压密码为"www.ziyuangou.cc",如遇到无法解压的请联系管理员!
资源狗 » Python爬取豆瓣电影top250排行榜