是python鸭

多线程爬取豆瓣前100电影信息

1.代码中的Cookie需要经常更新,更新为登录之后浏览器里的Cookie(注意:代码实际请求的是猫眼 maoyan.com 的榜单页,而不是豆瓣电影,所以要使用该站点登录后的Cookie)。

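2.开启了多线程后数据顺序是乱的,保存到excel里也失败,关闭多线程会好用。原因是代码里用的其实是multiprocessing进程池(多进程,不是多线程):每个子进程都有自己的一份sheet和n,子进程里写入的内容不会回到主进程最终保存的book里,而且各页完成的先后顺序也不固定。(这个问题我会研究下继续更博)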

如果代码仍然无法成功运行,那估计是网页结构变了,需要修改正则匹配部分。


import requests
import re
import xlwt
from requests.exceptions import RequestException
import multiprocessing

# Module-level workbook shared by the rest of the script.
book = xlwt.Workbook(encoding='utf-8', style_compression=0)

# Single worksheet; cell_overwrite_ok lets a cell be written more than once.
sheet = book.add_sheet('豆瓣电影Top100', cell_overwrite_ok=True)

# Row 0 holds the column headings, one per output column.
for _column, _heading in enumerate(('序号', '电影名', '图片地址', '主演', '上映时间', '评分')):
    sheet.write(0, _column, _heading)

# Index of the next worksheet row to fill; advanced by get_show().
n = 1

# Compiled once at import time instead of on every call. A raw string keeps
# ``\d`` from being treated as an (invalid) string escape sequence.
_BOARD_ITEM_RE = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>.*?title="(.*?)".*?src="//(.*?)".*?<p.*?star">(.*?)</p>.*?<p.*?releasetime">上映时间:(.*?)</p>.*?<i.*?integer">(.*?)</i>.*?<i.*?fraction">(.*?)</i>.*?</dd>',
    re.S)

def get_parse(url, timeout=10):
    """Download one board page and extract its movie entries.

    Args:
        url: Address of the board page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of 7-tuples of strings, one per ``<dd>`` entry matched:
        (rank, title, image URL without the leading ``//``, star line,
        release date text, integer part of the score, fractional part of
        the score). An empty list is returned when the request fails, the
        server answers with an HTTP error status, or nothing matches.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        'Cookie':'__mta=212238251.1598608211430.1599033592248.1599033594317.35; uuid_n_v=v1; _lxsdk_cuid=174347970aac8-02d5519f2a3dea-f7b1332-100200-174347970abc8; mojo-uuid=c5d51fb6c7d8d91a6739c4680e628ab0; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; uuid=DBA03500ECCA11EA928CD948005EDA81242C50EC589744629F480B6934EB1563; _csrf=687c15bc392ba052d52ffed0669773bab928729ae6b2468cd3c3019f46e3705e; lt=z8OmERuEZqx63yskOTxu3lrAtGQAAAAAcwsAAAxhkSqReQ7bVaMpyVVvNRxM5L_H-GahvEX4hAqIvBstd70CVWT_dMA3Y0Cpy2DOog; lt.sig=S2ZmG-40NsiZ5kCVK5_rTPGvlh0; _lxsdk=DBA03500ECCA11EA928CD948005EDA81242C50EC589744629F480B6934EB1563; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1598608242,1599015943,1599016660,1599016668; __mta=212238251.1598608211430.1599027951561.1599028011534.26; mojo-session-id={"id":"f05b9da5f8cd57e160f4869a1c4de25a","time":1599033592065}; mojo-trace-id=2; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1599033594; _lxsdk_s=1744dd44763-fa4-f96-e9f%7C991164006%7C5'
    }
    try:
        # Without a timeout a stalled connection would block this call forever.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except RequestException as exc:
        # One unreachable page should not abort the remaining pages; report
        # it and hand back "no entries" so callers can carry on.
        print('request failed:', url, exc)
        return []

    return _BOARD_ITEM_RE.findall(response.text)

def get_show(results):
    """Print every parsed movie and append it to the module-level ``sheet``.

    Each item of ``results`` is read at indices 0-6. One worksheet row is
    written per item at row ``n``, and the global counter ``n`` is advanced
    afterwards so the next item lands on the following row.
    """
    global n

    for movie in results:
        print(movie[0], '\n', '电影名:', movie[1], '\n', '图片地址:', movie[2], movie[3], movie[4], '\n', '评分:',
              movie[5], movie[6])

        cells = (
            movie[0],
            movie[1],
            movie[2],
            movie[3].replace(" ", ""),  # drop the spaces inside the star line
            movie[4],
            movie[5] + movie[6],        # join integer and fractional score parts
        )
        for column, value in enumerate(cells):
            sheet.write(n, column, value)

        n += 1

def main(page):
    """Fetch, print and record one page of the board.

    ``page`` is the zero-based page index; it is multiplied by 10 to form
    the ``offset`` query parameter of the page URL.
    """
    offset = page * 10
    url = 'https://maoyan.com/board/4?offset=' + str(offset)
    print(url)
    get_show(get_parse(url))

if __name__ == '__main__':
    # Only downloading and parsing run in the worker processes. Every worker
    # has its own copy of the module-level ``sheet`` and ``n``, so rows
    # written there never reach the workbook that this (parent) process
    # saves. The workers therefore just return the parsed rows, and the
    # parent writes them.
    urls = ['https://maoyan.com/board/4?offset=' + str(page * 10) for page in range(10)]
    for url in urls:
        print(url)

    cores = multiprocessing.cpu_count()
    # The context manager shuts the pool down once the blocking map() returns.
    with multiprocessing.Pool(processes=cores) as pool:
        # map() returns results in the order of ``urls``, so the pages stay
        # in rank order no matter which worker finishes first.
        pages = pool.map(get_parse, urls)

    for results in pages:
        get_show(results)

    # Saving stays inside the guard so that worker processes which re-import
    # this module do not each write the file as well. xlwt produces the
    # legacy .xls format, so the file name uses that extension.
    book.save(u'豆瓣最受欢迎的100部电影.xls')
多线程爬取豆瓣前100电影信息已关闭评论