抖音个人主页视频批量下载

磊落不羁 by:磊落不羁 分类:爬虫 时间:2年前 阅读:110 评论:0

个人研究所用 


'''
本程序使用selenium编写  本程序在pycharm中运行是没有问题的
整个程序完整 采用多进程下载技术 所以下载速度也相对较快
主要用途 下载抖音个人主页中所有视频
问题:使用pyinstaller进行打包操作后运行出现障碍,  问题在哪里不清楚

经验  selenium中获取元素 必须使用By引用 原来的获取方式似乎失效因为没有查询到相关资料
     所以对于selenium的技术发展还是要多多关注
     By。刘磊  2022年9月


'''

from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import requests
import re
import json
from urllib.parse import unquote
import warnings
from urllib import parse
import tqdm
import os
from concurrent.futures import ProcessPoolExecutor
warnings.filterwarnings("ignore")

def _find_aweme_detail(render_data):
    """Return the ``aweme.detail`` dict found in the decoded RENDER_DATA payload."""
    # Look for the section that actually carries ``aweme.detail`` instead of
    # relying on it being the second top-level key of the JSON object.
    for section in render_data.values():
        if not isinstance(section, dict):
            continue
        aweme = section.get('aweme')
        if isinstance(aweme, dict) and isinstance(aweme.get('detail'), dict):
            return aweme['detail']
    raise KeyError('aweme.detail not found in RENDER_DATA')


def _safe_title(desc, fallback):
    """Build a file-name friendly title from a video description."""
    # Drop characters that are illegal in Windows file names, plus control
    # characters (a line break inside the description would break open()).
    cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '', desc or '')
    # Keep only the text in front of the first hashtag.
    cleaned = cleaned.split('#')[0].strip()
    return cleaned or fallback


def getvideo(url,now,all):
    """Download the video behind one Douyin video page into ``video/``.

    The page at ``url`` is fetched, the URL-encoded JSON inside its
    ``RENDER_DATA`` script tag is decoded, and the play address and the
    description are read from it. The video is saved as
    ``video/<title>.mp4`` unless a file with that name already exists.

    Args:
        url: Address of the video page.
        now: Position of this video in the batch; only used in progress output.
        all: Total number of videos in the batch; only used in progress
            output. (The name shadows the builtin but is part of the
            keyword interface used by the caller.)

    Returns:
        None. Any failure is printed rather than raised, so one broken video
        does not abort the rest of the batch.
    """
    try:
        # 1. Fetch the page and extract the JSON embedded in the script tag.
        warnings.filterwarnings("ignore")

        # NOTE(review): this default is a hard-coded session cookie captured
        # in 2022; it is kept for backward compatibility but should not live
        # in source. Set the DOUYIN_COOKIE environment variable to override it.
        default_cookie = (
            'douyin.com; __ac_nonce=06317e37700308002c952; __ac_signature=_02B4Z6wo00f01onlKrQAAIDDKuRlcbJJXraJxS4AAMF6b6; ttwid=1%7CpR53EihdTN2mlWXH16pjjz6o-HWccVIUgOYTAuaXWus%7C1662509944%7C312579cf34d39b11e4f9c7b50ded8602298923556584b9a2c692560d8c52360e; douyin.com; strategyABtestKey=1662509944.784; s_v_web_id=verify_l7qvhvjy_8Ye3fupH_VbNA_4Kx6_ARuH_UcWzPBA2zLmZ; passport_csrf_token=c060c99d6d1106ebe36a14b73808efe9; passport_csrf_token_default=c060c99d6d1106ebe36a14b73808efe9; ttcid=b581befcd57b4302995d49d179618d1532; THEME_STAY_TIME=%22299506%22; IS_HIDE_THEME_CHANGE=%221%22; n_mh=H6gxeDoYMh6xC93fDYqruk8PV3w2EnpOwTaJ3F6h9Io; sso_uid_tt=d99e706574c9b0dca31552b084d0b4df; sso_uid_tt_ss=d99e706574c9b0dca31552b084d0b4df; toutiao_sso_user=10d9268ff9b588001f8ea05fbad87024; toutiao_sso_user_ss=10d9268ff9b588001f8ea05fbad87024; sid_ucp_sso_v1=1.0.0-KGM0MDA1MTQyY2FjMjI2OWI0NmUwZDliZWNjYjk5Njk4YjFkMGRjNjkKHQjIp-CG6gIQ98vfmAYY7zEgDDDMsNXWBTgGQPQHGgJobCIgMTBkOTI2OGZmOWI1ODgwMDFmOGVhMDVmYmFkODcwMjQ; ssid_ucp_sso_v1=1.0.0-KGM0MDA1MTQyY2FjMjI2OWI0NmUwZDliZWNjYjk5Njk4YjFkMGRjNjkKHQjIp-CG6gIQ98vfmAYY7zEgDDDMsNXWBTgGQPQHGgJobCIgMTBkOTI2OGZmOWI1ODgwMDFmOGVhMDVmYmFkODcwMjQ; odin_tt=39edd9c5d7bd41d3a8f1c9cc421c94fa8d1f4dfc358c845c32ab306f67ce2540f7dd848b652e6924f9cd2aa98b83f65a; passport_auth_status=a7282adbf496fb7332f517b356b9916b%2C; passport_auth_status_ss=a7282adbf496fb7332f517b356b9916b%2C; sid_guard=a5fd6644a9c1c0e934335fa5ae32a33d%7C1662510584%7C5183999%7CSun%2C+06-Nov-2022+00%3A29%3A43+GMT; uid_tt=fc3a93858ae728f9b40b81b79e359a7b; uid_tt_ss=fc3a93858ae728f9b40b81b79e359a7b; sid_tt=a5fd6644a9c1c0e934335fa5ae32a33d; sessionid=a5fd6644a9c1c0e934335fa5ae32a33d; sessionid_ss=a5fd6644a9c1c0e934335fa5ae32a33d; sid_ucp_v1=1.0.0-KGI3NzNlZjAwMGM3ZGEzOTQ5ZTE2MGE4NTIzZDMzMzNmZTBlZDlhMDAKFwjIp-CG6gIQ-MvfmAYY7zEgDDgGQPQHGgJobCIgYTVmZDY2NDRhOWMxYzBlOTM0MzM1ZmE1YWUzMmEzM2Q; '
            'ssid_ucp_v1=1.0.0-KGI3NzNlZjAwMGM3ZGEzOTQ5ZTE2MGE4NTIzZDMzMzNmZTBlZDlhMDAKFwjIp-CG6gIQ-MvfmAYY7zEgDDgGQPQHGgJobCIgYTVmZDY2NDRhOWMxYzBlOTM0MzM1ZmE1YWUzMmEzM2Q; FOLLOW_LIVE_POINT_INFO=%22MS4wLjABAAAAAu4N8ormI9OckkfhpNG_osIz0rCMHUj-RsEOhccuXRo%2F1662566400000%2F0%2F1662510586124%2F0%22; download_guide=%223%2F20220907%22; msToken=_tWWvRPka3SKDzelOtOrT6fWorU-evZmNcPM8TJK_KnIwDo0oLcMas20L1CL1yiApl4-n_5upPefwPIuhr5QDF0kEegxExV_VOE9cq8Uyxmx3j9NRpR6Jg==; msToken=6sJ1N_4jhMZpK4BhOaU9sQKDBHSPgOABdlIYR88QiPRyZhbGJGwAQcXjpSuQJWEYjM9EOfw5CVMcGpeeLeG7Y3AhhWvgmwH5-UtG-WcdjNwR702X5fGBYw==; home_can_add_dy_2_desktop=%221%22; tt_scid=qbnU1AuyFbk919tiQLcFYmBQwcGRzHyH0P7EoAgzXiXX2eT-9VYFvM1BGm4KtlXreb50'
        )
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36',
            'referer': url,
            'cookie': os.environ.get('DOUYIN_COOKIE', default_cookie),
        }
        # NOTE(review): verify=False disables TLS certificate checks; kept
        # because the original relied on it, but it should be reconsidered.
        page = requests.get(url=url, headers=headers, verify=False, timeout=30).text

        match = re.search(r'<script id="RENDER_DATA" type="application/json">(.*?)</script><script>', page)
        if match is None:
            raise ValueError('RENDER_DATA script tag not found in page')

        # The payload is URL-encoded JSON.
        detail = _find_aweme_detail(json.loads(unquote(match.group(1), 'utf-8')))
        address = "https:" + detail['video']['playAddr'][0]['src']  # play address

        # Fall back to the last URL path segment when the description yields
        # an empty title, so the file is never just ".mp4".
        video_id = url.rstrip('/').rsplit('/', 1)[-1].split('?')[0]
        title = _safe_title(detail.get('desc'), _safe_title(video_id, 'video'))
        print('【'+title+'】'+'视频解析完成!开始下载...')

        # 2. Download the video into the "video" folder.
        filepath = 'video'
        # exist_ok avoids a crash when several worker processes create the
        # folder at the same time.
        os.makedirs(filepath, exist_ok=True)
        filename = filepath+"/"+title+".mp4"
        if os.path.exists(filename):
            print(f"{now}/{all}----视频已存在,跳过!")
            return

        # Stream into a per-process temporary file and rename at the end, so
        # an interrupted download never leaves a truncated .mp4 that a later
        # run would skip as "already exists".
        partial = f"{filename}.{os.getpid()}.part"
        try:
            with requests.get(url=address, headers=headers, verify=False,
                              stream=True, timeout=30) as response:
                response.raise_for_status()  # do not save an error page as .mp4
                with open(partial, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)
            os.replace(partial, filename)
        finally:
            # Only still present when the download failed part-way.
            if os.path.exists(partial):
                os.remove(partial)
        print(f"{now}/{all}----{title}.mp4 下载完成")
    except Exception as e:
        # Worker boundary: exceptions raised inside a pool task would otherwise
        # be stored on the unused future and never be seen.
        print(f"{now}/{all}----{url} 下载失败: {e!r}")

#这段代码 将浏览器拉到最下端
def scroll_to_bottom():
    """Scroll the page in the global ``web`` driver down until it stops growing.

    The page is scrolled in 100-pixel steps, pausing 0.2 s after each step,
    from the last known height up to the current ``document.body.scrollHeight``.
    After each pass the height is read again (following a 0.1 s pause); the
    function returns once the height is no longer larger than the height
    already scrolled to.
    """
    height_query = "return action=document.body.scrollHeight"

    def page_height():
        # Ask the browser for the current total height of the document body.
        return web.execute_script(height_query)

    scrolled = 0  # height that has already been scrolled through
    target = page_height()

    while target > scrolled:
        # Walk down gradually rather than jumping straight to the bottom.
        for offset in range(scrolled, target, 100):
            web.execute_script('window.scrollTo(0, {})'.format(offset))
            time.sleep(0.2)
        scrolled = target
        time.sleep(0.1)
        # Re-measure: the page may have grown while scrolling.
        target = page_height()

if __name__=="__main__":
    # Script entry point: ask for a Douyin profile URL, collect every video
    # link on that page with a headless Chrome, then download the videos in
    # parallel worker processes.
    import multiprocessing
    from selenium.common.exceptions import WebDriverException

    # Must run first in a frozen (e.g. PyInstaller) Windows executable:
    # without it every worker process re-executes this script body instead of
    # running its task.
    multiprocessing.freeze_support()

    print("本程序用途为下载个人抖音主页所有视频,采用selenium技术\n 网址的样式:https://www.douyin.com/user/MS4wLjABAAAA8wLRtkaj3dA4zkC6mlHWo3CKHtilK7LaI8DjnyYzwX9LPvB8_1p2RSDdBOlB_T4B\n 请严格依照具体网址格式使用。\n程序作者:刘磊 QQ247483085\n=================================================================================================================================")
    url = input("输入抖音个人主页地址:").strip()
    chrome_option = webdriver.ChromeOptions()
    chrome_option.add_argument('headless')  # run Chrome without a window
    warnings.filterwarnings("ignore")
    # ``web`` has to stay a module-level name: scroll_to_bottom() reads it as
    # a global.
    web = webdriver.Chrome(options=chrome_option)
    try:
        web.get(url=url)

        print('程序启动打开网页中..需要时间 请耐心等待..')
        scroll_to_bottom()
        print("启动视频地址统计程序..")

        # Collect the first link of every <li> whose second-to-last path
        # segment is "video"; duplicates are dropped so the same file is not
        # downloaded by two workers at once.
        lists = []
        seen = set()
        for li in web.find_elements(By.CSS_SELECTOR,'li'):
            try:
                adr = li.find_element(By.CSS_SELECTOR,'a').get_attribute('href')
            except WebDriverException:
                # <li> without a link, or an element that went stale.
                continue
            parts = adr.split('/') if adr else []
            if len(parts) < 2 or parts[-2] != "video":
                continue
            if adr in seen:
                continue
            seen.add(adr)
            lists.append(adr)
        print("共统计视频地址"+str(len(lists))+"个")

        print("启动视频下载程序.. ")
        total = str(len(lists))
        # Leaving the ``with`` block waits for all submitted downloads.
        with ProcessPoolExecutor(20) as pool:
            for index, video_url in enumerate(lists, start=1):
                pool.submit(getvideo, url=video_url, now=index, all=total)

        print("视频下载完毕,启动浏览器关闭程序")
    finally:
        # Always shut the browser down, even when something above failed.
        web.quit()


非特殊说明,本文版权归原作者所有,转载请注明出处

本文地址:http://php.liulei.com.cn/?type=acticle&id=46

评论列表

发表评论

  • 昵称(必填)
  • 邮箱
  • 网址

TOP