需求以及成果

此代码源自同事在看极**院(jkxy)的教学视频:我把账号要了过来看了一下,发现网站居然提供了视频下载功能,点击下载后走的是 ajax 请求,由接口返回下载地址。所以我结合之前了解的一些 node 知识,利用原生 node(fs、path)加上第三方模块 axios、request、cheerio 构建了这个爬虫。当时写得比较随意,大神勿喷!

代码分析

  1. create.js 主要负责分析创建dom结构的json

    • 针对链接 http://ke.jikexueyuan.com/zhiye/web/ 进行爬取

    • 分析当前页面的 dom 结构:整体为树形结构,大纲中主要分为三层目录,再加上大纲之外的第四层视频

      第一层(文件夹)是大标题也就是对应的阶段 初级、中级、高级、资深、专家

      第二层(文件夹)是小标题 视频涉及大知识点

      第三层(文件夹)小标题中的知识点

      第四层(视频)知识点视频

    • 第四层不在大纲内,但是通过分析数据,我们可以在第三层中拿到第四层所需的 id,所以可以再用爬虫请求第三层的链接拿到第四层的视频列表。到此就生成了整个 dom 结构的 json。

  /**
  * author: zhaopeng
  * date: 2018-10-08 21:35:37 
  */
  const axios = require('axios')
  const request = require('request')
  const fs = require('fs')
  const path = require('path')
  const cheerio = require('cheerio')

  /**
   * Strip leading and trailing whitespace from a string.
   *
   * @param {string} str - Text to clean up.
   * @returns {string} `str` without surrounding whitespace.
   */
  const stringToTrim = (str) => str.trim()

  /**
   * Crawls the course outline page at `this.url` and writes it to disk as a
   * nested JSON tree: unit -> step -> course -> video titles.
   */
  class JkxyDownload {
      /**
       * @param {string} fileName - Path of the JSON file that start() writes.
       */
      constructor(fileName) {
          this.url = 'http://ke.jikexueyuan.com/zhiye/web/';
          this.fileName = fileName;
      }

      /**
       * Extract the course id from a link's `jktag` attribute value.
       *
       * The id is read from the fourth `|`-separated field, as the part after
       * the first `:` in that field.
       *
       * @param {string} tags - Raw `jktag` attribute value.
       * @returns {string|undefined} The id, or undefined when `tags` is not a
       *   string or does not contain the expected field.
       */
      getDownloadId(tags) {
          if (typeof tags !== 'string') {
              return undefined;
          }
          const idField = tags.split('|')[3];
          if (idField === undefined) {
              return undefined;
          }
          return idField.split(':')[1];
      }

      /**
       * Collect the numbered video titles listed on a course page.
       *
       * @param {string} data - HTML of the course page.
       * @returns {Promise<Array<{title: string}>>} One entry per `.lesson-box li`.
       */
      async getChildLinks(data) {
          const $ = cheerio.load(data);
          const list = [];
          $('.lesson-box li').each((index, item) => {
              // The "N." prefix means the title is never empty, so every item is kept.
              const title = `${index + 1}.${stringToTrim($(item).find('h2 a').text())}`;
              list.push({ title });
          });
          return list;
      }

      /**
       * Build the unit -> step -> course tree from the loaded outline page.
       *
       * @param {Function} $ - cheerio instance loaded with the outline HTML.
       * @returns {Promise<Array<Object>>} Units with nested `list` arrays; each
       *   course carries `title`, `id`, `href` and an empty `list`.
       */
      async createDomTree($) {
          // Remove helper markup once, before any heading text is read.
          $('.tips-help').empty();
          const tree = [];
          $('.lesson-unit').each((unitIndex, unitElement) => {
              // Trim the heading text itself so no whitespace is left after the "N." prefix.
              const unitHeading = stringToTrim($(unitElement).find('header h3').text());
              const unit = { title: `${unitIndex + 1}.${unitHeading}`, list: [] };
              tree.push(unit);
              $(unitElement).find('.lesson-step').each((stepIndex, stepElement) => {
                  const stepHeading = stringToTrim($(stepElement).find('thead th').text());
                  const step = { title: `${stepIndex + 1}.${stepHeading}`, list: [] };
                  unit.list.push(step);
                  $(stepElement).find('tbody a').each((linkIndex, linkElement) => {
                      const link = $(linkElement);
                      const title = stringToTrim(link.text());
                      const href = link.attr('href');
                      const id = this.getDownloadId(link.attr('jktag'));
                      if (!href || id === undefined) {
                          // A link without a usable href/jktag cannot be crawled or
                          // downloaded; skip it instead of failing the whole tree.
                          console.warn(`Skipping course link without href or id: ${title}`);
                          return;
                      }
                      // The href is prefixed with the scheme as-is (assumes a
                      // protocol-relative "//host/..." value — TODO confirm).
                      step.list.push({ title, id, href: `http:${href}`, list: [] });
                  });
              });
          });
          return tree;
      }

      /**
       * Fetch the outline, fill in every course's video list and write the
       * resulting tree to `this.fileName`.
       *
       * Course pages are requested one at a time. A course whose page cannot be
       * fetched or parsed keeps an empty `list` and the failure is logged.
       *
       * @returns {Promise<void>}
       */
      async start() {
          const { data } = await axios.get(this.url);
          const $ = cheerio.load(data);
          const domData = await this.createDomTree($);
          for (const unit of domData) {
              for (const step of unit.list) {
                  for (const course of step.list) {
                      try {
                          const { data: courseHtml } = await axios.get(course.href);
                          course.list = await this.getChildLinks(courseHtml);
                      } catch (error) {
                          console.warn(`Failed to load video list from ${course.href}: ${error.message}`);
                      }
                  }
              }
          }
          fs.writeFileSync(this.fileName, JSON.stringify(domData));
      }
  }

  const t = new JkxyDownload('web.json')
  // start() returns a promise: report a failed crawl and exit non-zero
  // instead of leaving the rejection unhandled.
  t.start().catch((error) => {
      console.error('Failed to build web.json:', error)
      process.exitCode = 1
  })
  2. index.js 主要负责下载视频流的功能

    • 运行完create.js创建完web.json后我们就可以进行视频的下载了

    • 遍历一下web.json 创建下载任务和目录

    • 第三层遍历的时候就可以从web.json中拿到我们的下载参数了用axios请求

      通过分析第四层,我们无法从主页大纲中直接读取视频所需要的参数,但是我们可以用第三层的课程 id(course_id)加上视频在列表中的序号(seq)去请求下载接口,拿到视频的下载地址

    • 通过cookie进行鉴权(需要在极客网站登录然后在Chrome中复制cookie参数放到请求头中)

    • 然后同步写流到已创建的目录中,下载成功则在json上标记isdownloaded = true 避免异常中断或是手动取消后又从第一个开始下载

    const axios = require('axios')
    const request = require('request')
    const fs = require('fs')
    const path = require('path')
    /**
     * Downloads every video listed in web.json into a directory tree under
     * ./video that mirrors the unit -> step -> course structure of the JSON.
     *
     * web.json is read once in the constructor and rewritten after each
     * successful download, so an interrupted run can be resumed.
     */
    class JkxyDownload {
        /**
         * @param {string} [cookie] - Cookie header value of a logged-in session.
         *   Falls back to the JKXY_COOKIE environment variable, then to the
         *   value that was previously hard-coded in the request headers.
         */
        constructor(cookie) {
            this.domData = JSON.parse(fs.readFileSync('web.json', 'utf-8'));
            // NOTE(review): the fallback is a session cookie embedded in source.
            // Credentials should not live in the code — pass your own cookie or
            // set JKXY_COOKIE.
            this.cookie = cookie || process.env.JKXY_COOKIE || "PHPSESSID=i6q5u7p52nqka7er1j1tg51kt1; gr_user_id=9e9dc1e8-92a4-4404-973d-93e5972a1090; jkxyid_v2=65084fa0-c858-4d48-8723-7578e7b2a902; _ga=GA1.2.660881958.1536390000; _gid=GA1.2.1878645958.1536390000; ca_status=0; vip_status=1; level_id=1; is_expire=0; domain=0SWkqjWgq; svip_status=0; svip_is_expire=1; MEIQIA_EXTRA_TRACK_ID=19iqmnJt58qiOSzKM8ddpobSvYE; avatar=https%3A%2F%2Fassets.jikexueyuan.com%2Fuser%2Favtar%2Fdefault.gif; MEIQIA_VISIT_ID=1A3VScAXcL9pEU1NYT4k5DVXP7k; uname=jike_1723898; uid=6143654; code=H1DPOX; authcode=abe9R%2B%2F%2ByHLGE0CWVa2O830EA4BOIP0s3LnlJiFnPI30nABG%2B1Au8HwKetFd6wA%2F28oUVhOHHlR%2BcW90n%2Foia50KSjIF2DvQkIAGQWjBuJyCsV%2B79wjICq1XE4X9eoWc; QINGCLOUDELB=e65dc5a1f59a4b9f49f021e25faee075d562b81daa2a899c46d3a1e304c7eb2b|W5ijD|W5ijC";
        }

        /**
         * Stream `url` into `filePath`, logging progress about once per second.
         *
         * @param {string} filePath - Destination file; created or truncated.
         * @param {string} url - Address to download with a GET request.
         * @returns {Promise<boolean>} Resolves to true once the file has been
         *   written and closed. Rejects on a request error, a write error or a
         *   non-2xx HTTP status, so a failed download is never reported as done.
         */
        saveFile(filePath, url) {
            return new Promise((resolve, reject) => {
                const httpStream = request({ method: 'GET', url });
                const writeStream = fs.createWriteStream(filePath);
                let failed = false;
                let totalLength = 0;
                let lastLogAt = 0;

                // Stop both streams and reject exactly once.
                const fail = (error) => {
                    if (failed) {
                        return;
                    }
                    failed = true;
                    httpStream.abort();
                    writeStream.destroy();
                    reject(error);
                };

                // Connect the readable HTTP stream to the writable file stream.
                httpStream.pipe(writeStream);
                httpStream.on('error', fail);
                writeStream.on('error', fail);

                // Fired when the HTTP response headers arrive.
                httpStream.on('response', (response) => {
                    console.log('当前文件大小: ', response.headers['content-length']/1024/1024+'M');
                    if (response.statusCode < 200 || response.statusCode >= 300) {
                        // Do not save an error page as if it were the video.
                        fail(new Error(`Download of ${url} failed with HTTP status ${response.statusCode}`));
                    }
                });
                httpStream.on('data', (chunk) => {
                    totalLength += chunk.length;
                    const now = Date.now();
                    if (now - lastLogAt > 1000) {
                        lastLogAt = now;
                        console.log('已下载:' + (totalLength / 1024 / 1024).toFixed(2) + 'M');
                    }
                });
                // Download finished: the file stream has been flushed and closed.
                writeStream.on('close', () => {
                    if (failed) {
                        return;
                    }
                    console.log(filePath + '下载完成');
                    resolve(true);
                });
            });
        }

        /**
         * Create a directory unless something already exists at that path.
         *
         * The parent directory must already exist (the directory is not created
         * recursively).
         *
         * @param {string} path - Directory to create.
         * @returns {Promise<boolean>} true when the directory was created, false
         *   when the path already existed. Rejects on any other failure.
         */
        async createDir(path) {
            try {
                fs.mkdirSync(path);
                return true;
            } catch (error) {
                if (error.code === 'EEXIST') {
                    return false;
                }
                throw error;
            }
        }

        /**
         * Download one video of a course unless it is already marked as done.
         *
         * @param {Object} course - Third-level entry of web.json (`id`, `list`).
         * @param {number} index - Zero-based position of the video in `course.list`.
         * @param {string} dir - Directory the .mp4 file is written to.
         * @returns {Promise<boolean>} true when the video was downloaded in this call.
         */
        async downloadVideo(course, index, dir) {
            const video = course.list[index];
            // Only a confirmed success is skipped; entries left at `false` by an
            // earlier failed or interrupted run are retried.
            if (video.isdownloaded === true) {
                return false;
            }
            video.isdownloaded = false;

            const pathname = path.join(dir, `${video.title}.mp4`);
            // Remove a partial file left behind by an earlier attempt.
            if (fs.existsSync(pathname)) {
                fs.unlinkSync(pathname);
            }

            const url = `http://www.jikexueyuan.com/course/video_download`;
            const { data } = await axios.get(url, {
                params: {
                    seq: index + 1, // the endpoint is called with a 1-based sequence number
                    course_id: course.id
                },
                headers: {
                    "Host": "www.jikexueyuan.com",
                    "X-Requested-With": "XMLHttpRequest",
                    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
                    "Content-Type": "application/json; charset=utf-8",
                    "Cookie": this.cookie
                }
            });
            if (Number(data.code) !== 200) {
                console.warn(`No download address for ${pathname} (code ${data.code})`);
                return false;
            }

            const { urls } = data.data;
            const result = await this.saveFile(pathname, urls);
            if (!result) {
                return false;
            }
            video.isdownloaded = true;
            // Persist progress right away so a later run resumes after this video.
            fs.writeFileSync("web.json", JSON.stringify(this.domData));
            return true;
        }

        /**
         * Create the directory tree and download all pending videos, one at a time.
         *
         * @returns {Promise<void>} Rejects with the first request, download or
         *   file-system error.
         */
        async start() {
            await this.createDir('video');
            for (const unit of this.domData) {
                const firstPath = path.resolve('video', unit.title);
                await this.createDir(firstPath);
                for (const step of unit.list) {
                    const secondPath = path.resolve(firstPath, step.title);
                    // Must finish before the course directory below is created inside it.
                    await this.createDir(secondPath);
                    for (const course of step.list) {
                        const thirdPath = path.resolve(secondPath, course.title);
                        await this.createDir(thirdPath);
                        for (let i = 0; i < course.list.length; i++) {
                            await this.downloadVideo(course, i, thirdPath);
                        }
                    }
                }
            }
        }
    }


    // start() returns a promise: report a failed run and exit non-zero
    // instead of leaving the rejection unhandled.
    new JkxyDownload().start().catch((error) => {
        console.error('Download run failed:', error);
        process.exitCode = 1;
    });

Xnip2019-09-18_10-30-13

下载完视频总大小为14.65GB,有需要的同学可以留下你的邮箱地址。。

总结

  1. 做爬虫必须先分析目标网站

  2. 避免重复爬和大批量爬

  3. 以上代码有很多不足之处 仅供新手参考 !!!