_ Git - youtube-dl/blob - youtube_dl/extractor/camdemy.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import re
   5
   6 from .common import InfoExtractor
   7 from ..compat import (
   8     compat_urllib_parse_urlencode,
   9     compat_urlparse,
  10 )
  11 from ..utils import (
  12     clean_html,
  13     str_to_int,
  14     unified_strdate,
  15 )
  16
  17
  18 class CamdemyIE(InfoExtractor):
  19     _VALID_URL = r'https?://(?:www\.)?camdemy\.com/media/(?P<id>\d+)'
  20     _TESTS = [{
  21         # single file
  22         'url': 'http://www.camdemy.com/media/5181/',
  23         'md5': '5a5562b6a98b37873119102e052e311b',
  24         'info_dict': {
  25             'id': '5181',
  26             'ext': 'mp4',
  27             'title': 'Ch1-1 Introduction, Signals (02-23-2012)',
  28             'thumbnail': 're:^https?://.*\.jpg$',
  29             'creator': 'ss11spring',
  30             'upload_date': '20130114',
  31             'view_count': int,
  32         }
  33     }, {
  34         # With non-empty description
  35         # webpage returns "No permission or not login"
  36         'url': 'http://www.camdemy.com/media/13885',
  37         'md5': '4576a3bb2581f86c61044822adbd1249',
  38         'info_dict': {
  39             'id': '13885',
  40             'ext': 'mp4',
  41             'title': 'EverCam + Camdemy QuickStart',
  42             'thumbnail': 're:^https?://.*\.jpg$',
  43             'description': 'md5:2a9f989c2b153a2342acee579c6e7db6',
  44             'creator': 'evercam',
  45         }
  46     }, {
  47         # External source (YouTube)
  48         'url': 'http://www.camdemy.com/media/14842',
  49         'info_dict': {
  50             'id': '2vsYQzNIsJo',
  51             'ext': 'mp4',
  52             'title': 'Excel 2013 Tutorial - How to add Password Protection',
  53             'description': 'Excel 2013 Tutorial for Beginners - How to add Password Protection',
  54             'upload_date': '20130211',
  55             'uploader': 'Hun Kim',
  56             'uploader_id': 'hunkimtutorials',
  57         },
  58         'params': {
  59             'skip_download': True,
  60         },
  61     }]
  62
  63     def _real_extract(self, url):
  64         video_id = self._match_id(url)
  65
  66         webpage = self._download_webpage(url, video_id)
  67
  68         src_from = self._html_search_regex(
  69             r"class=['\"]srcFrom['\"][^>]*>Sources?(?:\s+from)?\s*:\s*<a[^>]+(?:href|title)=(['\"])(?P<url>(?:(?!\1).)+)\1",
  70             webpage, 'external source', default=None, group='url')
  71         if src_from:
  72             return self.url_result(src_from)
  73
  74         oembed_obj = self._download_json(
  75             'http://www.camdemy.com/oembed/?format=json&url=' + url, video_id)
  76
  77         thumb_url = oembed_obj['thumbnail_url']
  78         video_folder = compat_urlparse.urljoin(thumb_url, 'video/')
  79         file_list_doc = self._download_xml(
  80             compat_urlparse.urljoin(video_folder, 'fileList.xml'),
  81             video_id, 'Downloading filelist XML')
  82         file_name = file_list_doc.find('./video/item/fileName').text
  83         video_url = compat_urlparse.urljoin(video_folder, file_name)
  84
  85         # Some URLs return "No permission or not login" in a webpage despite being
  86         # freely available via oembed JSON URL (e.g. http://www.camdemy.com/media/13885)
  87         upload_date = unified_strdate(self._search_regex(
  88             r'>published on ([^<]+)<', webpage,
  89             'upload date', default=None))
  90         view_count = str_to_int(self._search_regex(
  91             r'role=["\']viewCnt["\'][^>]*>([\d,.]+) views',
  92             webpage, 'view count', default=None))
  93         description = self._html_search_meta(
  94             'description', webpage, default=None) or clean_html(
  95             oembed_obj.get('description'))
  96
  97         return {
  98             'id': video_id,
  99             'url': video_url,
 100             'title': oembed_obj['title'],
 101             'thumbnail': thumb_url,
 102             'description': description,
 103             'creator': oembed_obj['author_name'],
 104             'duration': oembed_obj['duration'],
 105             'upload_date': upload_date,
 106             'view_count': view_count,
 107         }
 108
 109
 110 class CamdemyFolderIE(InfoExtractor):
 111     _VALID_URL = r'https?://www.camdemy.com/folder/(?P<id>\d+)'
 112     _TESTS = [{
 113         # links with trailing slash
 114         'url': 'http://www.camdemy.com/folder/450',
 115         'info_dict': {
 116             'id': '450',
 117             'title': '信號與系統 2012 & 2011 (Signals and Systems)',
 118         },
 119         'playlist_mincount': 145
 120     }, {
 121         # links without trailing slash
 122         # and multi-page
 123         'url': 'http://www.camdemy.com/folder/853',
 124         'info_dict': {
 125             'id': '853',
 126             'title': '科學計算 - 使用 Matlab'
 127         },
 128         'playlist_mincount': 20
 129     }, {
 130         # with displayMode parameter. For testing the codes to add parameters
 131         'url': 'http://www.camdemy.com/folder/853/?displayMode=defaultOrderByOrg',
 132         'info_dict': {
 133             'id': '853',
 134             'title': '科學計算 - 使用 Matlab'
 135         },
 136         'playlist_mincount': 20
 137     }]
 138
 139     def _real_extract(self, url):
 140         folder_id = self._match_id(url)
 141
 142         # Add displayMode=list so that all links are displayed in a single page
 143         parsed_url = list(compat_urlparse.urlparse(url))
 144         query = dict(compat_urlparse.parse_qsl(parsed_url[4]))
 145         query.update({'displayMode': 'list'})
 146         parsed_url[4] = compat_urllib_parse_urlencode(query)
 147         final_url = compat_urlparse.urlunparse(parsed_url)
 148
 149         page = self._download_webpage(final_url, folder_id)
 150         matches = re.findall(r"href='(/media/\d+/?)'", page)
 151
 152         entries = [self.url_result('http://www.camdemy.com' + media_path)
 153                    for media_path in matches]
 154
 155         folder_title = self._html_search_meta('keywords', page)
 156
 157         return self.playlist_result(entries, folder_id, folder_title)