From: remitamine Date: Fri, 4 Dec 2015 08:10:02 +0000 (+0100) Subject: Merge pull request #7686 from remitamine/acast X-Git-Url: http://git.bitcoin.ninja/index.cgi?p=youtube-dl;a=commitdiff_plain;h=07b88cffcea87326dd7ab42baf40fbcbc106dd26;hp=50e12e9df186b475d0e500f61426d9e72d5b4320 Merge pull request #7686 from remitamine/acast [acast] Add new extractor --- diff --git a/Makefile b/Makefile index fdb1abb60..f826c1685 100644 --- a/Makefile +++ b/Makefile @@ -61,34 +61,34 @@ youtube-dl: youtube_dl/*.py youtube_dl/*/*.py chmod a+x youtube-dl README.md: youtube_dl/*.py youtube_dl/*/*.py - COLUMNS=80 python youtube_dl/__main__.py --help | python devscripts/make_readme.py + COLUMNS=80 $(PYTHON) youtube_dl/__main__.py --help | $(PYTHON) devscripts/make_readme.py CONTRIBUTING.md: README.md - python devscripts/make_contributing.py README.md CONTRIBUTING.md + $(PYTHON) devscripts/make_contributing.py README.md CONTRIBUTING.md supportedsites: - python devscripts/make_supportedsites.py docs/supportedsites.md + $(PYTHON) devscripts/make_supportedsites.py docs/supportedsites.md README.txt: README.md pandoc -f markdown -t plain README.md -o README.txt youtube-dl.1: README.md - python devscripts/prepare_manpage.py >youtube-dl.1.temp.md + $(PYTHON) devscripts/prepare_manpage.py >youtube-dl.1.temp.md pandoc -s -f markdown -t man youtube-dl.1.temp.md -o youtube-dl.1 rm -f youtube-dl.1.temp.md youtube-dl.bash-completion: youtube_dl/*.py youtube_dl/*/*.py devscripts/bash-completion.in - python devscripts/bash-completion.py + $(PYTHON) devscripts/bash-completion.py bash-completion: youtube-dl.bash-completion youtube-dl.zsh: youtube_dl/*.py youtube_dl/*/*.py devscripts/zsh-completion.in - python devscripts/zsh-completion.py + $(PYTHON) devscripts/zsh-completion.py zsh-completion: youtube-dl.zsh youtube-dl.fish: youtube_dl/*.py youtube_dl/*/*.py devscripts/fish-completion.in - python devscripts/fish-completion.py + $(PYTHON) devscripts/fish-completion.py fish-completion: youtube-dl.fish diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 92765a3f9..b5a3e1167 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -13,6 +13,7 @@ from ..utils import ( encodeArgument, encodeFilename, sanitize_open, + handle_youtubedl_headers, ) @@ -33,9 +34,10 @@ class HlsFD(FileDownloader): if info_dict['http_headers'] and re.match(r'^https?://', url): # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. + headers = handle_youtubedl_headers(info_dict['http_headers']) args += [ '-headers', - ''.join('%s: %s\r\n' % (key, val) for key, val in info_dict['http_headers'].items() if key.lower() != 'accept-encoding')] + ''.join('%s: %s\r\n' % (key, val) for key, val in headers.items())] args += ['-i', url, '-f', 'mp4', '-c', 'copy', '-bsf:a', 'aac_adtstoasc'] diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f7594832f..c1dd87550 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -10,6 +10,8 @@ from .acast import ( from .addanime import AddAnimeIE from .adobetv import ( AdobeTVIE, + AdobeTVShowIE, + AdobeTVChannelIE, AdobeTVVideoIE, ) from .adultswim import AdultSwimIE @@ -42,6 +44,7 @@ from .arte import ( ) from .atresplayer import AtresPlayerIE from .atttechchannel import ATTTechChannelIE +from .audimedia import AudiMediaIE from .audiomack import AudiomackIE, AudiomackAlbumIE from .azubu import AzubuIE from .baidu import BaiduVideoIE @@ -204,6 +207,7 @@ from .freesound import FreesoundIE from .freespeech import FreespeechIE from .freevideo import FreeVideoIE from .funnyordie import FunnyOrDieIE +from .gameinformer import GameInformerIE from .gamekings import GamekingsIE from .gameone import ( GameOneIE, @@ -558,6 +562,10 @@ from .shahid import ShahidIE from .shared import SharedIE from .sharesix import ShareSixIE from .sina import SinaIE +from .skynewsarabia import ( + SkyNewsArabiaIE, + SkyNewsArabiaArticleIE, +) from .slideshare import SlideshareIE from .slutload import SlutloadIE from .smotri import ( diff --git a/youtube_dl/extractor/adobetv.py b/youtube_dl/extractor/adobetv.py index 5e43adc51..8753ee2cf 100644 --- a/youtube_dl/extractor/adobetv.py +++ b/youtube_dl/extractor/adobetv.py @@ -1,23 +1,32 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( parse_duration, unified_strdate, str_to_int, + int_or_none, float_or_none, ISO639Utils, + determine_ext, ) -class AdobeTVIE(InfoExtractor): - _VALID_URL = r'https?://tv\.adobe\.com/watch/[^/]+/(?P[^/]+)' +class AdobeTVBaseIE(InfoExtractor): + _API_BASE_URL = 'http://tv.adobe.com/api/v4/' + + +class AdobeTVIE(AdobeTVBaseIE): + _VALID_URL = r'https?://tv\.adobe\.com/(?:(?Pfr|de|es|jp)/)?watch/(?P[^/]+)/(?P[^/]+)' _TEST = { 'url': 'http://tv.adobe.com/watch/the-complete-picture-with-julieanne-kost/quick-tip-how-to-draw-a-circle-around-an-object-in-photoshop/', 'md5': '9bc5727bcdd55251f35ad311ca74fa1e', 'info_dict': { - 'id': 'quick-tip-how-to-draw-a-circle-around-an-object-in-photoshop', + 'id': '10981', 'ext': 'mp4', 'title': 'Quick Tip - How to Draw a Circle Around an Object in Photoshop', 'description': 'md5:99ec318dc909d7ba2a1f2b038f7d2311', @@ -29,50 +38,106 @@ class AdobeTVIE(InfoExtractor): } def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - player = self._parse_json( - self._search_regex(r'html5player:\s*({.+?})\s*\n', webpage, 'player'), - video_id) + language, show_urlname, urlname = re.match(self._VALID_URL, url).groups() + if not language: + language = 'en' - title = player.get('title') or self._search_regex( - r'data-title="([^"]+)"', webpage, 'title') - description = self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) - - upload_date = unified_strdate( - self._html_search_meta('datepublished', webpage, 'upload date')) - - duration = parse_duration( - self._html_search_meta('duration', webpage, 'duration') or - self._search_regex( - r'Runtime:\s*(\d{2}:\d{2}:\d{2})', - webpage, 'duration', fatal=False)) - - view_count = str_to_int(self._search_regex( - r'
\s*Views?:\s*([\d,.]+)\s*
', - webpage, 'view count')) + video_data = self._download_json( + self._API_BASE_URL + 'episode/get/?language=%s&show_urlname=%s&urlname=%s&disclosure=standard' % (language, show_urlname, urlname), + urlname)['data'][0] formats = [{ - 'url': source['src'], - 'format_id': source.get('quality') or source['src'].split('-')[-1].split('.')[0] or None, - 'tbr': source.get('bitrate'), - } for source in player['sources']] + 'url': source['url'], + 'format_id': source.get('quality_level') or source['url'].split('-')[-1].split('.')[0] or None, + 'width': int_or_none(source.get('width')), + 'height': int_or_none(source.get('height')), + 'tbr': int_or_none(source.get('video_data_rate')), + } for source in video_data['videos']] self._sort_formats(formats) return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'upload_date': upload_date, - 'duration': duration, - 'view_count': view_count, + 'id': compat_str(video_data['id']), + 'title': video_data['title'], + 'description': video_data.get('description'), + 'thumbnail': video_data.get('thumbnail'), + 'upload_date': unified_strdate(video_data.get('start_date')), + 'duration': parse_duration(video_data.get('duration')), + 'view_count': str_to_int(video_data.get('playcount')), 'formats': formats, } +class AdobeTVPlaylistBaseIE(AdobeTVBaseIE): + def _parse_page_data(self, page_data): + return [self.url_result(self._get_element_url(element_data)) for element_data in page_data] + + def _extract_playlist_entries(self, url, display_id): + page = self._download_json(url, display_id) + entries = self._parse_page_data(page['data']) + for page_num in range(2, page['paging']['pages'] + 1): + entries.extend(self._parse_page_data( + self._download_json(url + '&page=%d' % page_num, display_id)['data'])) + return entries + + +class AdobeTVShowIE(AdobeTVPlaylistBaseIE): + _VALID_URL = r'https?://tv\.adobe\.com/(?:(?Pfr|de|es|jp)/)?show/(?P[^/]+)' + + _TEST = { + 'url': 'http://tv.adobe.com/show/the-complete-picture-with-julieanne-kost', + 'info_dict': { + 'id': '36', + 'title': 'The Complete Picture with Julieanne Kost', + 'description': 'md5:fa50867102dcd1aa0ddf2ab039311b27', + }, + 'playlist_mincount': 136, + } + + def _get_element_url(self, element_data): + return element_data['urls'][0] + + def _real_extract(self, url): + language, show_urlname = re.match(self._VALID_URL, url).groups() + if not language: + language = 'en' + query = 'language=%s&show_urlname=%s' % (language, show_urlname) + + show_data = self._download_json(self._API_BASE_URL + 'show/get/?%s' % query, show_urlname)['data'][0] + + return self.playlist_result( + self._extract_playlist_entries(self._API_BASE_URL + 'episode/?%s' % query, show_urlname), + compat_str(show_data['id']), + show_data['show_name'], + show_data['show_description']) + + +class AdobeTVChannelIE(AdobeTVPlaylistBaseIE): + _VALID_URL = r'https?://tv\.adobe\.com/(?:(?Pfr|de|es|jp)/)?channel/(?P[^/]+)(?:/(?P[^/]+))?' + + _TEST = { + 'url': 'http://tv.adobe.com/channel/development', + 'info_dict': { + 'id': 'development', + }, + 'playlist_mincount': 96, + } + + def _get_element_url(self, element_data): + return element_data['url'] + + def _real_extract(self, url): + language, channel_urlname, category_urlname = re.match(self._VALID_URL, url).groups() + if not language: + language = 'en' + query = 'language=%s&channel_urlname=%s' % (language, channel_urlname) + if category_urlname: + query += '&category_urlname=%s' % category_urlname + + return self.playlist_result( + self._extract_playlist_entries(self._API_BASE_URL + 'show/?%s' % query, channel_urlname), + channel_urlname) + + class AdobeTVVideoIE(InfoExtractor): _VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P\d+)' @@ -91,28 +156,25 @@ class AdobeTVVideoIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - player_params = self._parse_json(self._search_regex( - r'var\s+bridge\s*=\s*([^;]+);', webpage, 'player parameters'), - video_id) + video_data = self._download_json(url + '?format=json', video_id) formats = [{ + 'format_id': '%s-%s' % (determine_ext(source['src']), source.get('height')), 'url': source['src'], - 'width': source.get('width'), - 'height': source.get('height'), - 'tbr': source.get('bitrate'), - } for source in player_params['sources']] + 'width': int_or_none(source.get('width')), + 'height': int_or_none(source.get('height')), + 'tbr': int_or_none(source.get('bitrate')), + } for source in video_data['sources']] + self._sort_formats(formats) # For both metadata and downloaded files the duration varies among # formats. I just pick the max one duration = max(filter(None, [ float_or_none(source.get('duration'), scale=1000) - for source in player_params['sources']])) + for source in video_data['sources']])) subtitles = {} - for translation in player_params.get('translations', []): + for translation in video_data.get('translations', []): lang_id = translation.get('language_w3c') or ISO639Utils.long2short(translation['language_medium']) if lang_id not in subtitles: subtitles[lang_id] = [] @@ -124,8 +186,9 @@ class AdobeTVVideoIE(InfoExtractor): return { 'id': video_id, 'formats': formats, - 'title': player_params['title'], - 'description': self._og_search_description(webpage), + 'title': video_data['title'], + 'description': video_data.get('description'), + 'thumbnail': video_data['video'].get('poster'), 'duration': duration, 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/audimedia.py b/youtube_dl/extractor/audimedia.py new file mode 100644 index 000000000..b0b089dee --- /dev/null +++ b/youtube_dl/extractor/audimedia.py @@ -0,0 +1,80 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, + sanitized_Request, +) + + +class AudiMediaIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?audimedia\.tv/(?:en|de)/vid/(?P[^/?#]+)' + _TEST = { + 'url': 'https://audimedia.tv/en/vid/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test', + 'md5': '79a8b71c46d49042609795ab59779b66', + 'info_dict': { + 'id': '1564', + 'ext': 'mp4', + 'title': '60 Seconds of Audi Sport 104/2015 - WEC Bahrain, Rookie Test', + 'description': 'md5:60e5d30a78ced725f7b8d34370762941', + 'upload_date': '20151124', + 'timestamp': 1448354940, + 'duration': 74022, + 'view_count': int, + } + } + # extracted from https://audimedia.tv/assets/embed/embedded-player.js (dataSourceAuthToken) + _AUTH_TOKEN = 'e25b42847dba18c6c8816d5d8ce94c326e06823ebf0859ed164b3ba169be97f2' + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + raw_payload = self._search_regex(r']+class="amtv-embed"[^>]+id="([^"]+)"', webpage, 'raw payload') + _, stage_mode, video_id, lang = raw_payload.split('-') + + # TODO: handle s and e stage_mode (live streams and ended live streams) + if stage_mode not in ('s', 'e'): + request = sanitized_Request( + 'https://audimedia.tv/api/video/v1/videos/%s?embed[]=video_versions&embed[]=thumbnail_image&where[content_language_iso]=%s' % (video_id, lang), + headers={'X-Auth-Token': self._AUTH_TOKEN}) + json_data = self._download_json(request, video_id)['results'] + formats = [] + + stream_url_hls = json_data.get('stream_url_hls') + if stream_url_hls: + m3u8_formats = self._extract_m3u8_formats(stream_url_hls, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + + stream_url_hds = json_data.get('stream_url_hds') + if stream_url_hds: + f4m_formats = self._extract_f4m_formats(json_data.get('stream_url_hds') + '?hdcore=3.4.0', video_id, -1, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) + + for video_version in json_data.get('video_versions'): + video_version_url = video_version.get('download_url') or video_version.get('stream_url') + if not video_version_url: + continue + formats.append({ + 'url': video_version_url, + 'width': int_or_none(video_version.get('width')), + 'height': int_or_none(video_version.get('height')), + 'abr': int_or_none(video_version.get('audio_bitrate')), + 'vbr': int_or_none(video_version.get('video_bitrate')), + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': json_data['title'], + 'description': json_data.get('subtitle'), + 'thumbnail': json_data.get('thumbnail_image', {}).get('file'), + 'timestamp': parse_iso8601(json_data.get('publication_date')), + 'duration': int_or_none(json_data.get('duration')), + 'view_count': int_or_none(json_data.get('view_count')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 33b296eaf..7fb80aa38 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -22,7 +22,8 @@ from ..compat import ( class BBCCoUkIE(InfoExtractor): IE_NAME = 'bbc.co.uk' IE_DESC = 'BBC iPlayer' - _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:programmes/(?!articles/)|iplayer(?:/[^/]+)?/(?:episode/|playlist/))|music/clips[/#])(?P[\da-z]{8})' + _ID_REGEX = r'[pb][\da-z]{7}' + _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:programmes/(?!articles/)|iplayer(?:/[^/]+)?/(?:episode/|playlist/))|music/clips[/#])(?P%s)' % _ID_REGEX _MEDIASELECTOR_URLS = [ # Provides HQ HLS streams with even better quality that pc mediaset but fails @@ -465,7 +466,7 @@ class BBCCoUkIE(InfoExtractor): if not programme_id: programme_id = self._search_regex( - r'"vpid"\s*:\s*"([\da-z]{8})"', webpage, 'vpid', fatal=False, default=None) + r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None) if programme_id: formats, subtitles = self._download_media_selector(programme_id) @@ -780,8 +781,9 @@ class BBCIE(BBCCoUkIE): # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) programme_id = self._search_regex( - [r'data-video-player-vpid="([\da-z]{8})"', - r']+name="externalIdentifier"[^>]+value="([\da-z]{8})"'], + [r'data-video-player-vpid="(%s)"' % self._ID_REGEX, + r']+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX, + r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX], webpage, 'vpid', default=None) if programme_id: @@ -816,7 +818,7 @@ class BBCIE(BBCCoUkIE): # Multiple video article (e.g. # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460) - EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+[\da-z]{8}(?:\b[^"]+)?' + EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX entries = [] for match in extract_all(r'new\s+SMP\(({.+?})\)'): embed_url = match.get('playerSettings', {}).get('externalEmbedUrl') diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py index 61bc2f744..e63c2ac00 100644 --- a/youtube_dl/extractor/beeg.py +++ b/youtube_dl/extractor/beeg.py @@ -1,6 +1,11 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import ( + compat_chr, + compat_ord, + compat_urllib_parse_unquote, +) from ..utils import ( int_or_none, parse_iso8601, @@ -29,7 +34,24 @@ class BeegIE(InfoExtractor): video_id = self._match_id(url) video = self._download_json( - 'http://beeg.com/api/v1/video/%s' % video_id, video_id) + 'http://beeg.com/api/v3/video/%s' % video_id, video_id) + + def decrypt_key(key): + # Reverse engineered from http://static.beeg.com/cpl/1067.js + a = '8RPUUCS35ZWp3ADnKcSmpH71ZusrROo' + e = compat_urllib_parse_unquote(key) + return ''.join([ + compat_chr(compat_ord(e[n]) - compat_ord(a[n % len(a)]) % 25) + for n in range(len(e))]) + + def decrypt_url(encrypted_url): + encrypted_url = self._proto_relative_url( + encrypted_url.replace('{DATA_MARKERS}', ''), 'http:') + key = self._search_regex( + r'/key=(.*?)%2Cend=', encrypted_url, 'key', default=None) + if not key: + return encrypted_url + return encrypted_url.replace(key, decrypt_key(key)) formats = [] for format_id, video_url in video.items(): @@ -40,7 +62,7 @@ class BeegIE(InfoExtractor): if not height: continue formats.append({ - 'url': self._proto_relative_url(video_url.replace('{DATA_MARKERS}', ''), 'http:'), + 'url': decrypt_url(video_url), 'format_id': format_id, 'height': int(height), }) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 6c66a1236..59beb11bc 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -2,143 +2,109 @@ from __future__ import unicode_literals import re -import itertools -import json from .common import InfoExtractor -from ..compat import ( - compat_etree_fromstring, -) +from ..compat import compat_str from ..utils import ( int_or_none, - unified_strdate, + unescapeHTML, ExtractorError, + xpath_text, ) class BiliBiliIE(InfoExtractor): - _VALID_URL = r'http://www\.bilibili\.(?:tv|com)/video/av(?P[0-9]+)/' + _VALID_URL = r'http://www\.bilibili\.(?:tv|com)/video/av(?P\d+)(?:/index_(?P\d+).html)?' _TESTS = [{ 'url': 'http://www.bilibili.tv/video/av1074402/', 'md5': '2c301e4dab317596e837c3e7633e7d86', 'info_dict': { - 'id': '1074402_part1', + 'id': '1554319', 'ext': 'flv', 'title': '【金坷垃】金泡沫', - 'duration': 308, + 'duration': 308313, 'upload_date': '20140420', 'thumbnail': 're:^https?://.+\.jpg', + 'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923', + 'timestamp': 1397983878, + 'uploader': '菊子桑', }, }, { 'url': 'http://www.bilibili.com/video/av1041170/', 'info_dict': { 'id': '1041170', 'title': '【BD1080P】刀语【诸神&异域】', + 'description': '这是个神奇的故事~每个人不留弹幕不给走哦~切利哦!~', + 'uploader': '枫叶逝去', + 'timestamp': 1396501299, }, 'playlist_count': 9, }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - if '(此视频不存在或被删除)' in webpage: - raise ExtractorError( - 'The video does not exist or was deleted', expected=True) - - if '>你没有权限浏览! 由于版权相关问题 我们不对您所在的地区提供服务<' in webpage: - raise ExtractorError( - 'The video is not available in your region due to copyright reasons', - expected=True) - - video_code = self._search_regex( - r'(?s)
(.*?)
', webpage, 'video code') - - title = self._html_search_meta( - 'media:title', video_code, 'title', fatal=True) - duration_str = self._html_search_meta( - 'duration', video_code, 'duration') - if duration_str is None: - duration = None - else: - duration_mobj = re.match( - r'^T(?:(?P[0-9]+)H)?(?P[0-9]+)M(?P[0-9]+)S$', - duration_str) - duration = ( - int_or_none(duration_mobj.group('hours'), default=0) * 3600 + - int(duration_mobj.group('minutes')) * 60 + - int(duration_mobj.group('seconds'))) - upload_date = unified_strdate(self._html_search_meta( - 'uploadDate', video_code, fatal=False)) - thumbnail = self._html_search_meta( - 'thumbnailUrl', video_code, 'thumbnail', fatal=False) - - cid = self._search_regex(r'cid=(\d+)', webpage, 'cid') - - entries = [] - - lq_page = self._download_webpage( - 'http://interface.bilibili.com/v_cdn_play?appkey=1&cid=%s' % cid, - video_id, - note='Downloading LQ video info' + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + page_num = mobj.group('page_num') or '1' + + view_data = self._download_json( + 'http://api.bilibili.com/view?type=json&appkey=8e9fc618fbd41e28&id=%s&page=%s' % (video_id, page_num), + video_id) + if 'error' in view_data: + raise ExtractorError('%s said: %s' % (self.IE_NAME, view_data['error']), expected=True) + + cid = view_data['cid'] + title = unescapeHTML(view_data['title']) + + doc = self._download_xml( + 'http://interface.bilibili.com/v_cdn_play?appkey=8e9fc618fbd41e28&cid=%s' % cid, + cid, + 'Downloading page %s/%s' % (page_num, view_data['pages']) ) - try: - err_info = json.loads(lq_page) - raise ExtractorError( - 'BiliBili said: ' + err_info['error_text'], expected=True) - except ValueError: - pass - lq_doc = compat_etree_fromstring(lq_page) - lq_durls = lq_doc.findall('./durl') + if xpath_text(doc, './result') == 'error': + raise ExtractorError('%s said: %s' % (self.IE_NAME, xpath_text(doc, './message')), expected=True) - hq_doc = self._download_xml( - 'http://interface.bilibili.com/playurl?appkey=1&cid=%s' % cid, - video_id, - note='Downloading HQ video info', - fatal=False, - ) - if hq_doc is not False: - hq_durls = hq_doc.findall('./durl') - assert len(lq_durls) == len(hq_durls) - else: - hq_durls = itertools.repeat(None) + entries = [] - i = 1 - for lq_durl, hq_durl in zip(lq_durls, hq_durls): + for durl in doc.findall('./durl'): + size = xpath_text(durl, ['./filesize', './size']) formats = [{ - 'format_id': 'lq', - 'quality': 1, - 'url': lq_durl.find('./url').text, - 'filesize': int_or_none( - lq_durl.find('./size'), get_attr='text'), + 'url': durl.find('./url').text, + 'filesize': int_or_none(size), + 'ext': 'flv', }] - if hq_durl is not None: - formats.append({ - 'format_id': 'hq', - 'quality': 2, - 'ext': 'flv', - 'url': hq_durl.find('./url').text, - 'filesize': int_or_none( - hq_durl.find('./size'), get_attr='text'), - }) - self._sort_formats(formats) + backup_urls = durl.find('./backup_url') + if backup_urls is not None: + for backup_url in backup_urls.findall('./url'): + formats.append({'url': backup_url.text}) + formats.reverse() entries.append({ - 'id': '%s_part%d' % (video_id, i), + 'id': '%s_part%s' % (cid, xpath_text(durl, './order')), 'title': title, + 'duration': int_or_none(xpath_text(durl, './length'), 1000), 'formats': formats, - 'duration': duration, - 'upload_date': upload_date, - 'thumbnail': thumbnail, }) - i += 1 - - return { - '_type': 'multi_video', - 'entries': entries, - 'id': video_id, - 'title': title + info = { + 'id': compat_str(cid), + 'title': title, + 'description': view_data.get('description'), + 'thumbnail': view_data.get('pic'), + 'uploader': view_data.get('author'), + 'timestamp': int_or_none(view_data.get('created')), + 'view_count': int_or_none(view_data.get('play')), + 'duration': int_or_none(xpath_text(doc, './timelength')), } + + if len(entries) == 1: + entries[0].update(info) + return entries[0] + else: + info.update({ + '_type': 'multi_video', + 'id': video_id, + 'entries': entries, + }) + return info diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py index 11ace91dd..ebeef8f2a 100644 --- a/youtube_dl/extractor/bloomberg.py +++ b/youtube_dl/extractor/bloomberg.py @@ -6,7 +6,7 @@ from .common import InfoExtractor class BloombergIE(InfoExtractor): - _VALID_URL = r'https?://www\.bloomberg\.com/news/[^/]+/[^/]+/(?P[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?bloomberg\.com/(?:[^/]+/)*(?P[^/?#]+)' _TESTS = [{ 'url': 'http://www.bloomberg.com/news/videos/b/aaeae121-5949-481e-a1ce-4562db6f5df2', @@ -20,22 +20,36 @@ class BloombergIE(InfoExtractor): }, { 'url': 'http://www.bloomberg.com/news/articles/2015-11-12/five-strange-things-that-have-been-happening-in-financial-markets', 'only_matching': True, + }, { + 'url': 'http://www.bloomberg.com/politics/videos/2015-11-25/karl-rove-on-jeb-bush-s-struggles-stopping-trump', + 'only_matching': True, }] def _real_extract(self, url): name = self._match_id(url) webpage = self._download_webpage(url, name) - video_id = self._search_regex(r'"bmmrId":"(.+?)"', webpage, 'id') + video_id = self._search_regex( + r'["\']bmmrId["\']\s*:\s*(["\'])(?P.+?)\1', + webpage, 'id', group='url') title = re.sub(': Video$', '', self._og_search_title(webpage)) embed_info = self._download_json( 'http://www.bloomberg.com/api/embed?id=%s' % video_id, video_id) formats = [] for stream in embed_info['streams']: - if stream["muxing_format"] == "TS": - formats.extend(self._extract_m3u8_formats(stream['url'], video_id)) + stream_url = stream.get('url') + if not stream_url: + continue + if stream['muxing_format'] == 'TS': + m3u8_formats = self._extract_m3u8_formats( + stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) else: - formats.extend(self._extract_f4m_formats(stream['url'], video_id)) + f4m_formats = self._extract_f4m_formats( + stream_url, video_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/byutv.py b/youtube_dl/extractor/byutv.py index 3b2de517e..ce25816f0 100644 --- a/youtube_dl/extractor/byutv.py +++ b/youtube_dl/extractor/byutv.py @@ -14,9 +14,10 @@ class BYUtvIE(InfoExtractor): 'info_dict': { 'id': 'studio-c-season-5-episode-5', 'ext': 'mp4', - 'description': 'md5:5438d33774b6bdc662f9485a340401cc', + 'description': 'md5:e07269172baff037f8e8bf9956bc9747', 'title': 'Season 5 Episode 5', - 'thumbnail': 're:^https?://.*\.jpg$' + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 1486486, }, 'params': { 'skip_download': True, diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py index 7af903571..d142e326f 100644 --- a/youtube_dl/extractor/clipfish.py +++ b/youtube_dl/extractor/clipfish.py @@ -4,11 +4,8 @@ import re from .common import InfoExtractor from ..utils import ( - determine_ext, int_or_none, - js_to_json, - parse_iso8601, - remove_end, + unified_strdate, ) @@ -21,48 +18,37 @@ class ClipfishIE(InfoExtractor): 'id': '3966754', 'ext': 'mp4', 'title': 'FIFA 14 - E3 2013 Trailer', - 'timestamp': 1370938118, + 'description': 'Video zu FIFA 14: E3 2013 Trailer', 'upload_date': '20130611', 'duration': 82, + 'view_count': int, } } def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + video_info = self._download_json('http://www.clipfish.de/devapi/id/%s?format=json&apikey=hbbtv' % video_id, video_id)['items'][0] - video_info = self._parse_json( - js_to_json(self._html_search_regex( - '(?s)videoObject\s*=\s*({.+?});', webpage, 'video object')), - video_id) - - formats = [] - for video_url in re.findall(r'var\s+videourl\s*=\s*"([^"]+)"', webpage): - ext = determine_ext(video_url) - if ext == 'm3u8': - formats.append({ - 'url': video_url.replace('de.hls.fra.clipfish.de', 'hls.fra.clipfish.de'), - 'ext': 'mp4', - 'format_id': 'hls', - }) - else: - formats.append({ - 'url': video_url, - 'format_id': ext, - }) - self._sort_formats(formats) - - title = remove_end(self._og_search_title(webpage), ' - Video') - thumbnail = self._og_search_thumbnail(webpage) - duration = int_or_none(video_info.get('length')) - timestamp = parse_iso8601(self._html_search_meta('uploadDate', webpage, 'upload date')) + formats = [{ + 'url': video_info['media_videourl_hls'].replace('de.hls.fra.clipfish.de', 'hls.fra.clipfish.de'), + 'ext': 'mp4', + 'format_id': 'hls', + },{ + 'url': video_info['media_videourl'], + 'format_id': 'mp4', + 'width': int_or_none(video_info.get('width')), + 'height': int_or_none(video_info.get('height')), + 'tbr': int_or_none(video_info.get('bitrate')), + }] return { 'id': video_id, - 'title': title, + 'title': video_info['title'], + 'description': video_info.get('descr'), 'formats': formats, - 'thumbnail': thumbnail, - 'duration': duration, - 'timestamp': timestamp, + 'thumbnail': video_info.get('media_content_thumbnail_large') or video_info.get('media_thumbnail'), + 'duration': int_or_none(video_info.get('media_length')), + 'upload_date': unified_strdate(video_info.get('pubDate')), + 'view_count': int_or_none(video_info.get('media_views')) } diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index eb9bfa3d1..6ab2d68d6 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -167,7 +167,7 @@ class InfoExtractor(object): "ext" will be calculated from URL if missing automatic_captions: Like 'subtitles', used by the YoutubeIE for automatically generated captions - duration: Length of the video in seconds, as an integer. + duration: Length of the video in seconds, as an integer or float. view_count: How many users have watched the video on the platform. like_count: Number of positive ratings of the video dislike_count: Number of negative ratings of the video diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index fbefd37d0..7b685d157 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -9,6 +9,7 @@ from ..utils import ( find_xpath_attr, smuggle_url, determine_ext, + ExtractorError, ) from .senateisvp import SenateISVPIE @@ -18,33 +19,32 @@ class CSpanIE(InfoExtractor): IE_DESC = 'C-SPAN' _TESTS = [{ 'url': 'http://www.c-span.org/video/?313572-1/HolderonV', - 'md5': '8e44ce11f0f725527daccc453f553eb0', + 'md5': '94b29a4f131ff03d23471dd6f60b6a1d', 'info_dict': { 'id': '315139', 'ext': 'mp4', 'title': 'Attorney General Eric Holder on Voting Rights Act Decision', - 'description': 'Attorney General Eric Holder spoke to reporters following the Supreme Court decision in Shelby County v. Holder in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced until Congress established new guidelines for review.', + 'description': 'Attorney General Eric Holder speaks to reporters following the Supreme Court decision in [Shelby County v. Holder], in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced.', }, 'skip': 'Regularly fails on travis, for unknown reasons', }, { 'url': 'http://www.c-span.org/video/?c4486943/cspan-international-health-care-models', - # For whatever reason, the served video alternates between - # two different ones + 'md5': '8e5fbfabe6ad0f89f3012a7943c1287b', 'info_dict': { - 'id': '340723', + 'id': 'c4486943', 'ext': 'mp4', - 'title': 'International Health Care Models', + 'title': 'CSPAN - International Health Care Models', 'description': 'md5:7a985a2d595dba00af3d9c9f0783c967', } }, { 'url': 'http://www.c-span.org/video/?318608-1/gm-ignition-switch-recall', - 'md5': '446562a736c6bf97118e389433ed88d4', + 'md5': '2ae5051559169baadba13fc35345ae74', 'info_dict': { 'id': '342759', 'ext': 'mp4', 'title': 'General Motors Ignition Switch Recall', 'duration': 14848, - 'description': 'md5:70c7c3b8fa63fa60d42772440596034c' + 'description': 'md5:118081aedd24bf1d3b68b3803344e7f3' }, }, { # Video from senate.gov @@ -57,67 +57,77 @@ class CSpanIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - page_id = mobj.group('id') - webpage = self._download_webpage(url, page_id) - video_id = self._search_regex(r'progid=\'?([0-9]+)\'?>', webpage, 'video id') + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + matches = re.search(r'data-(prog|clip)id=\'([0-9]+)\'', webpage) + if matches: + video_type, video_id = matches.groups() + if video_type == 'prog': + video_type = 'program' + else: + senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) + if senate_isvp_url: + title = self._og_search_title(webpage) + surl = smuggle_url(senate_isvp_url, {'force_title': title}) + return self.url_result(surl, 'SenateISVP', video_id, title) - description = self._html_search_regex( - [ - # The full description - r'
(.*?)(.*?)

' - ], - webpage, 'description', flags=re.DOTALL, default=None) + def get_text_attr(d, attr): + return d.get(attr, {}).get('#text') - info_url = 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=program&id=' + video_id - data = self._download_json(info_url, video_id) + data = self._download_json( + 'http://www.c-span.org/assets/player/ajax-player.php?os=android&html5=%s&id=%s' % (video_type, video_id), + video_id)['video'] + if data['@status'] != 'Success': + raise ExtractorError('%s said: %s' % (self.IE_NAME, get_text_attr(data, 'error')), expected=True) doc = self._download_xml( - 'http://www.c-span.org/common/services/flashXml.php?programid=' + video_id, + 'http://www.c-span.org/common/services/flashXml.php?%sid=%s' % (video_type, video_id), video_id) + description = self._html_search_meta('description', webpage) + title = find_xpath_attr(doc, './/string', 'name', 'title').text thumbnail = find_xpath_attr(doc, './/string', 'name', 'poster').text - senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) - if senate_isvp_url: - surl = smuggle_url(senate_isvp_url, {'force_title': title}) - return self.url_result(surl, 'SenateISVP', video_id, title) - - files = data['video']['files'] - try: - capfile = data['video']['capfile']['#text'] - except KeyError: - capfile = None + files = data['files'] + capfile = get_text_attr(data, 'capfile') - entries = [{ - 'id': '%s_%d' % (video_id, partnum + 1), - 'title': ( - title if len(files) == 1 else - '%s part %d' % (title, partnum + 1)), - 'url': unescapeHTML(f['path']['#text']), - 'description': description, - 'thumbnail': thumbnail, - 'duration': int_or_none(f.get('length', {}).get('#text')), - 'subtitles': { - 'en': [{ - 'url': capfile, - 'ext': determine_ext(capfile, 'dfxp') - }], - } if capfile else None, - } for partnum, f in enumerate(files)] + entries = [] + for partnum, f in enumerate(files): + formats = [] + for quality in f['qualities']: + formats.append({ + 'format_id': '%s-%sp' % (get_text_attr(quality, 'bitrate'), get_text_attr(quality, 'height')), + 'url': unescapeHTML(get_text_attr(quality, 'file')), + 'height': int_or_none(get_text_attr(quality, 'height')), + 'tbr': int_or_none(get_text_attr(quality, 'bitrate')), + }) + self._sort_formats(formats) + entries.append({ + 'id': '%s_%d' % (video_id, partnum + 1), + 'title': ( + title if len(files) == 1 else + '%s part %d' % (title, partnum + 1)), + 'formats': formats, + 'description': description, + 'thumbnail': thumbnail, + 'duration': int_or_none(get_text_attr(f, 'length')), + 'subtitles': { + 'en': [{ + 'url': capfile, + 'ext': determine_ext(capfile, 'dfxp') + }], + } if capfile else None, + }) if len(entries) == 1: entry = dict(entries[0]) - entry['id'] = video_id + entry['id'] = 'c' + video_id if video_type == 'clip' else video_id return entry else: return { '_type': 'playlist', 'entries': entries, 'title': title, - 'id': video_id, + 'id': 'c' + video_id if video_type == 'clip' else video_id, } diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index fd854411b..321eec59e 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -164,7 +164,7 @@ class FacebookIE(InfoExtractor): if not video_title: video_title = self._html_search_regex( r'(?s)(.*?)', - webpage, 'alternative title', fatal=False) + webpage, 'alternative title', default=None) video_title = limit_length(video_title, 80) if not video_title: video_title = 'Facebook video #%s' % video_id diff --git a/youtube_dl/extractor/gameinformer.py b/youtube_dl/extractor/gameinformer.py new file mode 100644 index 000000000..25870c131 --- /dev/null +++ b/youtube_dl/extractor/gameinformer.py @@ -0,0 +1,43 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import int_or_none + + +class GameInformerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?gameinformer\.com/(?:[^/]+/)*(?P.+)\.aspx' + _TEST = { + 'url': 'http://www.gameinformer.com/b/features/archive/2015/09/26/replay-animal-crossing.aspx', + 'info_dict': { + 'id': '4515472681001', + 'ext': 'm3u8', + 'title': 'Replay - Animal Crossing', + 'description': 'md5:2e211891b215c85d061adc7a4dd2d930', + 'timestamp': 1443457610706, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + bc_api_url = self._search_regex(r"getVideo\('([^']+)'", webpage, 'brightcove api url') + json_data = self._download_json( + bc_api_url + '&video_fields=id,name,shortDescription,publishedDate,videoStillURL,length,IOSRenditions', + display_id) + + return { + 'id': compat_str(json_data['id']), + 'display_id': display_id, + 'url': json_data['IOSRenditions'][0]['url'], + 'title': json_data['name'], + 'description': json_data.get('shortDescription'), + 'timestamp': int_or_none(json_data.get('publishedDate')), + 'duration': int_or_none(json_data.get('length')), + } diff --git a/youtube_dl/extractor/gametrailers.py b/youtube_dl/extractor/gametrailers.py index a6ab795ae..c3f031d9c 100644 --- a/youtube_dl/extractor/gametrailers.py +++ b/youtube_dl/extractor/gametrailers.py @@ -1,19 +1,62 @@ from __future__ import unicode_literals -from .mtv import MTVServicesInfoExtractor +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_age_limit, + url_basename, +) -class GametrailersIE(MTVServicesInfoExtractor): - _VALID_URL = r'http://www\.gametrailers\.com/(?Pvideos|reviews|full-episodes)/(?P.*?)/(?P.*)' +class GametrailersIE(InfoExtractor): + _VALID_URL = r'http://www\.gametrailers\.com/videos/view/[^/]+/(?P<id>.+)' + _TEST = { - 'url': 'http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer', - 'md5': '4c8e67681a0ea7ec241e8c09b3ea8cf7', + 'url': 'http://www.gametrailers.com/videos/view/gametrailers-com/116437-Just-Cause-3-Review', + 'md5': 'f28c4efa0bdfaf9b760f6507955b6a6a', 'info_dict': { - 'id': '70e9a5d7-cf25-4a10-9104-6f3e7342ae0d', + 'id': '2983958', 'ext': 'mp4', - 'title': 'E3 2013: Debut Trailer', - 'description': 'Faith is back! Check out the World Premiere trailer for Mirror\'s Edge 2 straight from the EA Press Conference at E3 2013!', + 'display_id': '116437-Just-Cause-3-Review', + 'title': 'Just Cause 3 - Review', + 'description': 'It\'s a lot of fun to shoot at things and then watch them explode in Just Cause 3, but should there be more to the experience than that?', }, } - _FEED_URL = 'http://www.gametrailers.com/feeds/mrss' + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + title = self._html_search_regex( + r'<title>(.+?)\|', webpage, 'title').strip() + embed_url = self._proto_relative_url( + self._search_regex( + r'src=\'(//embed.gametrailers.com/embed/[^\']+)\'', webpage, + 'embed url'), + scheme='http:') + video_id = url_basename(embed_url) + embed_page = self._download_webpage(embed_url, video_id) + embed_vars_json = self._search_regex( + r'(?s)var embedVars = (\{.*?\})\s*</script>', embed_page, + 'embed vars') + info = self._parse_json(embed_vars_json, video_id) + + formats = [] + for media in info['media']: + if media['mediaPurpose'] == 'play': + formats.append({ + 'url': media['uri'], + 'height': media['height'], + 'width:': media['width'], + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'formats': formats, + 'thumbnail': info.get('thumbUri'), + 'description': self._og_search_description(webpage), + 'duration': int_or_none(info.get('videoLengthInSeconds')), + 'age_limit': parse_age_limit(info.get('audienceRating')), + } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 5075d131e..b60684f98 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -339,6 +339,7 @@ class GenericIE(InfoExtractor): 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ', 'ext': 'mp4', 'title': '2cc213299525360.mov', # that's what we get + 'duration': 238231, }, 'add_ie': ['Ooyala'], }, @@ -350,6 +351,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': '"Steve Jobs: Man in the Machine" trailer', 'description': 'The first trailer for the Alex Gibney documentary "Steve Jobs: Man in the Machine."', + 'duration': 135427, }, 'params': { 'skip_download': True, @@ -960,8 +962,9 @@ class GenericIE(InfoExtractor): 'info_dict': { 'id': '50YnY4czr4ms1vJ7yz3xzq0excz_pUMs', 'ext': 'mp4', - 'description': 'VIDEO: Index/Match versus VLOOKUP.', + 'description': 'VIDEO: INDEX/MATCH versus VLOOKUP.', 'title': 'This is what separates the Excel masters from the wannabes', + 'duration': 191933, }, 'params': { # m3u8 downloads @@ -1501,7 +1504,7 @@ class GenericIE(InfoExtractor): re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage)) if mobj is not None: - return OoyalaIE._build_url_result(mobj.group('ec')) + return OoyalaIE._build_url_result(smuggle_url(mobj.group('ec'), {'domain': url})) # Look for multiple Ooyala embeds on SBN network websites mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage) @@ -1509,7 +1512,7 @@ class GenericIE(InfoExtractor): embeds = self._parse_json(mobj.group(1), video_id, fatal=False) if embeds: return _playlist_from_matches( - embeds, getter=lambda v: OoyalaIE._url_for_embed_code(v['provider_video_id']), ie='Ooyala') + embeds, getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala') # Look for Aparat videos mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage) diff --git a/youtube_dl/extractor/groupon.py b/youtube_dl/extractor/groupon.py index 8b9e0e2f8..22ff7182f 100644 --- a/youtube_dl/extractor/groupon.py +++ b/youtube_dl/extractor/groupon.py @@ -18,6 +18,8 @@ class GrouponIE(InfoExtractor): 'id': 'tubGNycTo_9Uxg82uESj4i61EYX8nyuf', 'ext': 'mp4', 'title': 'Bikram Yoga Huntington Beach | Orange County', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'duration': 44961, }, }], 'params': { diff --git a/youtube_dl/extractor/howcast.py b/youtube_dl/extractor/howcast.py index 16677f179..165b9f39e 100644 --- a/youtube_dl/extractor/howcast.py +++ b/youtube_dl/extractor/howcast.py @@ -16,6 +16,7 @@ class HowcastIE(InfoExtractor): 'description': 'md5:dbe792e5f6f1489027027bf2eba188a3', 'timestamp': 1276081287, 'upload_date': '20100609', + 'duration': 56823, }, 'params': { # m3u8 download diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index 944096e1c..77a3b49ef 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -1,63 +1,92 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( - remove_end, parse_duration, + int_or_none, + xpath_text, + xpath_attr, ) class NBAIE(InfoExtractor): - _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?:nba/)?video(?P<id>/[^?]*?)/?(?:/index\.html)?(?:\?.*)?$' + _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?P<path>(?:[^/]+/)?video/(?P<id>[^?]*?))/?(?:/index\.html)?(?:\?.*)?$' _TESTS = [{ 'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html', - 'md5': 'c0edcfc37607344e2ff8f13c378c88a4', + 'md5': '9e7729d3010a9c71506fd1248f74e4f4', 'info_dict': { - 'id': '0021200253-okc-bkn-recap.nba', - 'ext': 'mp4', + 'id': '0021200253-okc-bkn-recap', + 'ext': 'flv', 'title': 'Thunder vs. Nets', 'description': 'Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.', 'duration': 181, + 'timestamp': 1354638466, + 'upload_date': '20121204', }, }, { 'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/', 'only_matching': True, }, { - 'url': 'http://watch.nba.com/nba/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba', + 'url': 'http://watch.nba.com/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba', + 'md5': 'b2b39b81cf28615ae0c3360a3f9668c4', 'info_dict': { - 'id': '0041400301-cle-atl-recap.nba', + 'id': '0041400301-cle-atl-recap', 'ext': 'mp4', - 'title': 'NBA GAME TIME | Video: Hawks vs. Cavaliers Game 1', + 'title': 'Hawks vs. Cavaliers Game 1', 'description': 'md5:8094c3498d35a9bd6b1a8c396a071b4d', 'duration': 228, - }, - 'params': { - 'skip_download': True, + 'timestamp': 1432134543, + 'upload_date': '20150520', } }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - video_url = 'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4' + path, video_id = re.match(self._VALID_URL, url).groups() + video_info = self._download_xml('http://www.nba.com/%s.xml' % path, video_id) + video_id = xpath_text(video_info, 'slug') + title = xpath_text(video_info, 'headline') + description = xpath_text(video_info, 'description') + duration = parse_duration(xpath_text(video_info, 'length')) + timestamp = int_or_none(xpath_attr(video_info, 'dateCreated', 'uts')) - shortened_video_id = video_id.rpartition('/')[2] - title = remove_end( - self._og_search_title(webpage, default=shortened_video_id), ' : NBA.com') + thumbnails = [] + for image in video_info.find('images'): + thumbnails.append({ + 'id': image.attrib.get('cut'), + 'url': image.text, + 'width': int_or_none(image.attrib.get('width')), + 'height': int_or_none(image.attrib.get('height')), + }) - description = self._og_search_description(webpage) - duration_str = self._html_search_meta( - 'duration', webpage, 'duration', default=None) - if not duration_str: - duration_str = self._html_search_regex( - r'Duration:</b>\s*(\d+:\d+)', webpage, 'duration', fatal=False) - duration = parse_duration(duration_str) + formats = [] + for video_file in video_info.find('files').iter('file'): + video_url = video_file.text + if video_url.startswith('/'): + continue + if video_url.endswith('.m3u8'): + formats.extend(self._extract_m3u8_formats(video_url, video_id, m3u8_id='hls')) + elif video_url.endswith('.f4m'): + formats.extend(self._extract_f4m_formats(video_url + '?hdcore=3.4.1.1', video_id, f4m_id='hds')) + else: + key = video_file.attrib.get('bitrate') + width, height, bitrate = re.search(r'(\d+)x(\d+)(?:_(\d+))?', key).groups() + formats.append({ + 'format_id': key, + 'url': video_url, + 'width': int_or_none(width), + 'height': int_or_none(height), + 'tbr': int_or_none(bitrate), + }) + self._sort_formats(formats) return { - 'id': shortened_video_id, - 'url': video_url, + 'id': video_id, 'title': title, 'description': description, 'duration': duration, + 'timestamp': timestamp, + 'thumbnails': thumbnails, + 'formats': formats, } diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 8ac38a174..6ff13050d 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( + determine_ext, ExtractorError, float_or_none, parse_duration, @@ -48,12 +49,22 @@ class NRKIE(InfoExtractor): 'http://v8.psapi.nrk.no/mediaelement/%s' % video_id, video_id, 'Downloading media JSON') - if data['usageRights']['isGeoBlocked']: - raise ExtractorError( - 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', - expected=True) + media_url = data.get('mediaUrl') - video_url = data['mediaUrl'] + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81' + if not media_url: + if data['usageRights']['isGeoBlocked']: + raise ExtractorError( + 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', + expected=True) + + if determine_ext(media_url) == 'f4m': + formats = self._extract_f4m_formats( + media_url + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81', video_id, f4m_id='hds') + else: + formats = [{ + 'url': media_url, + 'ext': 'flv', + }] duration = parse_duration(data.get('duration')) @@ -67,12 +78,11 @@ class NRKIE(InfoExtractor): return { 'id': video_id, - 'url': video_url, - 'ext': 'flv', 'title': data['title'], 'description': data['description'], 'duration': duration, 'thumbnail': thumbnail, + 'formats': formats, } diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index a262a9f6d..3b692e903 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -1,108 +1,69 @@ from __future__ import unicode_literals import re -import json import base64 from .common import InfoExtractor from ..utils import ( - unescapeHTML, - ExtractorError, - determine_ext, int_or_none, + float_or_none, + ExtractorError, + unsmuggle_url, ) +from ..compat import compat_urllib_parse class OoyalaBaseIE(InfoExtractor): - def _extract_result(self, info, more_info): - embedCode = info['embedCode'] - video_url = info.get('ipad_url') or info['url'] - - if determine_ext(video_url) == 'm3u8': - formats = self._extract_m3u8_formats(video_url, embedCode, ext='mp4') - else: - formats = [{ - 'url': video_url, - 'ext': 'mp4', - }] - - return { - 'id': embedCode, - 'title': unescapeHTML(info['title']), - 'formats': formats, - 'description': unescapeHTML(more_info['description']), - 'thumbnail': more_info['promo'], + def _extract(self, content_tree_url, video_id, domain='example.org'): + content_tree = self._download_json(content_tree_url, video_id)['content_tree'] + metadata = content_tree[list(content_tree)[0]] + embed_code = metadata['embed_code'] + pcode = metadata.get('asset_pcode') or embed_code + video_info = { + 'id': embed_code, + 'title': metadata['title'], + 'description': metadata.get('description'), + 'thumbnail': metadata.get('thumbnail_image') or metadata.get('promo_image'), + 'duration': int_or_none(metadata.get('duration')), } - def _extract(self, player_url, video_id): - player = self._download_webpage(player_url, video_id) - mobile_url = self._search_regex(r'mobile_player_url="(.+?)&device="', - player, 'mobile player url') - # Looks like some videos are only available for particular devices - # (e.g. http://player.ooyala.com/player.js?embedCode=x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0 - # is only available for ipad) - # Working around with fetching URLs for all the devices found starting with 'unknown' - # until we succeed or eventually fail for each device. - devices = re.findall(r'device\s*=\s*"([^"]+)";', player) - devices.remove('unknown') - devices.insert(0, 'unknown') - for device in devices: - mobile_player = self._download_webpage( - '%s&device=%s' % (mobile_url, device), video_id, - 'Downloading mobile player JS for %s device' % device) - videos_info = self._search_regex( - r'var streams=window.oo_testEnv\?\[\]:eval\("\((\[{.*?}\])\)"\);', - mobile_player, 'info', fatal=False, default=None) - if videos_info: - break - - if not videos_info: - formats = [] + formats = [] + for supported_format in ('mp4', 'm3u8', 'hds', 'rtmp'): auth_data = self._download_json( - 'http://player.ooyala.com/sas/player_api/v1/authorization/embed_code/%s/%s?domain=www.example.org&supportedFormats=mp4,webm' % (video_id, video_id), - video_id) - - cur_auth_data = auth_data['authorization_data'][video_id] - - for stream in cur_auth_data['streams']: - formats.append({ - 'url': base64.b64decode(stream['url']['data'].encode('ascii')).decode('utf-8'), - 'ext': stream.get('delivery_type'), - 'format': stream.get('video_codec'), - 'format_id': stream.get('profile'), - 'width': int_or_none(stream.get('width')), - 'height': int_or_none(stream.get('height')), - 'abr': int_or_none(stream.get('audio_bitrate')), - 'vbr': int_or_none(stream.get('video_bitrate')), - }) - if formats: - return { - 'id': video_id, - 'formats': formats, - 'title': 'Ooyala video', - } - - if not cur_auth_data['authorized']: - raise ExtractorError(cur_auth_data['message'], expected=True) - - if not videos_info: - raise ExtractorError('Unable to extract info') - videos_info = videos_info.replace('\\"', '"') - videos_more_info = self._search_regex( - r'eval\("\(({.*?\\"promo\\".*?})\)"', mobile_player, 'more info').replace('\\"', '"') - videos_info = json.loads(videos_info) - videos_more_info = json.loads(videos_more_info) - - if videos_more_info.get('lineup'): - videos = [self._extract_result(info, more_info) for (info, more_info) in zip(videos_info, videos_more_info['lineup'])] - return { - '_type': 'playlist', - 'id': video_id, - 'title': unescapeHTML(videos_more_info['title']), - 'entries': videos, - } - else: - return self._extract_result(videos_info[0], videos_more_info) + 'http://player.ooyala.com/sas/player_api/v1/authorization/embed_code/%s/%s?' % (pcode, embed_code) + compat_urllib_parse.urlencode({'domain': domain, 'supportedFormats': supported_format}), + video_id, 'Downloading %s JSON' % supported_format) + + cur_auth_data = auth_data['authorization_data'][embed_code] + + if cur_auth_data['authorized']: + for stream in cur_auth_data['streams']: + url = base64.b64decode(stream['url']['data'].encode('ascii')).decode('utf-8') + delivery_type = stream['delivery_type'] + if delivery_type == 'remote_asset': + video_info['url'] = url + return video_info + if delivery_type == 'hls': + formats.extend(self._extract_m3u8_formats(url, embed_code, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + elif delivery_type == 'hds': + formats.extend(self._extract_f4m_formats(url, embed_code, -1, 'hds', fatal=False)) + else: + formats.append({ + 'url': url, + 'ext': stream.get('delivery_type'), + 'vcodec': stream.get('video_codec'), + 'format_id': '%s-%s-%sp' % (stream.get('profile'), delivery_type, stream.get('height')), + 'width': int_or_none(stream.get('width')), + 'height': int_or_none(stream.get('height')), + 'abr': int_or_none(stream.get('audio_bitrate')), + 'vbr': int_or_none(stream.get('video_bitrate')), + 'fps': float_or_none(stream.get('framerate')), + }) + else: + raise ExtractorError('%s said: %s' % (self.IE_NAME, cur_auth_data['message']), expected=True) + self._sort_formats(formats) + + video_info['formats'] = formats + return video_info class OoyalaIE(OoyalaBaseIE): @@ -117,6 +78,7 @@ class OoyalaIE(OoyalaBaseIE): 'ext': 'mp4', 'title': 'Explaining Data Recovery from Hard Drives and SSDs', 'description': 'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.', + 'duration': 853386, }, }, { # Only available for ipad @@ -125,7 +87,7 @@ class OoyalaIE(OoyalaBaseIE): 'id': 'x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0', 'ext': 'mp4', 'title': 'Simulation Overview - Levels of Simulation', - 'description': '', + 'duration': 194948, }, }, { @@ -136,7 +98,8 @@ class OoyalaIE(OoyalaBaseIE): 'info_dict': { 'id': 'FiOG81ZTrvckcchQxmalf4aQj590qTEx', 'ext': 'mp4', - 'title': 'Ooyala video', + 'title': 'Divide Tool Path.mp4', + 'duration': 204405, } } ] @@ -151,9 +114,11 @@ class OoyalaIE(OoyalaBaseIE): ie=cls.ie_key()) def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) embed_code = self._match_id(url) - player_url = 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code - return self._extract(player_url, embed_code) + domain = smuggled_data.get('domain') + content_tree_url = 'http://player.ooyala.com/player_api/v1/content_tree/embed_code/%s/%s' % (embed_code, embed_code) + return self._extract(content_tree_url, embed_code, domain) class OoyalaExternalIE(OoyalaBaseIE): @@ -170,7 +135,7 @@ class OoyalaExternalIE(OoyalaBaseIE): .*?&pcode= ) (?P<pcode>.+?) - (&|$) + (?:&|$) ''' _TEST = { @@ -179,7 +144,7 @@ class OoyalaExternalIE(OoyalaBaseIE): 'id': 'FkYWtmazr6Ed8xmvILvKLWjd4QvYZpzG', 'ext': 'mp4', 'title': 'dm_140128_30for30Shorts___JudgingJewellv2', - 'description': '', + 'duration': 1302000, }, 'params': { # m3u8 download @@ -188,9 +153,6 @@ class OoyalaExternalIE(OoyalaBaseIE): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - partner_id = mobj.group('partner_id') - video_id = mobj.group('id') - pcode = mobj.group('pcode') - player_url = 'http://player.ooyala.com/player.js?externalId=%s:%s&pcode=%s' % (partner_id, video_id, pcode) - return self._extract(player_url, video_id) + partner_id, video_id, pcode = re.match(self._VALID_URL, url).groups() + content_tree_url = 'http://player.ooyala.com/player_api/v1/content_tree/external_id/%s/%s:%s' % (pcode, partner_id, video_id) + return self._extract(content_tree_url, video_id) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 965940a4b..08275687d 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -147,7 +147,8 @@ class PornHubPlaylistIE(InfoExtractor): entries = [ self.url_result('http://www.pornhub.com/%s' % video_url, 'PornHub') - for video_url in set(re.findall('href="/?(view_video\.php\?viewkey=\d+[^"]*)"', webpage)) + for video_url in set(re.findall( + r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"', webpage)) ] playlist = self._parse_json( diff --git a/youtube_dl/extractor/skynewsarabia.py b/youtube_dl/extractor/skynewsarabia.py new file mode 100644 index 000000000..f09fee102 --- /dev/null +++ b/youtube_dl/extractor/skynewsarabia.py @@ -0,0 +1,117 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + parse_iso8601, + parse_duration, +) + + +class SkyNewArabiaBaseIE(InfoExtractor): + _IMAGE_BASE_URL = 'http://www.skynewsarabia.com/web/images' + + def _call_api(self, path, value): + return self._download_json('http://api.skynewsarabia.com/web/rest/v2/%s/%s.json' % (path, value), value) + + def _get_limelight_media_id(self, url): + return self._search_regex(r'/media/[^/]+/([a-z0-9]{32})', url, 'limelight media id') + + def _get_image_url(self, image_path_template, width='1600', height='1200'): + return self._IMAGE_BASE_URL + image_path_template.format(width=width, height=height) + + def _extract_video_info(self, video_data): + video_id = compat_str(video_data['id']) + topic = video_data.get('topicTitle') + return { + '_type': 'url_transparent', + 'url': 'limelight:media:%s' % self._get_limelight_media_id(video_data['videoUrl'][0]['url']), + 'id': video_id, + 'title': video_data['headline'], + 'description': video_data.get('summary'), + 'thumbnail': self._get_image_url(video_data['mediaAsset']['imageUrl']), + 'timestamp': parse_iso8601(video_data.get('date')), + 'duration': parse_duration(video_data.get('runTime')), + 'tags': video_data.get('tags', []), + 'categories': [topic] if topic else [], + 'webpage_url': 'http://www.skynewsarabia.com/web/video/%s' % video_id, + 'ie_key': 'LimelightMedia', + } + + +class SkyNewsArabiaIE(SkyNewArabiaBaseIE): + IE_NAME = 'skynewsarabia:video' + _VALID_URL = r'https?://(?:www\.)?skynewsarabia\.com/web/video/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.skynewsarabia.com/web/video/794902/%D9%86%D8%B5%D9%81-%D9%85%D9%84%D9%8A%D9%88%D9%86-%D9%85%D8%B5%D8%A8%D8%A7%D8%AD-%D8%B4%D8%AC%D8%B1%D8%A9-%D9%83%D8%B1%D9%8A%D8%B3%D9%85%D8%A7%D8%B3', + 'info_dict': { + 'id': '794902', + 'ext': 'flv', + 'title': 'نصف مليون مصباح على شجرة كريسماس', + 'description': 'md5:22f1b27f0850eeb10c7e59b1f16eb7c6', + 'upload_date': '20151128', + 'timestamp': 1448697198, + 'duration': 2119, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._call_api('video', video_id) + return self._extract_video_info(video_data) + + +class SkyNewsArabiaArticleIE(SkyNewArabiaBaseIE): + IE_NAME = 'skynewsarabia:video' + _VALID_URL = r'https?://(?:www\.)?skynewsarabia\.com/web/article/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://www.skynewsarabia.com/web/article/794549/%D8%A7%D9%94%D8%AD%D8%AF%D8%A7%D8%AB-%D8%A7%D9%84%D8%B4%D8%B1%D9%82-%D8%A7%D9%84%D8%A7%D9%94%D9%88%D8%B3%D8%B7-%D8%AE%D8%B1%D9%8A%D8%B7%D8%A9-%D8%A7%D9%84%D8%A7%D9%94%D9%84%D8%B9%D8%A7%D8%A8-%D8%A7%D9%84%D8%B0%D9%83%D9%8A%D8%A9', + 'info_dict': { + 'id': '794549', + 'ext': 'flv', + 'title': 'بالفيديو.. ألعاب ذكية تحاكي واقع المنطقة', + 'description': 'md5:0c373d29919a851e080ee4edd0c5d97f', + 'upload_date': '20151126', + 'timestamp': 1448559336, + 'duration': 281.6, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, { + 'url': 'http://www.skynewsarabia.com/web/article/794844/%D8%A7%D8%B3%D8%AA%D9%87%D8%AF%D8%A7%D9%81-%D9%82%D9%88%D8%A7%D8%B1%D8%A8-%D8%A7%D9%94%D8%B3%D9%84%D8%AD%D8%A9-%D9%84%D9%85%D9%8A%D9%84%D9%8A%D8%B4%D9%8A%D8%A7%D8%AA-%D8%A7%D9%84%D8%AD%D9%88%D8%AB%D9%8A-%D9%88%D8%B5%D8%A7%D9%84%D8%AD', + 'info_dict': { + 'id': '794844', + 'title': 'إحباط تهريب أسلحة لميليشيات الحوثي وصالح بجنوب اليمن', + 'description': 'md5:5c927b8b2e805796e7f693538d96fc7e', + }, + 'playlist_mincount': 2, + }] + + def _real_extract(self, url): + article_id = self._match_id(url) + article_data = self._call_api('article', article_id) + media_asset = article_data['mediaAsset'] + if media_asset['type'] == 'VIDEO': + topic = article_data.get('topicTitle') + return { + '_type': 'url_transparent', + 'url': 'limelight:media:%s' % self._get_limelight_media_id(media_asset['videoUrl'][0]['url']), + 'id': article_id, + 'title': article_data['headline'], + 'description': article_data.get('summary'), + 'thumbnail': self._get_image_url(media_asset['imageUrl']), + 'timestamp': parse_iso8601(article_data.get('date')), + 'tags': article_data.get('tags', []), + 'categories': [topic] if topic else [], + 'webpage_url': url, + 'ie_key': 'LimelightMedia', + } + entries = [self._extract_video_info(item) for item in article_data.get('inlineItems', []) if item['type'] == 'VIDEO'] + return self.playlist_result(entries, article_id, article_data['headline'], article_data.get('summary')) diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index 5bd3c0087..39a7aaf9d 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -58,7 +58,8 @@ class SpiegelIE(InfoExtractor): description = self._html_search_meta('description', webpage, 'description') base_url = self._search_regex( - r'var\s+server\s*=\s*"([^"]+)\"', webpage, 'server URL') + [r'server\s*:\s*(["\'])(?P<url>.+?)\1', r'var\s+server\s*=\s*"(?P<url>[^"]+)\"'], + webpage, 'server URL', group='url') xml_url = base_url + video_id + '.xml' idoc = self._download_xml(xml_url, video_id) diff --git a/youtube_dl/extractor/teachingchannel.py b/youtube_dl/extractor/teachingchannel.py index 117afa9bf..36a6fc679 100644 --- a/youtube_dl/extractor/teachingchannel.py +++ b/youtube_dl/extractor/teachingchannel.py @@ -16,6 +16,7 @@ class TeachingChannelIE(InfoExtractor): 'ext': 'mp4', 'title': 'A History of Teaming', 'description': 'md5:2a9033db8da81f2edffa4c99888140b3', + 'duration': 422255, }, 'params': { # m3u8 download diff --git a/youtube_dl/extractor/trilulilu.py b/youtube_dl/extractor/trilulilu.py index 185accc4b..515632527 100644 --- a/youtube_dl/extractor/trilulilu.py +++ b/youtube_dl/extractor/trilulilu.py @@ -1,80 +1,93 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + int_or_none, + parse_iso8601, +) class TriluliluIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?trilulilu\.ro/(?:video-[^/]+/)?(?P<id>[^/#\?]+)' - _TEST = { - 'url': 'http://www.trilulilu.ro/video-animatie/big-buck-bunny-1', - 'md5': 'c1450a00da251e2769b74b9005601cac', + _VALID_URL = r'https?://(?:(?:www|m)\.)?trilulilu\.ro/(?:[^/]+/)?(?P<id>[^/#\?]+)' + _TESTS = [{ + 'url': 'http://www.trilulilu.ro/big-buck-bunny-1', + 'md5': '68da087b676a6196a413549212f60cc6', 'info_dict': { 'id': 'ae2899e124140b', 'ext': 'mp4', 'title': 'Big Buck Bunny', 'description': ':) pentru copilul din noi', + 'uploader_id': 'chipy', + 'upload_date': '20120304', + 'timestamp': 1330830647, + 'uploader': 'chipy', + 'view_count': int, + 'like_count': int, + 'comment_count': int, }, - } + }, { + 'url': 'http://www.trilulilu.ro/adena-ft-morreti-inocenta', + 'md5': '929dfb8729dc71750463af88bbbbf4a4', + 'info_dict': { + 'id': 'f299710e3c91c5', + 'ext': 'mp4', + 'title': 'Adena ft. Morreti - Inocenta', + 'description': 'pop music', + 'uploader_id': 'VEVOmixt', + 'upload_date': '20151204', + 'uploader': 'VEVOmixt', + 'timestamp': 1449187937, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + }, + }] def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - if re.search(r'Fişierul nu este disponibil pentru vizionare în ţara dumneavoastră', webpage): - raise ExtractorError( - 'This video is not available in your country.', expected=True) - elif re.search('Fişierul poate fi accesat doar de către prietenii lui', webpage): - raise ExtractorError('This video is private.', expected=True) + media_info = self._download_json('http://m.trilulilu.ro/%s?format=json' % display_id, display_id) - flashvars_str = self._search_regex( - r'block_flash_vars\s*=\s*(\{[^\}]+\})', webpage, 'flashvars', fatal=False, default=None) + media_class = media_info.get('class') + if media_class not in ('video', 'audio'): + raise ExtractorError('not a video or an audio') - if flashvars_str: - flashvars = self._parse_json(flashvars_str, display_id) - else: - raise ExtractorError( - 'This page does not contain videos', expected=True) - - if flashvars['isMP3'] == 'true': - raise ExtractorError( - 'Audio downloads are currently not supported', expected=True) - - video_id = flashvars['hash'] - title = self._og_search_title(webpage) - thumbnail = self._og_search_thumbnail(webpage) - description = self._og_search_description(webpage, default=None) + user = media_info.get('user', {}) - format_url = ('http://fs%(server)s.trilulilu.ro/%(hash)s/' - 'video-formats2' % flashvars) - format_doc = self._download_xml( - format_url, video_id, - note='Downloading formats', - errnote='Error while downloading formats') + thumbnail = media_info.get('cover_url') + if thumbnail: + thumbnail.format(width='1600', height='1200') - video_url_template = ( - 'http://fs%(server)s.trilulilu.ro/stream.php?type=video' - '&source=site&hash=%(hash)s&username=%(userid)s&' - 'key=ministhebest&format=%%s&sig=&exp=' % - flashvars) - formats = [ - { - 'format_id': fnode.text.partition('-')[2], - 'url': video_url_template % fnode.text, - 'ext': fnode.text.partition('-')[0] - } - - for fnode in format_doc.findall('./formats/format') - ] + # TODO: get correct ext for audio files + stream_type = media_info.get('stream_type') + formats = [{ + 'url': media_info['href'], + 'ext': stream_type, + }] + if media_info.get('is_hd'): + formats.append({ + 'format_id': 'hd', + 'url': media_info['hrefhd'], + 'ext': stream_type, + }) + if media_class == 'audio': + formats[0]['vcodec'] = 'none' + else: + formats[0]['format_id'] = 'sd' return { - 'id': video_id, + 'id': media_info['identifier'].split('|')[1], 'display_id': display_id, 'formats': formats, - 'title': title, - 'description': description, + 'title': media_info['title'], + 'description': media_info.get('description'), 'thumbnail': thumbnail, + 'uploader_id': user.get('username'), + 'uploader': user.get('fullname'), + 'timestamp': parse_iso8601(media_info.get('published'), ' '), + 'duration': int_or_none(media_info.get('duration')), + 'view_count': int_or_none(media_info.get('count_views')), + 'like_count': int_or_none(media_info.get('count_likes')), + 'comment_count': int_or_none(media_info.get('count_comments')), } diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 825172806..59832b1ec 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -1,14 +1,15 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..compat import ( + compat_HTTPError, compat_urllib_parse, compat_urllib_request, ) from ..utils import ( ExtractorError, + float_or_none, + int_or_none, sanitized_Request, ) @@ -18,6 +19,8 @@ class UdemyIE(InfoExtractor): _VALID_URL = r'https?://www\.udemy\.com/(?:[^#]+#/lecture/|lecture/view/?\?lectureId=)(?P<id>\d+)' _LOGIN_URL = 'https://www.udemy.com/join/login-popup/?displayType=ajax&showSkipButton=1' _ORIGIN_URL = 'https://www.udemy.com' + _SUCCESSFULLY_ENROLLED = '>You have enrolled in this course!<' + _ALREADY_ENROLLED = '>You are already taking this course.<' _NETRC_MACHINE = 'udemy' _TESTS = [{ @@ -33,6 +36,29 @@ class UdemyIE(InfoExtractor): 'skip': 'Requires udemy account credentials', }] + def _enroll_course(self, webpage, course_id): + enroll_url = self._search_regex( + r'href=(["\'])(?P<url>https?://(?:www\.)?udemy\.com/course/subscribe/.+?)\1', + webpage, 'enroll url', group='url', + default='https://www.udemy.com/course/subscribe/?courseId=%s' % course_id) + webpage = self._download_webpage(enroll_url, course_id, 'Enrolling in the course') + if self._SUCCESSFULLY_ENROLLED in webpage: + self.to_screen('%s: Successfully enrolled in' % course_id) + elif self._ALREADY_ENROLLED in webpage: + self.to_screen('%s: Already enrolled in' % course_id) + + def _download_lecture(self, course_id, lecture_id): + return self._download_json( + 'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?%s' % ( + course_id, lecture_id, compat_urllib_parse.urlencode({ + 'video_only': '', + 'auto_play': '', + 'fields[lecture]': 'title,description,asset', + 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data', + 'instructorPreviewMode': 'False', + })), + lecture_id, 'Downloading lecture JSON') + def _handle_error(self, response): if not isinstance(response, dict): return @@ -54,6 +80,7 @@ class UdemyIE(InfoExtractor): headers['X-Udemy-Client-Id'] = cookie.value elif cookie.name == 'access_token': headers['X-Udemy-Bearer-Token'] = cookie.value + headers['X-Udemy-Authorization'] = 'Bearer %s' % cookie.value if isinstance(url_or_request, compat_urllib_request.Request): for header, value in headers.items(): @@ -71,7 +98,7 @@ class UdemyIE(InfoExtractor): def _login(self): (username, password) = self._get_login_info() if username is None: - self.raise_login_required('Udemy account is required') + return login_popup = self._download_webpage( self._LOGIN_URL, None, 'Downloading login popup') @@ -109,44 +136,76 @@ class UdemyIE(InfoExtractor): def _real_extract(self, url): lecture_id = self._match_id(url) - lecture = self._download_json( - 'https://www.udemy.com/api-1.1/lectures/%s' % lecture_id, - lecture_id, 'Downloading lecture JSON') + webpage = self._download_webpage(url, lecture_id) + + course_id = self._search_regex( + r'data-course-id=["\'](\d+)', webpage, 'course id') + + try: + lecture = self._download_lecture(course_id, lecture_id) + except ExtractorError as e: + # Error could possibly mean we are not enrolled in the course + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + self._enroll_course(webpage, course_id) + lecture_id = self._download_lecture(course_id, lecture_id) + else: + raise + + title = lecture['title'] + description = lecture.get('description') - asset_type = lecture.get('assetType') or lecture.get('asset_type') + asset = lecture['asset'] + + asset_type = asset.get('assetType') or asset.get('asset_type') if asset_type != 'Video': raise ExtractorError( 'Lecture %s is not a video' % lecture_id, expected=True) - asset = lecture['asset'] - stream_url = asset.get('streamUrl') or asset.get('stream_url') - mobj = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', stream_url) - if mobj: - return self.url_result(mobj.group(1), 'Youtube') + if stream_url: + youtube_url = self._search_regex( + r'(https?://www\.youtube\.com/watch\?v=.*)', stream_url, 'youtube URL', default=None) + if youtube_url: + return self.url_result(youtube_url, 'Youtube') video_id = asset['id'] thumbnail = asset.get('thumbnailUrl') or asset.get('thumbnail_url') - duration = asset['data']['duration'] - - download_url = asset.get('downloadUrl') or asset.get('download_url') - - video = download_url.get('Video') or download_url.get('video') - video_480p = download_url.get('Video480p') or download_url.get('video_480p') - - formats = [ - { - 'url': video_480p[0], - 'format_id': '360p', - }, - { - 'url': video[0], - 'format_id': '720p', - }, - ] - - title = lecture['title'] - description = lecture['description'] + duration = float_or_none(asset.get('data', {}).get('duration')) + outputs = asset.get('data', {}).get('outputs', {}) + + formats = [] + for format_ in asset.get('download_urls', {}).get('Video', []): + video_url = format_.get('file') + if not video_url: + continue + format_id = format_.get('label') + f = { + 'url': format_['file'], + 'height': int_or_none(format_id), + } + if format_id: + # Some videos contain additional metadata (e.g. + # https://www.udemy.com/ios9-swift/learn/#/lecture/3383208) + output = outputs.get(format_id) + if isinstance(output, dict): + f.update({ + 'format_id': '%sp' % (output.get('label') or format_id), + 'width': int_or_none(output.get('width')), + 'height': int_or_none(output.get('height')), + 'vbr': int_or_none(output.get('video_bitrate_in_kbps')), + 'vcodec': output.get('video_codec'), + 'fps': int_or_none(output.get('frame_rate')), + 'abr': int_or_none(output.get('audio_bitrate_in_kbps')), + 'acodec': output.get('audio_codec'), + 'asr': int_or_none(output.get('audio_sample_rate')), + 'tbr': int_or_none(output.get('total_bitrate_in_kbps')), + 'filesize': int_or_none(output.get('file_size_in_bytes')), + }) + else: + f['format_id'] = '%sp' % format_id + formats.append(f) + + self._sort_formats(formats) return { 'id': video_id, @@ -160,9 +219,7 @@ class UdemyIE(InfoExtractor): class UdemyCourseIE(UdemyIE): IE_NAME = 'udemy:course' - _VALID_URL = r'https?://www\.udemy\.com/(?P<coursepath>[\da-z-]+)' - _SUCCESSFULLY_ENROLLED = '>You have enrolled in this course!<' - _ALREADY_ENROLLED = '>You are already taking this course.<' + _VALID_URL = r'https?://www\.udemy\.com/(?P<id>[\da-z-]+)' _TESTS = [] @classmethod @@ -170,24 +227,18 @@ class UdemyCourseIE(UdemyIE): return False if UdemyIE.suitable(url) else super(UdemyCourseIE, cls).suitable(url) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - course_path = mobj.group('coursepath') + course_path = self._match_id(url) + + webpage = self._download_webpage(url, course_path) response = self._download_json( 'https://www.udemy.com/api-1.1/courses/%s' % course_path, course_path, 'Downloading course JSON') - course_id = int(response['id']) - course_title = response['title'] + course_id = response['id'] + course_title = response.get('title') - webpage = self._download_webpage( - 'https://www.udemy.com/course/subscribe/?courseId=%s' % course_id, - course_id, 'Enrolling in the course') - - if self._SUCCESSFULLY_ENROLLED in webpage: - self.to_screen('%s: Successfully enrolled in' % course_id) - elif self._ALREADY_ENROLLED in webpage: - self.to_screen('%s: Already enrolled in' % course_id) + self._enroll_course(webpage, course_id) response = self._download_json( 'https://www.udemy.com/api-1.1/courses/%s/curriculum' % course_id, diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py index 01af7a995..7df87c31c 100644 --- a/youtube_dl/extractor/vice.py +++ b/youtube_dl/extractor/vice.py @@ -15,6 +15,7 @@ class ViceIE(InfoExtractor): 'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp', 'ext': 'mp4', 'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov', + 'duration': 725983, }, 'params': { # Requires ffmpeg (m3u8 manifest) diff --git a/youtube_dl/extractor/vodlocker.py b/youtube_dl/extractor/vodlocker.py index be0a2780f..357594a11 100644 --- a/youtube_dl/extractor/vodlocker.py +++ b/youtube_dl/extractor/vodlocker.py @@ -3,11 +3,14 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_urllib_parse -from ..utils import sanitized_Request +from ..utils import ( + ExtractorError, + sanitized_Request, +) class VodlockerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vodlocker\.com/(?P<id>[0-9a-zA-Z]+)(?:\..*?)?' + _VALID_URL = r'https?://(?:www\.)?vodlocker\.com/(?:embed-)?(?P<id>[0-9a-zA-Z]+)(?:\..*?)?' _TESTS = [{ 'url': 'http://vodlocker.com/e8wvyzz4sl42', @@ -24,6 +27,12 @@ class VodlockerIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + if any(p in webpage for p in ( + '>THIS FILE WAS DELETED<', + '>File Not Found<', + 'The file you were looking for could not be found, sorry for any inconvenience.<')): + raise ExtractorError('Video %s does not exist' % video_id, expected=True) + fields = self._hidden_inputs(webpage) if fields['op'] == 'download1': diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 1c2420a33..9b39505ba 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -258,7 +258,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): |(?: # or the v= param in all its forms (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) (?:\?|\#!?) # the params delimiter ? or # or #! - (?:.*?&)?? # any other preceding param (like /?s=tuff&v=xxxx) + (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY) v= ) )) @@ -346,6 +346,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug) '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'}, '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'}, @@ -730,6 +731,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'skip_download': True, }, }, + { + 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;v=V36LpHqtcDY', + 'only_matching': True, + } ] def __init__(self, *args, **kwargs): @@ -1475,6 +1480,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): manifest_url = video_info['hlsvp'][0] url_map = self._extract_from_m3u8(manifest_url, video_id) formats = _map_to_format_list(url_map) + # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming + for a_format in formats: + a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True' else: raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') @@ -1559,7 +1567,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor, YoutubePlaylistBaseInfoExtract youtube\.com/ (?: (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries) - \? (?:.*?&)*? (?:p|a|list)= + \? (?:.*?[&;])*? (?:p|a|list)= | p/ ) ( diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index d7b737e21..d0606b4bc 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -663,6 +663,16 @@ def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs): return hc +def handle_youtubedl_headers(headers): + filtered_headers = headers + + if 'Youtubedl-no-compression' in filtered_headers: + filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding') + del filtered_headers['Youtubedl-no-compression'] + + return filtered_headers + + class YoutubeDLHandler(compat_urllib_request.HTTPHandler): """Handler for HTTP requests and responses. @@ -670,7 +680,7 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): the standard headers to every HTTP request and handles gzipped and deflated responses from web servers. If compression is to be avoided in a particular request, the original request in the program code only has - to include the HTTP header "Youtubedl-No-Compression", which will be + to include the HTTP header "Youtubedl-no-compression", which will be removed before making the real request. Part of this code was copied from: @@ -731,10 +741,8 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): # The dict keys are capitalized because of this bug by urllib if h.capitalize() not in req.headers: req.add_header(h, v) - if 'Youtubedl-no-compression' in req.headers: - if 'Accept-encoding' in req.headers: - del req.headers['Accept-encoding'] - del req.headers['Youtubedl-no-compression'] + + req.headers = handle_youtubedl_headers(req.headers) if sys.version_info < (2, 7) and '#' in req.get_full_url(): # Python 2.6 is brain-dead when it comes to fragments