From: Yen Chi Hsuan Date: Wed, 3 Jun 2015 15:59:52 +0000 (+0800) Subject: Merge branch 'iqiyi' of https://github.com/PeterDing/youtube-dl into PeterDing-iqiyi X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=commitdiff_plain;h=9bac8c57e3db49c6639c115478a36dde8d465ea7;hp=670861bd206ab4063baeb6b80d06a054ce4e1d62;p=youtube-dl Merge branch 'iqiyi' of https://github.com/PeterDing/youtube-dl into PeterDing-iqiyi --- diff --git a/AUTHORS b/AUTHORS index 3410e1fb9..bf2a25cb8 100644 --- a/AUTHORS +++ b/AUTHORS @@ -126,3 +126,4 @@ Matthias Küch Julian Richen Ping O. Mister Hat +Peter Ding diff --git a/README.md b/README.md index e51bb5343..f3d83c89f 100644 --- a/README.md +++ b/README.md @@ -168,7 +168,7 @@ which means you can modify it, redistribute it or use it however you like. --no-progress Do not print progress bar --console-title Display progress in console titlebar -v, --verbose Print various debugging information - --dump-pages Print downloaded pages to debug problems (very verbose) + --dump-pages Print downloaded pages encoded using base64 to debug problems (very verbose) --write-pages Write downloaded intermediary pages to files in the current directory to debug problems --print-traffic Display sent and read HTTP traffic -C, --call-home Contact the youtube-dl server for debugging @@ -220,7 +220,7 @@ which means you can modify it, redistribute it or use it however you like. --embed-thumbnail Embed thumbnail in the audio as cover art --add-metadata Write metadata to the video file --metadata-from-title FORMAT Parse additional metadata like song title / artist from the video title. The format syntax is the same as --output, the parsed - parameters replace existing values. Additional templates: %(album), %(artist). Example: --metadata-from-title "%(artist)s - + parameters replace existing values. Additional templates: %(album)s, %(artist)s. Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like "Coldplay - Paradise" --xattrs Write metadata to the video file's xattrs (using dublin core and xdg standards) --fixup POLICY Automatically correct known faults of the file. One of never (do nothing), warn (only emit a warning), detect_or_warn(the default; diff --git a/docs/supportedsites.md b/docs/supportedsites.md index a4879bd9a..a421ae62b 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -26,8 +26,7 @@ - **anitube.se** - **AnySex** - **Aparat** - - **AppleDailyAnimationNews** - - **AppleDailyRealtimeNews** + - **AppleDaily** - **AppleTrailers** - **archive.org**: archive.org videos - **ARD** @@ -152,7 +151,6 @@ - **fc2** - **fernsehkritik.tv** - **fernsehkritik.tv:postecke** - - **Firedrive** - **Firstpost** - **Flickr** - **Folketinget**: Folketinget (ft.dk; Danish parliament) @@ -230,6 +228,7 @@ - **KanalPlay**: Kanal 5/9/11 Play - **Kankan** - **Karaoketv** + - **KarriereVideos** - **keek** - **KeezMovies** - **KhanAcademy** @@ -322,6 +321,7 @@ - **NosVideo** - **novamov**: NovaMov - **Nowness** + - **NowTV** - **nowvideo**: NowVideo - **npo.nl** - **npo.nl:live** @@ -393,7 +393,6 @@ - **Rte** - **rtl.nl**: rtl.nl and rtlxl.nl - **RTL2** - - **RTLnow** - **RTP** - **RTS**: RTS.ch - **rtve.es:alacarta**: RTVE a la carta @@ -431,7 +430,6 @@ - **smotri:community**: Smotri.com community videos - **smotri:user**: Smotri.com user videos - **Snotr** - - **Sockshare** - **Sohu** - **soundcloud** - **soundcloud:playlist** @@ -564,6 +562,7 @@ - **vier:videos** - **Viewster** - **viki** + - **viki:channel** - **vimeo** - **vimeo:album** - **vimeo:channel** diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index d1953c18f..aa6ec9d9a 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -49,6 +49,7 @@ from .utils import ( ExtractorError, format_bytes, formatSeconds, + HEADRequest, locked_file, make_HTTPS_handler, MaxDownloadsReached, @@ -923,8 +924,9 @@ class YoutubeDL(object): if f.get('vcodec') != 'none' and f.get('acodec') != 'none'] if audiovideo_formats: return audiovideo_formats[format_idx] - # for audio only urls, select the best/worst audio format - elif all(f.get('acodec') != 'none' for f in available_formats): + # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format + elif (all(f.get('acodec') != 'none' for f in available_formats) or + all(f.get('vcodec') != 'none' for f in available_formats)): return available_formats[format_idx] elif format_spec == 'bestaudio': audio_formats = [ @@ -1047,6 +1049,8 @@ class YoutubeDL(object): if not formats: raise ExtractorError('No video formats found!') + formats_dict = {} + # We check that all the formats have the format and format_id fields for i, format in enumerate(formats): if 'url' not in format: @@ -1054,6 +1058,18 @@ class YoutubeDL(object): if format.get('format_id') is None: format['format_id'] = compat_str(i) + format_id = format['format_id'] + if format_id not in formats_dict: + formats_dict[format_id] = [] + formats_dict[format_id].append(format) + + # Make sure all formats have unique format_id + for format_id, ambiguous_formats in formats_dict.items(): + if len(ambiguous_formats) > 1: + for i, format in enumerate(ambiguous_formats): + format['format_id'] = '%s-%d' % (format_id, i) + + for i, format in enumerate(formats): if format.get('format') is None: format['format'] = '{id} - {res}{note}'.format( id=format['format_id'], @@ -1706,7 +1722,8 @@ class YoutubeDL(object): if req_is_string: req = url_escaped else: - req = compat_urllib_request.Request( + req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request + req = req_type( url_escaped, data=req.data, headers=req.headers, origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 85c1b1a3a..be464271a 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -353,8 +353,10 @@ from .ninegag import NineGagIE from .noco import NocoIE from .normalboots import NormalbootsIE from .nosvideo import NosVideoIE +from .nova import NovaIE from .novamov import NovaMovIE from .nowness import NownessIE +from .nowtv import NowTVIE from .nowvideo import NowVideoIE from .npo import ( NPOIE, @@ -401,6 +403,7 @@ from .playfm import PlayFMIE from .playvid import PlayvidIE from .playwire import PlaywireIE from .podomatic import PodomaticIE +from .porn91 import Porn91IE from .pornhd import PornHdIE from .pornhub import ( PornHubIE, @@ -438,7 +441,6 @@ from .roxwel import RoxwelIE from .rtbf import RTBFIE from .rte import RteIE from .rtlnl import RtlNlIE -from .rtlnow import RTLnowIE from .rtl2 import RTL2IE from .rtp import RTPIE from .rts import RTSIE @@ -481,6 +483,10 @@ from .smotri import ( ) from .snotr import SnotrIE from .sohu import SohuIE +from .soompi import ( + SoompiIE, + SoompiShowIE, +) from .soundcloud import ( SoundcloudIE, SoundcloudSetIE, @@ -566,6 +572,7 @@ from .traileraddict import TrailerAddictIE from .trilulilu import TriluliluIE from .trutube import TruTubeIE from .tube8 import Tube8IE +from .tubitv import TubiTvIE from .tudou import TudouIE from .tumblr import TumblrIE from .tunein import TuneInIE diff --git a/youtube_dl/extractor/aftonbladet.py b/youtube_dl/extractor/aftonbladet.py index a117502bc..e0518cf26 100644 --- a/youtube_dl/extractor/aftonbladet.py +++ b/youtube_dl/extractor/aftonbladet.py @@ -6,11 +6,11 @@ from ..utils import int_or_none class AftonbladetIE(InfoExtractor): - _VALID_URL = r'http://tv\.aftonbladet\.se/webbtv.+?(?Particle[0-9]+)\.ab(?:$|[?#])' + _VALID_URL = r'http://tv\.aftonbladet\.se/abtv/articles/(?P[0-9]+)' _TEST = { - 'url': 'http://tv.aftonbladet.se/webbtv/nyheter/vetenskap/rymden/article36015.ab', + 'url': 'http://tv.aftonbladet.se/abtv/articles/36015', 'info_dict': { - 'id': 'article36015', + 'id': '36015', 'ext': 'mp4', 'title': 'Vulkanutbrott i rymden - nu släpper NASA bilderna', 'description': 'Jupiters måne mest aktiv av alla himlakroppar', @@ -25,8 +25,9 @@ class AftonbladetIE(InfoExtractor): # find internal video meta data meta_url = 'http://aftonbladet-play.drlib.aptoma.no/video/%s.json' - internal_meta_id = self._html_search_regex( - r'data-aptomaId="([\w\d]+)"', webpage, 'internal_meta_id') + player_config = self._parse_json(self._html_search_regex( + r'data-player-config="([^"]+)"', webpage, 'player config'), video_id) + internal_meta_id = player_config['videoId'] internal_meta_url = meta_url % internal_meta_id internal_meta_json = self._download_json( internal_meta_url, video_id, 'Downloading video meta data') diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 1c77df47e..41f0c736d 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -76,8 +76,8 @@ class CrunchyrollIE(InfoExtractor): self._login() def _decrypt_subtitles(self, data, iv, id): - data = bytes_to_intlist(data) - iv = bytes_to_intlist(iv) + data = bytes_to_intlist(base64.b64decode(data.encode('utf-8'))) + iv = bytes_to_intlist(base64.b64decode(iv.encode('utf-8'))) id = int(id) def obfuscate_key_aux(count, modulo, start): @@ -179,6 +179,16 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text return output + def _extract_subtitles(self, subtitle): + sub_root = xml.etree.ElementTree.fromstring(subtitle) + return [{ + 'ext': 'srt', + 'data': self._convert_subtitles_to_srt(sub_root), + }, { + 'ext': 'ass', + 'data': self._convert_subtitles_to_ass(sub_root), + }] + def _get_subtitles(self, video_id, webpage): subtitles = {} for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage): @@ -190,25 +200,11 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text data = self._search_regex(r'([^<]+)', sub_page, 'subtitle_data', fatal=False) if not id or not iv or not data: continue - id = int(id) - iv = base64.b64decode(iv) - data = base64.b64decode(data) - subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8') lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False) if not lang_code: continue - sub_root = xml.etree.ElementTree.fromstring(subtitle) - subtitles[lang_code] = [ - { - 'ext': 'srt', - 'data': self._convert_subtitles_to_srt(sub_root), - }, - { - 'ext': 'ass', - 'data': self._convert_subtitles_to_ass(sub_root), - }, - ] + subtitles[lang_code] = self._extract_subtitles(subtitle) return subtitles def _real_extract(self, url): diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index e8d682716..82dc27bc6 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -152,7 +152,7 @@ class FacebookIE(InfoExtractor): raise ExtractorError('Cannot find video formats') video_title = self._html_search_regex( - r'

([^<]*)

', webpage, 'title', + r']*class="uiHeaderTitle"[^>]*>([^<]*)', webpage, 'title', default=None) if not video_title: video_title = self._html_search_regex( diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 9a7b0d25d..96ca398de 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -9,6 +9,8 @@ from .common import InfoExtractor from .youtube import YoutubeIE from ..compat import ( compat_urllib_parse, + compat_urllib_parse_unquote, + compat_urllib_request, compat_urlparse, compat_xml_parse_error, ) @@ -46,6 +48,97 @@ class GenericIE(InfoExtractor): _VALID_URL = r'.*' IE_NAME = 'generic' _TESTS = [ + # Direct link to a video + { + 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4', + 'md5': '67d406c2bcb6af27fa886f31aa934bbe', + 'info_dict': { + 'id': 'trailer', + 'ext': 'mp4', + 'title': 'trailer', + 'upload_date': '20100513', + } + }, + # Direct link to media delivered compressed (until Accept-Encoding is *) + { + 'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac', + 'md5': '128c42e68b13950268b648275386fc74', + 'info_dict': { + 'id': 'FictionJunction-Parallel_Hearts', + 'ext': 'flac', + 'title': 'FictionJunction-Parallel_Hearts', + 'upload_date': '20140522', + }, + 'expected_warnings': [ + 'URL could be a direct video link, returning it as such.' + ] + }, + # Direct download with broken HEAD + { + 'url': 'http://ai-radio.org:8000/radio.opus', + 'info_dict': { + 'id': 'radio', + 'ext': 'opus', + 'title': 'radio', + }, + 'params': { + 'skip_download': True, # infinite live stream + }, + 'expected_warnings': [ + r'501.*Not Implemented' + ], + }, + # Direct link with incorrect MIME type + { + 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', + 'md5': '4ccbebe5f36706d85221f204d7eb5913', + 'info_dict': { + 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', + 'id': '5_Lennart_Poettering_-_Systemd', + 'ext': 'webm', + 'title': '5_Lennart_Poettering_-_Systemd', + 'upload_date': '20141120', + }, + 'expected_warnings': [ + 'URL could be a direct video link, returning it as such.' + ] + }, + # RSS feed + { + 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml', + 'info_dict': { + 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml', + 'title': 'Zero Punctuation', + 'description': 're:.*groundbreaking video review series.*' + }, + 'playlist_mincount': 11, + }, + # RSS feed with enclosure + { + 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', + 'info_dict': { + 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624', + 'ext': 'm4v', + 'upload_date': '20150228', + 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624', + } + }, + # google redirect + { + 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', + 'info_dict': { + 'id': 'cmQHVoWB5FY', + 'ext': 'mp4', + 'upload_date': '20130224', + 'uploader_id': 'TheVerge', + 'description': 're:^Chris Ziegler takes a look at the\.*', + 'uploader': 'The Verge', + 'title': 'First Firefox OS phones side-by-side', + }, + 'params': { + 'skip_download': False, + } + }, { 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html', 'md5': '85b90ccc9d73b4acd9138d3af4c27f89', @@ -125,17 +218,6 @@ class GenericIE(InfoExtractor): 'skip_download': True, # m3u8 download }, }, - # Direct link to a video - { - 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4', - 'md5': '67d406c2bcb6af27fa886f31aa934bbe', - 'info_dict': { - 'id': 'trailer', - 'ext': 'mp4', - 'title': 'trailer', - 'upload_date': '20100513', - } - }, # ooyala video { 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219', @@ -160,22 +242,6 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Ooyala'], }, - # google redirect - { - 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', - 'info_dict': { - 'id': 'cmQHVoWB5FY', - 'ext': 'mp4', - 'upload_date': '20130224', - 'uploader_id': 'TheVerge', - 'description': 're:^Chris Ziegler takes a look at the\.*', - 'uploader': 'The Verge', - 'title': 'First Firefox OS phones side-by-side', - }, - 'params': { - 'skip_download': False, - } - }, # embed.ly video { 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/', @@ -407,16 +473,6 @@ class GenericIE(InfoExtractor): 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com', } }, - # RSS feed - { - 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml', - 'info_dict': { - 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml', - 'title': 'Zero Punctuation', - 'description': 're:.*groundbreaking video review series.*' - }, - 'playlist_mincount': 11, - }, # Multiple brightcove videos # https://github.com/rg3/youtube-dl/issues/2283 { @@ -470,21 +526,6 @@ class GenericIE(InfoExtractor): 'uploader': 'thoughtworks.wistia.com', }, }, - # Direct download with broken HEAD - { - 'url': 'http://ai-radio.org:8000/radio.opus', - 'info_dict': { - 'id': 'radio', - 'ext': 'opus', - 'title': 'radio', - }, - 'params': { - 'skip_download': True, # infinite live stream - }, - 'expected_warnings': [ - r'501.*Not Implemented' - ], - }, # Soundcloud embed { 'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/', @@ -516,21 +557,6 @@ class GenericIE(InfoExtractor): }, 'playlist_mincount': 2, }, - # Direct link with incorrect MIME type - { - 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', - 'md5': '4ccbebe5f36706d85221f204d7eb5913', - 'info_dict': { - 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', - 'id': '5_Lennart_Poettering_-_Systemd', - 'ext': 'webm', - 'title': '5_Lennart_Poettering_-_Systemd', - 'upload_date': '20141120', - }, - 'expected_warnings': [ - 'URL could be a direct video link, returning it as such.' - ] - }, # Cinchcast embed { 'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/', @@ -689,16 +715,6 @@ class GenericIE(InfoExtractor): 'age_limit': 0, }, }, - # RSS feed with enclosure - { - 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', - 'info_dict': { - 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624', - 'ext': 'm4v', - 'upload_date': '20150228', - 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624', - } - }, # Crooks and Liars embed { 'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists', @@ -894,7 +910,7 @@ class GenericIE(InfoExtractor): force_videoid = smuggled_data['force_videoid'] video_id = force_videoid else: - video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0] + video_id = compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) self.to_screen('%s: Requesting header' % video_id) @@ -916,7 +932,9 @@ class GenericIE(InfoExtractor): full_response = None if head_response is False: - full_response = self._request_webpage(url, video_id) + request = compat_urllib_request.Request(url) + request.add_header('Accept-Encoding', '*') + full_response = self._request_webpage(request, video_id) head_response = full_response # Check for direct link to a video @@ -927,7 +945,7 @@ class GenericIE(InfoExtractor): head_response.headers.get('Last-Modified')) return { 'id': video_id, - 'title': os.path.splitext(url_basename(url))[0], + 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), 'direct': True, 'formats': [{ 'format_id': m.group('format_id'), @@ -941,7 +959,17 @@ class GenericIE(InfoExtractor): self._downloader.report_warning('Falling back on generic information extractor.') if not full_response: - full_response = self._request_webpage(url, video_id) + request = compat_urllib_request.Request(url) + # Some webservers may serve compressed content of rather big size (e.g. gzipped flac) + # making it impossible to download only chunk of the file (yet we need only 512kB to + # test whether it's HTML or not). According to youtube-dl default Accept-Encoding + # that will always result in downloading the whole file that is not desirable. + # Therefore for extraction pass we have to override Accept-Encoding to any in order + # to accept raw bytes and being able to download only a chunk. + # It may probably better to solve this by checking Content-Type for application/octet-stream + # after HEAD request finishes, but not sure if we can rely on this. + request.add_header('Accept-Encoding', '*') + full_response = self._request_webpage(request, video_id) # Maybe it's a direct link to a video? # Be careful not to download the whole thing! @@ -953,7 +981,7 @@ class GenericIE(InfoExtractor): head_response.headers.get('Last-Modified')) return { 'id': video_id, - 'title': os.path.splitext(url_basename(url))[0], + 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), 'direct': True, 'url': url, 'upload_date': upload_date, diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py index fe5d95e2c..d692ea79a 100644 --- a/youtube_dl/extractor/imgur.py +++ b/youtube_dl/extractor/imgur.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( int_or_none, js_to_json, @@ -12,7 +13,7 @@ from ..utils import ( class ImgurIE(InfoExtractor): - _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?P[a-zA-Z0-9]+)(?:\.mp4|\.gifv)?' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?P[a-zA-Z0-9]+)' _TESTS = [{ 'url': 'https://i.imgur.com/A61SaA1.gifv', @@ -34,7 +35,8 @@ class ImgurIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage( + compat_urlparse.urljoin(url, video_id), video_id) width = int_or_none(self._search_regex( r'[^?#]+)' + _VALID_URL = r'https?://play\.iprima\.cz/(?:[^/]+/)*(?P[^?#]+)' _TESTS = [{ 'url': 'http://play.iprima.cz/particka/particka-92', @@ -23,7 +24,7 @@ class IPrimaIE(InfoExtractor): 'id': '39152', 'ext': 'flv', 'title': 'Partička (92)', - 'description': 'md5:3740fda51464da35a2d4d0670b8e4fd6', + 'description': 'md5:74e9617e51bca67c3ecfb2c6f9766f45', 'thumbnail': 'http://play.iprima.cz/sites/default/files/image_crops/image_620x349/3/491483_particka-92_image_620x349.jpg', }, 'params': { @@ -35,13 +36,14 @@ class IPrimaIE(InfoExtractor): 'id': '9718337', 'ext': 'flv', 'title': 'Tchibo Partička - Jarní móda', - 'description': 'md5:589f8f59f414220621ff8882eb3ce7be', 'thumbnail': 're:^http:.*\.jpg$', }, 'params': { 'skip_download': True, # requires rtmpdump }, - 'skip': 'Do not have permission to access this page', + }, { + 'url': 'http://play.iprima.cz/zpravy-ftv-prima-2752015', + 'only_matching': True, }] def _real_extract(self, url): @@ -102,8 +104,10 @@ class IPrimaIE(InfoExtractor): return { 'id': real_id, - 'title': self._og_search_title(webpage), + 'title': remove_end(self._og_search_title(webpage), ' | Prima PLAY'), 'thumbnail': self._og_search_thumbnail(webpage), 'formats': formats, - 'description': self._og_search_description(webpage), + 'description': self._search_regex( + r']+itemprop="description"[^>]*>([^<]+)', + webpage, 'description', default=None), } diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py new file mode 100644 index 000000000..4e999b237 --- /dev/null +++ b/youtube_dl/extractor/nova.py @@ -0,0 +1,138 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class NovaIE(InfoExtractor): + IE_DESC = 'TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz' + _VALID_URL = 'http://(?:[^.]+\.)?(?Ptv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P[^/]+)(?:\.html|/?)' + _TESTS = [{ + 'url': 'http://tvnoviny.nova.cz/clanek/novinky/co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou.html', + 'info_dict': { + 'id': '1608920', + 'display_id': 'co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou', + 'ext': 'flv', + 'title': 'Duel: Michal Hrdlička a Petr Suchoň', + 'description': 'md5:d0cc509858eee1b1374111c588c6f5d5', + 'thumbnail': 're:^https?://.*\.(?:jpg)', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + 'url': 'http://tn.nova.cz/clanek/tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci.html', + 'md5': '1dd7b9d5ea27bc361f110cd855a19bd3', + 'info_dict': { + 'id': '1757139', + 'display_id': 'tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci', + 'ext': 'mp4', + 'title': 'Podzemní nemocnice v pražské Krči', + 'description': 'md5:f0a42dd239c26f61c28f19e62d20ef53', + 'thumbnail': 're:^https?://.*\.(?:jpg)', + } + }, { + 'url': 'http://novaplus.nova.cz/porad/policie-modrava/video/5591-policie-modrava-15-dil-blondynka-na-hrbitove/', + 'info_dict': { + 'id': '1756825', + 'display_id': '5591-policie-modrava-15-dil-blondynka-na-hrbitove', + 'ext': 'flv', + 'title': 'Policie Modrava - 15. díl - Blondýnka na hřbitově', + 'description': 'md5:d804ba6b30bc7da2705b1fea961bddfe', + 'thumbnail': 're:^https?://.*\.(?:jpg)', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + 'url': 'http://sport.tn.nova.cz/clanek/sport/hokej/nhl/zivot-jde-dal-hodnotil-po-vyrazeni-z-playoff-jiri-sekac.html', + 'only_matching': True, + }, { + 'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html', + 'only_matching': True, + }, { + 'url': 'http://doma.nova.cz/clanek/zdravi/prijdte-se-zapsat-do-registru-kostni-drene-jiz-ve-stredu-3-cervna.html', + 'only_matching': True, + }, { + 'url': 'http://prask.nova.cz/clanek/novinky/co-si-na-sobe-nase-hvezdy-nechaly-pojistit.html', + 'only_matching': True, + }, { + 'url': 'http://tv.nova.cz/clanek/novinky/zivot-je-zivot-bondovsky-trailer.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('id') + site = mobj.group('site') + + webpage = self._download_webpage(url, display_id) + + video_id = self._search_regex( + [r"(?:media|video_id)\s*:\s*'(\d+)'", + r'media=(\d+)', + r'id="article_video_(\d+)"', + r'id="player_(\d+)"'], + webpage, 'video id') + + config_url = self._search_regex( + r'src="(http://tn\.nova\.cz/bin/player/videojs/config\.php\?[^"]+)"', + webpage, 'config url', default=None) + + if not config_url: + DEFAULT_SITE_ID = '23000' + SITES = { + 'tvnoviny': DEFAULT_SITE_ID, + 'novaplus': DEFAULT_SITE_ID, + 'vymena': DEFAULT_SITE_ID, + 'krasna': DEFAULT_SITE_ID, + 'fanda': '30', + 'tn': '30', + 'doma': '30', + } + + site_id = self._search_regex( + r'site=(\d+)', webpage, 'site id', default=None) or SITES.get(site, DEFAULT_SITE_ID) + + config_url = ('http://tn.nova.cz/bin/player/videojs/config.php?site=%s&media=%s&jsVar=vjsconfig' + % (site_id, video_id)) + + config = self._download_json( + config_url, display_id, + 'Downloading config JSON', + transform_source=lambda s: re.sub(r'var\s+[\da-zA-Z_]+\s*=\s*({.+?});', r'\1', s)) + + mediafile = config['mediafile'] + video_url = mediafile['src'] + + m = re.search(r'^(?Prtmpe?://[^/]+/(?P[^/]+?))/&*(?P.+)$', video_url) + if m: + formats = [{ + 'url': m.group('url'), + 'app': m.group('app'), + 'play_path': m.group('playpath'), + 'player_path': 'http://tvnoviny.nova.cz/static/shared/app/videojs/video-js.swf', + 'ext': 'flv', + }] + else: + formats = [{ + 'url': video_url, + }] + self._sort_formats(formats) + + title = mediafile.get('meta', {}).get('title') or self._og_search_title(webpage) + description = self._og_search_description(webpage) + thumbnail = config.get('poster') + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'formats': formats, + } diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py new file mode 100644 index 000000000..173e46cd8 --- /dev/null +++ b/youtube_dl/extractor/nowtv.py @@ -0,0 +1,192 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + parse_iso8601, + parse_duration, + remove_start, +) + + +class NowTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?nowtv\.de/(?Prtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P.+?)/player' + + _TESTS = [{ + # rtl + 'url': 'http://www.nowtv.de/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/player', + 'info_dict': { + 'id': '203519', + 'display_id': 'bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit', + 'ext': 'mp4', + 'title': 'Die neuen Bauern und eine Hochzeit', + 'description': 'md5:e234e1ed6d63cf06be5c070442612e7e', + 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1432580700, + 'upload_date': '20150525', + 'duration': 2786, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # rtl2 + 'url': 'http://www.nowtv.de/rtl2/berlin-tag-nacht/berlin-tag-nacht-folge-934/player', + 'info_dict': { + 'id': '203481', + 'display_id': 'berlin-tag-nacht/berlin-tag-nacht-folge-934', + 'ext': 'mp4', + 'title': 'Berlin - Tag & Nacht (Folge 934)', + 'description': 'md5:c85e88c2e36c552dfe63433bc9506dd0', + 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1432666800, + 'upload_date': '20150526', + 'duration': 2641, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # rtlnitro + 'url': 'http://www.nowtv.de/rtlnitro/alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00/player', + 'info_dict': { + 'id': '165780', + 'display_id': 'alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00', + 'ext': 'mp4', + 'title': 'Hals- und Beinbruch', + 'description': 'md5:b50d248efffe244e6f56737f0911ca57', + 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1432415400, + 'upload_date': '20150523', + 'duration': 2742, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # superrtl + 'url': 'http://www.nowtv.de/superrtl/medicopter-117/angst/player', + 'info_dict': { + 'id': '99205', + 'display_id': 'medicopter-117/angst', + 'ext': 'mp4', + 'title': 'Angst!', + 'description': 'md5:30cbc4c0b73ec98bcd73c9f2a8c17c4e', + 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1222632900, + 'upload_date': '20080928', + 'duration': 3025, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # ntv + 'url': 'http://www.nowtv.de/ntv/ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch/player', + 'info_dict': { + 'id': '203521', + 'display_id': 'ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch', + 'ext': 'mp4', + 'title': 'Thema u.a.: Der erste Blick: Die Apple Watch', + 'description': 'md5:4312b6c9d839ffe7d8caf03865a531af', + 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1432751700, + 'upload_date': '20150527', + 'duration': 1083, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # vox + 'url': 'http://www.nowtv.de/vox/der-hundeprofi/buero-fall-chihuahua-joel/player', + 'info_dict': { + 'id': '128953', + 'display_id': 'der-hundeprofi/buero-fall-chihuahua-joel', + 'ext': 'mp4', + 'title': "Büro-Fall / Chihuahua 'Joel'", + 'description': 'md5:e62cb6bf7c3cc669179d4f1eb279ad8d', + 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1432408200, + 'upload_date': '20150523', + 'duration': 3092, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('id') + station = mobj.group('station') + + info = self._download_json( + 'https://api.nowtv.de/v3/movies/%s?fields=*,format,files' % display_id, + display_id) + + video_id = compat_str(info['id']) + + files = info['files'] + if not files: + if info.get('geoblocked', False): + raise ExtractorError( + 'Video %s is not available from your location due to geo restriction' % video_id, + expected=True) + if not info.get('free', True): + raise ExtractorError( + 'Video %s is not available for free' % video_id, expected=True) + + f = info.get('format', {}) + station = f.get('station') or station + + STATIONS = { + 'rtl': 'rtlnow', + 'rtl2': 'rtl2now', + 'vox': 'voxnow', + 'nitro': 'rtlnitronow', + 'ntv': 'n-tvnow', + 'superrtl': 'superrtlnow' + } + + formats = [] + for item in files['items']: + item_path = remove_start(item['path'], '/') + tbr = int_or_none(item['bitrate']) + m3u8_url = 'http://hls.fra.%s.de/hls-vod-enc/%s.m3u8' % (STATIONS[station], item_path) + m3u8_url = m3u8_url.replace('now/', 'now/videos/') + formats.append({ + 'url': m3u8_url, + 'format_id': '%s-%sk' % (item['id'], tbr), + 'ext': 'mp4', + 'tbr': tbr, + }) + self._sort_formats(formats) + + title = info['title'] + description = info.get('articleLong') or info.get('articleShort') + timestamp = parse_iso8601(info.get('broadcastStartDate'), ' ') + duration = parse_duration(info.get('duration')) + thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo') + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + } diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py index f179ea200..6cdc2638b 100644 --- a/youtube_dl/extractor/patreon.py +++ b/youtube_dl/extractor/patreon.py @@ -87,7 +87,7 @@ class PatreonIE(InfoExtractor): r'
', webpage, 'attachment URL', default=None) embed = self._html_search_regex( - r'
\s*