From: Sergey M․ Date: Sat, 13 Jun 2015 22:56:54 +0000 (+0600) Subject: Merge branch 'dramafever' of https://github.com/ping/youtube-dl into ping-dramafever X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=commitdiff_plain;h=ad49fe7c8f22c78a001eb1a53d7df9ef54424b89;hp=137597b0ea88a92d174341b44b8f395b8897a2bf;p=youtube-dl Merge branch 'dramafever' of https://github.com/ping/youtube-dl into ping-dramafever --- diff --git a/AUTHORS b/AUTHORS index 267b8da1e..bf2a25cb8 100644 --- a/AUTHORS +++ b/AUTHORS @@ -124,3 +124,6 @@ Mohammad Teimori Pabandi Roman Le Négrate Matthias Küch Julian Richen +Ping O. +Mister Hat +Peter Ding diff --git a/README.md b/README.md index 3d9436456..f3d83c89f 100644 --- a/README.md +++ b/README.md @@ -17,12 +17,12 @@ youtube-dl - download videos from youtube.com or other video platforms To install it right away for all UNIX users (Linux, OS X, etc.), type: sudo curl https://yt-dl.org/latest/youtube-dl -o /usr/local/bin/youtube-dl - sudo chmod a+x /usr/local/bin/youtube-dl + sudo chmod a+rx /usr/local/bin/youtube-dl If you do not have curl, you can alternatively use a recent wget: sudo wget https://yt-dl.org/downloads/latest/youtube-dl -O /usr/local/bin/youtube-dl - sudo chmod a+x /usr/local/bin/youtube-dl + sudo chmod a+rx /usr/local/bin/youtube-dl Windows users can [download a .exe file](https://yt-dl.org/latest/youtube-dl.exe) and place it in their home directory or any other location on their [PATH](http://en.wikipedia.org/wiki/PATH_%28variable%29). @@ -168,7 +168,7 @@ which means you can modify it, redistribute it or use it however you like. --no-progress Do not print progress bar --console-title Display progress in console titlebar -v, --verbose Print various debugging information - --dump-pages Print downloaded pages to debug problems (very verbose) + --dump-pages Print downloaded pages encoded using base64 to debug problems (very verbose) --write-pages Write downloaded intermediary pages to files in the current directory to debug problems --print-traffic Display sent and read HTTP traffic -C, --call-home Contact the youtube-dl server for debugging @@ -220,7 +220,7 @@ which means you can modify it, redistribute it or use it however you like. --embed-thumbnail Embed thumbnail in the audio as cover art --add-metadata Write metadata to the video file --metadata-from-title FORMAT Parse additional metadata like song title / artist from the video title. The format syntax is the same as --output, the parsed - parameters replace existing values. Additional templates: %(album), %(artist). Example: --metadata-from-title "%(artist)s - + parameters replace existing values. Additional templates: %(album)s, %(artist)s. Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like "Coldplay - Paradise" --xattrs Write metadata to the video file's xattrs (using dublin core and xdg standards) --fixup POLICY Automatically correct known faults of the file. One of never (do nothing), warn (only emit a warning), detect_or_warn(the default; diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 43fbe8b1d..d147b53fe 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -10,6 +10,7 @@ - **56.com** - **5min** - **8tracks** + - **91porn** - **9gag** - **abc.net.au** - **Abc7News** @@ -26,8 +27,7 @@ - **anitube.se** - **AnySex** - **Aparat** - - **AppleDailyAnimationNews** - - **AppleDailyRealtimeNews** + - **AppleDaily** - **AppleTrailers** - **archive.org**: archive.org videos - **ARD** @@ -142,6 +142,7 @@ - **Eporner** - **EroProfile** - **Escapist** + - **ESPN** (Currently broken) - **EveryonesMixtape** - **exfm**: ex.fm - **ExpoTV** @@ -151,7 +152,6 @@ - **fc2** - **fernsehkritik.tv** - **fernsehkritik.tv:postecke** - - **Firedrive** - **Firstpost** - **Flickr** - **Folketinget**: Folketinget (ft.dk; Danish parliament) @@ -229,6 +229,7 @@ - **KanalPlay**: Kanal 5/9/11 Play - **Kankan** - **Karaoketv** + - **KarriereVideos** - **keek** - **KeezMovies** - **KhanAcademy** @@ -319,8 +320,10 @@ - **Noco** - **Normalboots** - **NosVideo** + - **Nova**: TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz - **novamov**: NovaMov - **Nowness** + - **NowTV** - **nowvideo**: NowVideo - **npo.nl** - **npo.nl:live** @@ -338,6 +341,7 @@ - **OktoberfestTV** - **on.aol.com** - **Ooyala** + - **OoyalaExternal** - **OpenFilm** - **orf:fm4**: radio FM4 - **orf:iptv**: iptv.ORF.at @@ -391,7 +395,6 @@ - **Rte** - **rtl.nl**: rtl.nl and rtlxl.nl - **RTL2** - - **RTLnow** - **RTP** - **RTS**: RTS.ch - **rtve.es:alacarta**: RTVE a la carta @@ -429,8 +432,9 @@ - **smotri:community**: Smotri.com community videos - **smotri:user**: Smotri.com user videos - **Snotr** - - **Sockshare** - **Sohu** + - **soompi** + - **soompi:show** - **soundcloud** - **soundcloud:playlist** - **soundcloud:set** @@ -451,6 +455,7 @@ - **Spike** - **Sport5** - **SportBox** + - **SportBoxEmbed** - **SportDeutschland** - **Srf** - **SRMediathek**: Saarländischer Rundfunk @@ -504,12 +509,15 @@ - **Trilulilu** - **TruTube** - **Tube8** + - **TubiTv** - **Tudou** - **Tumblr** - **TuneIn** - **Turbo** - **Tutv** - **tv.dfb.de** + - **TV2** + - **TV2Article** - **TV4**: tv4.se and tv4play.se - **tvigle**: Интернет-телевидение Tvigle.ru - **tvp.pl** @@ -559,6 +567,7 @@ - **vier:videos** - **Viewster** - **viki** + - **viki:channel** - **vimeo** - **vimeo:album** - **vimeo:channel** diff --git a/test/test_aes.py b/test/test_aes.py index 4dc7de7b5..315a3f5ae 100644 --- a/test/test_aes.py +++ b/test/test_aes.py @@ -39,7 +39,7 @@ class TestAES(unittest.TestCase): encrypted = base64.b64encode( intlist_to_bytes(self.iv[:8]) + b'\x17\x15\x93\xab\x8d\x80V\xcdV\xe0\t\xcdo\xc2\xa5\xd8ksM\r\xe27N\xae' - ) + ).decode('utf-8') decrypted = (aes_decrypt_text(encrypted, password, 16)) self.assertEqual(decrypted, self.secret_msg) @@ -47,7 +47,7 @@ class TestAES(unittest.TestCase): encrypted = base64.b64encode( intlist_to_bytes(self.iv[:8]) + b'\x0b\xe6\xa4\xd9z\x0e\xb8\xb9\xd0\xd4i_\x85\x1d\x99\x98_\xe5\x80\xe7.\xbf\xa5\x83' - ) + ).decode('utf-8') decrypted = (aes_decrypt_text(encrypted, password, 32)) self.assertEqual(decrypted, self.secret_msg) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 891ee620b..c4e3adb67 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -266,7 +266,7 @@ class TestNRKSubtitles(BaseTestSubtitles): self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['no'])) - self.assertEqual(md5(subtitles['no']), '1d221e6458c95c5494dcd38e6a1f129a') + self.assertEqual(md5(subtitles['no']), '544fa917d3197fcbee64634559221cc2') class TestRaiSubtitles(BaseTestSubtitles): diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 58b34e087..b1f792d4e 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -49,6 +49,7 @@ from .utils import ( ExtractorError, format_bytes, formatSeconds, + HEADRequest, locked_file, make_HTTPS_handler, MaxDownloadsReached, @@ -923,8 +924,9 @@ class YoutubeDL(object): if f.get('vcodec') != 'none' and f.get('acodec') != 'none'] if audiovideo_formats: return audiovideo_formats[format_idx] - # for audio only urls, select the best/worst audio format - elif all(f.get('acodec') != 'none' for f in available_formats): + # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format + elif (all(f.get('acodec') != 'none' for f in available_formats) or + all(f.get('vcodec') != 'none' for f in available_formats)): return available_formats[format_idx] elif format_spec == 'bestaudio': audio_formats = [ @@ -1014,13 +1016,13 @@ class YoutubeDL(object): info_dict['display_id'] = info_dict['id'] if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None: - # Working around negative timestamps in Windows - # (see http://bugs.python.org/issue1646728) - if info_dict['timestamp'] < 0 and os.name == 'nt': - info_dict['timestamp'] = 0 - upload_date = datetime.datetime.utcfromtimestamp( - info_dict['timestamp']) - info_dict['upload_date'] = upload_date.strftime('%Y%m%d') + # Working around out-of-range timestamp values (e.g. negative ones on Windows, + # see http://bugs.python.org/issue1646728) + try: + upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp']) + info_dict['upload_date'] = upload_date.strftime('%Y%m%d') + except (ValueError, OverflowError, OSError): + pass if self.params.get('listsubtitles', False): if 'automatic_captions' in info_dict: @@ -1047,6 +1049,8 @@ class YoutubeDL(object): if not formats: raise ExtractorError('No video formats found!') + formats_dict = {} + # We check that all the formats have the format and format_id fields for i, format in enumerate(formats): if 'url' not in format: @@ -1054,6 +1058,18 @@ class YoutubeDL(object): if format.get('format_id') is None: format['format_id'] = compat_str(i) + format_id = format['format_id'] + if format_id not in formats_dict: + formats_dict[format_id] = [] + formats_dict[format_id].append(format) + + # Make sure all formats have unique format_id + for format_id, ambiguous_formats in formats_dict.items(): + if len(ambiguous_formats) > 1: + for i, format in enumerate(ambiguous_formats): + format['format_id'] = '%s-%d' % (format_id, i) + + for i, format in enumerate(formats): if format.get('format') is None: format['format'] = '{id} - {res}{note}'.format( id=format['format_id'], @@ -1527,6 +1543,7 @@ class YoutubeDL(object): pps_chain.extend(ie_info['__postprocessors']) pps_chain.extend(self._pps) for pp in pps_chain: + files_to_delete = [] try: files_to_delete, info = pp.run(info) except PostProcessingError as e: @@ -1705,7 +1722,8 @@ class YoutubeDL(object): if req_is_string: req = url_escaped else: - req = compat_urllib_request.Request( + req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request + req = req_type( url_escaped, data=req.data, headers=req.headers, origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) diff --git a/youtube_dl/aes.py b/youtube_dl/aes.py index 07224d508..7817adcfd 100644 --- a/youtube_dl/aes.py +++ b/youtube_dl/aes.py @@ -152,7 +152,7 @@ def aes_decrypt_text(data, password, key_size_bytes): """ NONCE_LENGTH_BYTES = 8 - data = bytes_to_intlist(base64.b64decode(data)) + data = bytes_to_intlist(base64.b64decode(data.encode('utf-8'))) password = bytes_to_intlist(password.encode('utf-8')) key = password[:key_size_bytes] + [0] * (key_size_bytes - len(password)) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index ca857a75f..6fdaf90b2 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -153,10 +153,10 @@ from .extremetube import ExtremeTubeIE from .facebook import FacebookIE from .faz import FazIE from .fc2 import FC2IE -from .firedrive import FiredriveIE from .firstpost import FirstpostIE from .firsttv import FirstTVIE from .fivemin import FiveMinIE +from .fivetv import FiveTVIE from .fktv import ( FKTVIE, FKTVPosteckeIE, @@ -234,6 +234,7 @@ from .infoq import InfoQIE from .instagram import InstagramIE, InstagramUserIE from .internetvideoarchive import InternetVideoArchiveIE from .iprima import IPrimaIE +from .iqiyi import IqiyiIE from .ivi import ( IviIE, IviCompilationIE @@ -248,6 +249,7 @@ from .kaltura import KalturaIE from .kanalplay import KanalPlayIE from .kankan import KankanIE from .karaoketv import KaraoketvIE +from .karrierevideos import KarriereVideosIE from .keezmovies import KeezMoviesIE from .khanacademy import KhanAcademyIE from .kickstarter import KickStarterIE @@ -342,8 +344,7 @@ from .newstube import NewstubeIE from .nextmedia import ( NextMediaIE, NextMediaActionNewsIE, - AppleDailyRealtimeNewsIE, - AppleDailyAnimationNewsIE + AppleDailyIE, ) from .nfb import NFBIE from .nfl import NFLIE @@ -357,8 +358,10 @@ from .ninegag import NineGagIE from .noco import NocoIE from .normalboots import NormalbootsIE from .nosvideo import NosVideoIE +from .nova import NovaIE from .novamov import NovaMovIE from .nowness import NownessIE +from .nowtv import NowTVIE from .nowvideo import NowVideoIE from .npo import ( NPOIE, @@ -405,6 +408,7 @@ from .playfm import PlayFMIE from .playvid import PlayvidIE from .playwire import PlaywireIE from .podomatic import PodomaticIE +from .porn91 import Porn91IE from .pornhd import PornHdIE from .pornhub import ( PornHubIE, @@ -442,7 +446,6 @@ from .roxwel import RoxwelIE from .rtbf import RTBFIE from .rte import RteIE from .rtlnl import RtlNlIE -from .rtlnow import RTLnowIE from .rtl2 import RTL2IE from .rtp import RTPIE from .rts import RTSIE @@ -456,6 +459,7 @@ from .rutube import ( RutubePersonIE, ) from .rutv import RUTVIE +from .ruutu import RuutuIE from .sandia import SandiaIE from .safari import ( SafariIE, @@ -484,8 +488,11 @@ from .smotri import ( SmotriBroadcastIE, ) from .snotr import SnotrIE -from .sockshare import SockshareIE from .sohu import SohuIE +from .soompi import ( + SoompiIE, + SoompiShowIE, +) from .soundcloud import ( SoundcloudIE, SoundcloudSetIE, @@ -571,6 +578,7 @@ from .traileraddict import TrailerAddictIE from .trilulilu import TriluliluIE from .trutube import TruTubeIE from .tube8 import Tube8IE +from .tubitv import TubiTvIE from .tudou import TudouIE from .tumblr import TumblrIE from .tunein import TuneInIE @@ -581,6 +589,10 @@ from .tv2 import ( TV2ArticleIE, ) from .tv4 import TV4IE +from .tvc import ( + TVCIE, + TVCArticleIE, +) from .tvigle import TvigleIE from .tvp import TvpIE, TvpSeriesIE from .tvplay import TVPlayIE @@ -651,7 +663,10 @@ from .vine import ( VineIE, VineUserIE, ) -from .viki import VikiIE +from .viki import ( + VikiIE, + VikiChannelIE, +) from .vk import ( VKIE, VKUserVideosIE, diff --git a/youtube_dl/extractor/aftonbladet.py b/youtube_dl/extractor/aftonbladet.py index a117502bc..e0518cf26 100644 --- a/youtube_dl/extractor/aftonbladet.py +++ b/youtube_dl/extractor/aftonbladet.py @@ -6,11 +6,11 @@ from ..utils import int_or_none class AftonbladetIE(InfoExtractor): - _VALID_URL = r'http://tv\.aftonbladet\.se/webbtv.+?(?Particle[0-9]+)\.ab(?:$|[?#])' + _VALID_URL = r'http://tv\.aftonbladet\.se/abtv/articles/(?P[0-9]+)' _TEST = { - 'url': 'http://tv.aftonbladet.se/webbtv/nyheter/vetenskap/rymden/article36015.ab', + 'url': 'http://tv.aftonbladet.se/abtv/articles/36015', 'info_dict': { - 'id': 'article36015', + 'id': '36015', 'ext': 'mp4', 'title': 'Vulkanutbrott i rymden - nu släpper NASA bilderna', 'description': 'Jupiters mÃ¥ne mest aktiv av alla himlakroppar', @@ -25,8 +25,9 @@ class AftonbladetIE(InfoExtractor): # find internal video meta data meta_url = 'http://aftonbladet-play.drlib.aptoma.no/video/%s.json' - internal_meta_id = self._html_search_regex( - r'data-aptomaId="([\w\d]+)"', webpage, 'internal_meta_id') + player_config = self._parse_json(self._html_search_regex( + r'data-player-config="([^"]+)"', webpage, 'player config'), video_id) + internal_meta_id = player_config['videoId'] internal_meta_url = meta_url % internal_meta_id internal_meta_json = self._download_json( internal_meta_url, video_id, 'Downloading video meta data') diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 8273bd6c9..76de24477 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -7,7 +7,6 @@ from .common import InfoExtractor from ..utils import ( find_xpath_attr, unified_strdate, - get_element_by_id, get_element_by_attribute, int_or_none, qualities, @@ -195,7 +194,9 @@ class ArteTVFutureIE(ArteTVPlus7IE): def _real_extract(self, url): anchor_id, lang = self._extract_url_info(url) webpage = self._download_webpage(url, anchor_id) - row = get_element_by_id(anchor_id, webpage) + row = self._search_regex( + r'(?s)id="%s"[^>]*>.+?(]*arte_vp_url[^>]*>)' % anchor_id, + webpage, 'row') return self._extract_from_webpage(row, anchor_id, lang) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 7ca835e31..bf60450c2 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -3,6 +3,8 @@ from __future__ import unicode_literals import re import itertools +import json +import xml.etree.ElementTree as ET from .common import InfoExtractor from ..utils import ( @@ -67,11 +69,19 @@ class BiliBiliIE(InfoExtractor): entries = [] - lq_doc = self._download_xml( + lq_page = self._download_webpage( 'http://interface.bilibili.com/v_cdn_play?appkey=1&cid=%s' % cid, video_id, note='Downloading LQ video info' ) + try: + err_info = json.loads(lq_page) + raise ExtractorError( + 'BiliBili said: ' + err_info['error_text'], expected=True) + except ValueError: + pass + + lq_doc = ET.fromstring(lq_page) lq_durls = lq_doc.findall('./durl') hq_doc = self._download_xml( @@ -80,9 +90,11 @@ class BiliBiliIE(InfoExtractor): note='Downloading HQ video info', fatal=False, ) - hq_durls = hq_doc.findall('./durl') if hq_doc is not False else itertools.repeat(None) - - assert len(lq_durls) == len(hq_durls) + if hq_doc is not False: + hq_durls = hq_doc.findall('./durl') + assert len(lq_durls) == len(hq_durls) + else: + hq_durls = itertools.repeat(None) i = 1 for lq_durl, hq_durl in zip(lq_durls, hq_durls): @@ -93,7 +105,7 @@ class BiliBiliIE(InfoExtractor): 'filesize': int_or_none( lq_durl.find('./size'), get_attr='text'), }] - if hq_durl: + if hq_durl is not None: formats.append({ 'format_id': 'hq', 'quality': 2, diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 4f60d5366..d768f99e6 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -156,6 +156,28 @@ class BrightcoveIE(InfoExtractor): linkBase = find_param('linkBaseURL') if linkBase is not None: params['linkBaseURL'] = linkBase + return cls._make_brightcove_url(params) + + @classmethod + def _build_brighcove_url_from_js(cls, object_js): + # The layout of JS is as follows: + # customBC.createVideo = function (width, height, playerID, playerKey, videoPlayer, VideoRandomID) { + # // build Brightcove XML + # } + m = re.search( + r'''(?x)customBC.\createVideo\( + .*? # skipping width and height + ["\'](?P\d+)["\']\s*,\s* # playerID + ["\'](?PAQ[^"\']{48})[^"\']*["\']\s*,\s* # playerKey begins with AQ and is 50 characters + # in length, however it's appended to itself + # in places, so truncate + ["\'](?P\d+)["\'] # @videoPlayer + ''', object_js) + if m: + return cls._make_brightcove_url(m.groupdict()) + + @classmethod + def _make_brightcove_url(cls, params): data = compat_urllib_parse.urlencode(params) return cls._FEDERATED_URL_TEMPLATE % data @@ -172,7 +194,7 @@ class BrightcoveIE(InfoExtractor): """Return a list of all Brightcove URLs from the webpage """ url_m = re.search( - r']*?>\s*\s*''', webpage) - return list(filter(None, [cls._build_brighcove_url(m) for m in matches])) + if matches: + return list(filter(None, [cls._build_brighcove_url(m) for m in matches])) + + return list(filter(None, [ + cls._build_brighcove_url_from_js(custom_bc) + for custom_bc in re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)])) def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index 1ceb9d8d9..75fffb156 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -4,12 +4,13 @@ from .common import InfoExtractor class CBSIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?cbs\.com/shows/[^/]+/(?:video|artist)/(?P[^/]+)/.*' + _VALID_URL = r'https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/(?:video|artist)|colbertlateshow\.com/(?:video|podcasts))/[^/]+/(?P[^/]+)' _TESTS = [{ 'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', 'info_dict': { 'id': '4JUVEwq3wUT7', + 'display_id': 'connect-chat-feat-garth-brooks', 'ext': 'flv', 'title': 'Connect Chat feat. Garth Brooks', 'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!', @@ -24,6 +25,7 @@ class CBSIE(InfoExtractor): 'url': 'http://www.cbs.com/shows/liveonletterman/artist/221752/st-vincent/', 'info_dict': { 'id': 'WWF_5KqY3PK1', + 'display_id': 'st-vincent', 'ext': 'flv', 'title': 'Live on Letterman - St. Vincent', 'description': 'Live On Letterman: St. Vincent in concert from New York\'s Ed Sullivan Theater on Tuesday, July 16, 2014.', @@ -34,12 +36,23 @@ class CBSIE(InfoExtractor): 'skip_download': True, }, '_skip': 'Blocked outside the US', + }, { + 'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/', + 'only_matching': True, + }, { + 'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/', + 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) real_id = self._search_regex( - r"video\.settings\.pid\s*=\s*'([^']+)';", + [r"video\.settings\.pid\s*=\s*'([^']+)';", r"cbsplayer\.pid\s*=\s*'([^']+)';"], webpage, 'real video ID') - return self.url_result('theplatform:%s' % real_id) + return { + '_type': 'url_transparent', + 'ie_key': 'ThePlatform', + 'url': 'theplatform:%s' % real_id, + 'display_id': display_id, + } diff --git a/youtube_dl/extractor/chilloutzone.py b/youtube_dl/extractor/chilloutzone.py index c922f6959..0206d96db 100644 --- a/youtube_dl/extractor/chilloutzone.py +++ b/youtube_dl/extractor/chilloutzone.py @@ -57,7 +57,7 @@ class ChilloutzoneIE(InfoExtractor): base64_video_info = self._html_search_regex( r'var cozVidData = "(.+?)";', webpage, 'video data') - decoded_video_info = base64.b64decode(base64_video_info).decode("utf-8") + decoded_video_info = base64.b64decode(base64_video_info.encode('utf-8')).decode('utf-8') video_info_dict = json.loads(decoded_video_info) # get video information from dict diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index cf0a7551b..c949a4814 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -60,6 +60,17 @@ class CinemassacreIE(InfoExtractor): 'uploader_id': 'Cinemassacre', 'title': 'AVGN: McKids', } + }, + { + 'url': 'http://cinemassacre.com/2015/05/25/mario-kart-64-nintendo-64-james-mike-mondays/', + 'md5': '1376908e49572389e7b06251a53cdd08', + 'info_dict': { + 'id': 'Cinemassacre-555779690c440', + 'ext': 'mp4', + 'description': 'Let’s Play Mario Kart 64 !! Mario Kart 64 is a classic go-kart racing game released for the Nintendo 64 (N64). Today James & Mike do 4 player Battle Mode with Kyle and Bootsy!', + 'title': 'Mario Kart 64 (Nintendo 64) James & Mike Mondays', + 'upload_date': '20150525', + } } ] @@ -72,7 +83,7 @@ class CinemassacreIE(InfoExtractor): playerdata_url = self._search_regex( [ - r'src="(http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=.+?)"', + r'src="(http://(?:player2\.screenwavemedia\.com|player\.screenwavemedia\.com/play)/[a-zA-Z]+\.php\?[^"]*\bid=.+?)"', r']+src="((?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"', ], webpage, 'player data URL', default=None) diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py index 3145b3051..5dd69bff7 100644 --- a/youtube_dl/extractor/cnet.py +++ b/youtube_dl/extractor/cnet.py @@ -11,7 +11,7 @@ from ..utils import ( class CNETIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?cnet\.com/videos/(?P[^/]+)/' - _TEST = { + _TESTS = [{ 'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/', 'info_dict': { 'id': '56f4ea68-bd21-4852-b08c-4de5b8354c60', @@ -25,7 +25,20 @@ class CNETIE(InfoExtractor): 'params': { 'skip_download': 'requires rtmpdump', } - } + }, { + 'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/', + 'info_dict': { + 'id': '56527b93-d25d-44e3-b738-f989ce2e49ba', + 'ext': 'flv', + 'description': 'Khail and Ashley wonder what other civic woes can be solved by self-tweeting objects, investigate a new kind of VR camera and watch an origami robot self-assemble, walk, climb, dig and dissolve. #TDPothole', + 'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40', + 'uploader': 'Ashley Esqueda', + 'title': 'Whiny potholes tweet at local government when hit by cars (Tomorrow Daily 187)', + }, + 'params': { + 'skip_download': True, # requires rtmpdump + }, + }] def _real_extract(self, url): display_id = self._match_id(url) @@ -42,7 +55,7 @@ class CNETIE(InfoExtractor): raise ExtractorError('Cannot find video data') mpx_account = data['config']['players']['default']['mpx_account'] - vid = vdata['files']['rtmp'] + vid = vdata['files'].get('rtmp', vdata['files']['hds']) tp_link = 'http://link.theplatform.com/s/%s/%s' % (mpx_account, vid) video_id = vdata['id'] diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index 5efc5f4fe..3b1bd4033 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -12,7 +12,7 @@ from ..utils import ( class CNNIE(InfoExtractor): _VALID_URL = r'''(?x)https?://(?:(?:edition|www)\.)?cnn\.com/video/(?:data/.+?|\?)/ - (?P.+?/(?P[^/]+?)(?:\.(?:[a-z]{3,5})(?:-ap)?|(?=&)))''' + (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))''' _TESTS = [{ 'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index cecf917ff..49e4dc710 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -846,7 +846,7 @@ class InfoExtractor(object): def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, entry_protocol='m3u8', preference=None, - m3u8_id=None): + m3u8_id=None, note=None, errnote=None): formats = [{ 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])), @@ -865,8 +865,8 @@ class InfoExtractor(object): m3u8_doc = self._download_webpage( m3u8_url, video_id, - note='Downloading m3u8 information', - errnote='Failed to download m3u8 information') + note=note or 'Downloading m3u8 information', + errnote=errnote or 'Failed to download m3u8 information') last_info = None last_media = None kv_rex = re.compile( diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 1c77df47e..41f0c736d 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -76,8 +76,8 @@ class CrunchyrollIE(InfoExtractor): self._login() def _decrypt_subtitles(self, data, iv, id): - data = bytes_to_intlist(data) - iv = bytes_to_intlist(iv) + data = bytes_to_intlist(base64.b64decode(data.encode('utf-8'))) + iv = bytes_to_intlist(base64.b64decode(iv.encode('utf-8'))) id = int(id) def obfuscate_key_aux(count, modulo, start): @@ -179,6 +179,16 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text return output + def _extract_subtitles(self, subtitle): + sub_root = xml.etree.ElementTree.fromstring(subtitle) + return [{ + 'ext': 'srt', + 'data': self._convert_subtitles_to_srt(sub_root), + }, { + 'ext': 'ass', + 'data': self._convert_subtitles_to_ass(sub_root), + }] + def _get_subtitles(self, video_id, webpage): subtitles = {} for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage): @@ -190,25 +200,11 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text data = self._search_regex(r'<data>([^<]+)', sub_page, 'subtitle_data', fatal=False) if not id or not iv or not data: continue - id = int(id) - iv = base64.b64decode(iv) - data = base64.b64decode(data) - subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8') lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False) if not lang_code: continue - sub_root = xml.etree.ElementTree.fromstring(subtitle) - subtitles[lang_code] = [ - { - 'ext': 'srt', - 'data': self._convert_subtitles_to_srt(sub_root), - }, - { - 'ext': 'ass', - 'data': self._convert_subtitles_to_ass(sub_root), - }, - ] + subtitles[lang_code] = self._extract_subtitles(subtitle) return subtitles def _real_extract(self, url): diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index db10b8d00..70aa4333c 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -225,7 +225,7 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): class DailymotionUserIE(DailymotionPlaylistIE): IE_NAME = 'dailymotion:user' - _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?:old/)?user/(?P<user>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?:(?:old/)?user/)?(?P<user>[^/]+)$' _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s' _TESTS = [{ 'url': 'https://www.dailymotion.com/user/nqtv', @@ -239,7 +239,8 @@ class DailymotionUserIE(DailymotionPlaylistIE): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) user = mobj.group('user') - webpage = self._download_webpage(url, user) + webpage = self._download_webpage( + 'https://www.dailymotion.com/user/%s' % user, user) full_user = unescapeHTML(self._html_search_regex( r'<a class="nav-image" title="([^"]+)" href="/%s">' % re.escape(user), webpage, 'user')) diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py index d3e667528..d6723ecf2 100644 --- a/youtube_dl/extractor/discovery.py +++ b/youtube_dl/extractor/discovery.py @@ -2,19 +2,19 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + parse_duration, parse_iso8601, - int_or_none, ) +from ..compat import compat_str class DiscoveryIE(InfoExtractor): _VALID_URL = r'http://www\.discovery\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9_\-]*)(?:\.htm)?' - _TEST = { + _TESTS = [{ 'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mission-impossible-outtakes.htm', - 'md5': '3c69d77d9b0d82bfd5e5932a60f26504', 'info_dict': { - 'id': 'mission-impossible-outtakes', - 'ext': 'flv', + 'id': '20769', + 'ext': 'mp4', 'title': 'Mission Impossible Outtakes', 'description': ('Watch Jamie Hyneman and Adam Savage practice being' ' each other -- to the point of confusing Jamie\'s dog -- and ' @@ -24,22 +24,36 @@ class DiscoveryIE(InfoExtractor): 'timestamp': 1303099200, 'upload_date': '20110418', }, - } + 'params': { + 'skip_download': True, # requires ffmpeg + } + }, { + 'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mythbusters-the-simpsons', + 'info_dict': { + 'id': 'mythbusters-the-simpsons', + 'title': 'MythBusters: The Simpsons', + }, + 'playlist_count': 9, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + info = self._download_json(url + '?flat=1', video_id) - info = self._parse_json(self._search_regex( - r'(?s)<script type="application/ld\+json">(.*?)</script>', - webpage, 'video info'), video_id) + video_title = info.get('playlist_title') or info.get('video_title') - return { - 'id': video_id, - 'title': info['name'], - 'url': info['contentURL'], - 'description': info.get('description'), - 'thumbnail': info.get('thumbnailUrl'), - 'timestamp': parse_iso8601(info.get('uploadDate')), - 'duration': int_or_none(info.get('duration')), - } + entries = [{ + 'id': compat_str(video_info['id']), + 'formats': self._extract_m3u8_formats( + video_info['src'], video_id, ext='mp4', + note='Download m3u8 information for video %d' % (idx + 1)), + 'title': video_info['title'], + 'description': video_info.get('description'), + 'duration': parse_duration(video_info.get('video_length')), + 'webpage_url': video_info.get('href'), + 'thumbnail': video_info.get('thumbnailURL'), + 'alt_title': video_info.get('secondary_title'), + 'timestamp': parse_iso8601(video_info.get('publishedDate')), + } for idx, video_info in enumerate(info['playlist'])] + + return self.playlist_result(entries, video_id, video_title) diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index f25ab319e..baa24c6d1 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -1,8 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals -from .common import InfoExtractor, ExtractorError -from ..utils import parse_iso8601 +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + parse_iso8601, +) class DRTVIE(InfoExtractor): @@ -60,19 +63,31 @@ class DRTVIE(InfoExtractor): restricted_to_denmark = asset['RestrictedToDenmark'] spoken_subtitles = asset['Target'] == 'SpokenSubtitles' for link in asset['Links']: - target = link['Target'] uri = link['Uri'] + target = link['Target'] format_id = target - preference = -1 if target == 'HDS' else -2 + preference = None if spoken_subtitles: - preference -= 2 + preference = -1 format_id += '-spoken-subtitles' - formats.append({ - 'url': uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43' if target == 'HDS' else uri, - 'format_id': format_id, - 'ext': link['FileFormat'], - 'preference': preference, - }) + if target == 'HDS': + formats.extend(self._extract_f4m_formats( + uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43', + video_id, preference, f4m_id=format_id)) + elif target == 'HLS': + formats.extend(self._extract_m3u8_formats( + uri, video_id, 'mp4', preference=preference, + m3u8_id=format_id)) + else: + bitrate = link.get('Bitrate') + if bitrate: + format_id += '-%s' % bitrate + formats.append({ + 'url': uri, + 'format_id': format_id, + 'tbr': bitrate, + 'ext': link.get('FileFormat'), + }) subtitles_list = asset.get('SubtitlesList') if isinstance(subtitles_list, list): LANGS = { diff --git a/youtube_dl/extractor/empflix.py b/youtube_dl/extractor/empflix.py index 70f8efe27..4827022e0 100644 --- a/youtube_dl/extractor/empflix.py +++ b/youtube_dl/extractor/empflix.py @@ -4,22 +4,28 @@ from .tnaflix import TNAFlixIE class EMPFlixIE(TNAFlixIE): - _VALID_URL = r'^https?://www\.empflix\.com/videos/(?P<display_id>[0-9a-zA-Z-]+)-(?P<id>[0-9]+)\.html' + _VALID_URL = r'https?://(?:www\.)?empflix\.com/videos/(?P<display_id>.+?)-(?P<id>[0-9]+)\.html' _TITLE_REGEX = r'name="title" value="(?P<title>[^"]*)"' _DESCRIPTION_REGEX = r'name="description" value="([^"]*)"' _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"' - _TEST = { - 'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html', - 'md5': 'b1bc15b6412d33902d6e5952035fcabc', - 'info_dict': { - 'id': '33051', - 'display_id': 'Amateur-Finger-Fuck', - 'ext': 'mp4', - 'title': 'Amateur Finger Fuck', - 'description': 'Amateur solo finger fucking.', - 'thumbnail': 're:https?://.*\.jpg$', - 'age_limit': 18, + _TESTS = [ + { + 'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html', + 'md5': 'b1bc15b6412d33902d6e5952035fcabc', + 'info_dict': { + 'id': '33051', + 'display_id': 'Amateur-Finger-Fuck', + 'ext': 'mp4', + 'title': 'Amateur Finger Fuck', + 'description': 'Amateur solo finger fucking.', + 'thumbnail': 're:https?://.*\.jpg$', + 'age_limit': 18, + } + }, + { + 'url': 'http://www.empflix.com/videos/[AROMA][ARMD-718]-Aoi-Yoshino-Sawa-25826.html', + 'only_matching': True, } - } + ] diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 937b28fcc..82dc27bc6 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -50,7 +50,10 @@ class FacebookIE(InfoExtractor): 'id': '274175099429670', 'ext': 'mp4', 'title': 'Facebook video #274175099429670', - } + }, + 'expected_warnings': [ + 'title' + ] }, { 'url': 'https://www.facebook.com/video.php?v=10204634152394104', 'only_matching': True, @@ -149,12 +152,12 @@ class FacebookIE(InfoExtractor): raise ExtractorError('Cannot find video formats') video_title = self._html_search_regex( - r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, 'title', - fatal=False) + r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage, 'title', + default=None) if not video_title: video_title = self._html_search_regex( r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>', - webpage, 'alternative title', default=None) + webpage, 'alternative title', fatal=False) video_title = limit_length(video_title, 80) if not video_title: video_title = 'Facebook video #%s' % video_id diff --git a/youtube_dl/extractor/firedrive.py b/youtube_dl/extractor/firedrive.py deleted file mode 100644 index 3191116d9..000000000 --- a/youtube_dl/extractor/firedrive.py +++ /dev/null @@ -1,80 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, - compat_urllib_request, -) -from ..utils import ( - ExtractorError, -) - - -class FiredriveIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?firedrive\.com/' + \ - '(?:file|embed)/(?P<id>[0-9a-zA-Z]+)' - _FILE_DELETED_REGEX = r'<div class="removed_file_image">' - - _TESTS = [{ - 'url': 'https://www.firedrive.com/file/FEB892FA160EBD01', - 'md5': 'd5d4252f80ebeab4dc2d5ceaed1b7970', - 'info_dict': { - 'id': 'FEB892FA160EBD01', - 'ext': 'flv', - 'title': 'bbb_theora_486kbit.flv', - 'thumbnail': 're:^http://.*\.jpg$', - }, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - url = 'http://firedrive.com/file/%s' % video_id - webpage = self._download_webpage(url, video_id) - - if re.search(self._FILE_DELETED_REGEX, webpage) is not None: - raise ExtractorError('Video %s does not exist' % video_id, - expected=True) - - fields = dict(re.findall(r'''(?x)<input\s+ - type="hidden"\s+ - name="([^"]+)"\s+ - value="([^"]*)" - ''', webpage)) - - post = compat_urllib_parse.urlencode(fields) - req = compat_urllib_request.Request(url, post) - req.add_header('Content-type', 'application/x-www-form-urlencoded') - - # Apparently, this header is required for confirmation to work. - req.add_header('Host', 'www.firedrive.com') - - webpage = self._download_webpage(req, video_id, - 'Downloading video page') - - title = self._search_regex(r'class="external_title_left">(.+)</div>', - webpage, 'title') - thumbnail = self._search_regex(r'image:\s?"(//[^\"]+)', webpage, - 'thumbnail', fatal=False) - if thumbnail is not None: - thumbnail = 'http:' + thumbnail - - ext = self._search_regex(r'type:\s?\'([^\']+)\',', - webpage, 'extension', fatal=False) - video_url = self._search_regex( - r'file:\s?loadURL\(\'(http[^\']+)\'\),', webpage, 'file url') - - formats = [{ - 'format_id': 'sd', - 'url': video_url, - 'ext': ext, - }] - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'formats': formats, - } diff --git a/youtube_dl/extractor/fivetv.py b/youtube_dl/extractor/fivetv.py new file mode 100644 index 000000000..13fbc4da2 --- /dev/null +++ b/youtube_dl/extractor/fivetv.py @@ -0,0 +1,88 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import int_or_none + + +class FiveTVIE(InfoExtractor): + _VALID_URL = r'''(?x) + http:// + (?:www\.)?5-tv\.ru/ + (?: + (?:[^/]+/)+(?P<id>\d+)| + (?P<path>[^/?#]+)(?:[/?#])? + ) + ''' + + _TESTS = [{ + 'url': 'http://5-tv.ru/news/96814/', + 'md5': 'bbff554ad415ecf5416a2f48c22d9283', + 'info_dict': { + 'id': '96814', + 'ext': 'mp4', + 'title': 'Россияне выбрали имя для общенациональной платежной системы', + 'description': 'md5:a8aa13e2b7ad36789e9f77a74b6de660', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 180, + }, + }, { + 'url': 'http://5-tv.ru/video/1021729/', + 'info_dict': { + 'id': '1021729', + 'ext': 'mp4', + 'title': '3D принтер', + 'description': 'md5:d76c736d29ef7ec5c0cf7d7c65ffcb41', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 180, + }, + }, { + 'url': 'http://www.5-tv.ru/glavnoe/#itemDetails', + 'info_dict': { + 'id': 'glavnoe', + 'ext': 'mp4', + 'title': 'Итоги недели с 8 по 14 июня 2015 года', + 'thumbnail': 're:^https?://.*\.jpg$', + }, + }, { + 'url': 'http://www.5-tv.ru/glavnoe/broadcasts/508645/', + 'only_matching': True, + }, { + 'url': 'http://5-tv.ru/films/1507502/', + 'only_matching': True, + }, { + 'url': 'http://5-tv.ru/programs/broadcast/508713/', + 'only_matching': True, + }, { + 'url': 'http://5-tv.ru/angel/', + 'only_matching': True, + }, { + 'url': 'http://www.5-tv.ru/schedule/?iframe=true&width=900&height=450', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') or mobj.group('path') + + webpage = self._download_webpage(url, video_id) + + video_url = self._search_regex( + r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"', + webpage, 'video url') + + title = self._og_search_title(webpage, default=None) or self._search_regex( + r'<title>([^<]+)', webpage, 'title') + duration = int_or_none(self._og_search_property( + 'video:duration', webpage, 'duration', default=None)) + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'description': self._og_search_description(webpage, default=None), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'duration': duration, + } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 9a7b0d25d..f6b984300 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -9,6 +9,8 @@ from .common import InfoExtractor from .youtube import YoutubeIE from ..compat import ( compat_urllib_parse, + compat_urllib_parse_unquote, + compat_urllib_request, compat_urlparse, compat_xml_parse_error, ) @@ -32,6 +34,7 @@ from .brightcove import BrightcoveIE from .nbc import NBCSportsVPlayerIE from .ooyala import OoyalaIE from .rutv import RUTVIE +from .tvc import TVCIE from .sportbox import SportBoxEmbedIE from .smotri import SmotriIE from .condenast import CondeNastIE @@ -39,6 +42,7 @@ from .udn import UDNEmbedIE from .senateisvp import SenateISVPIE from .bliptv import BlipTVIE from .svt import SVTIE +from .pornhub import PornHubIE class GenericIE(InfoExtractor): @@ -46,6 +50,97 @@ class GenericIE(InfoExtractor): _VALID_URL = r'.*' IE_NAME = 'generic' _TESTS = [ + # Direct link to a video + { + 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4', + 'md5': '67d406c2bcb6af27fa886f31aa934bbe', + 'info_dict': { + 'id': 'trailer', + 'ext': 'mp4', + 'title': 'trailer', + 'upload_date': '20100513', + } + }, + # Direct link to media delivered compressed (until Accept-Encoding is *) + { + 'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac', + 'md5': '128c42e68b13950268b648275386fc74', + 'info_dict': { + 'id': 'FictionJunction-Parallel_Hearts', + 'ext': 'flac', + 'title': 'FictionJunction-Parallel_Hearts', + 'upload_date': '20140522', + }, + 'expected_warnings': [ + 'URL could be a direct video link, returning it as such.' + ] + }, + # Direct download with broken HEAD + { + 'url': 'http://ai-radio.org:8000/radio.opus', + 'info_dict': { + 'id': 'radio', + 'ext': 'opus', + 'title': 'radio', + }, + 'params': { + 'skip_download': True, # infinite live stream + }, + 'expected_warnings': [ + r'501.*Not Implemented' + ], + }, + # Direct link with incorrect MIME type + { + 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', + 'md5': '4ccbebe5f36706d85221f204d7eb5913', + 'info_dict': { + 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', + 'id': '5_Lennart_Poettering_-_Systemd', + 'ext': 'webm', + 'title': '5_Lennart_Poettering_-_Systemd', + 'upload_date': '20141120', + }, + 'expected_warnings': [ + 'URL could be a direct video link, returning it as such.' + ] + }, + # RSS feed + { + 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml', + 'info_dict': { + 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml', + 'title': 'Zero Punctuation', + 'description': 're:.*groundbreaking video review series.*' + }, + 'playlist_mincount': 11, + }, + # RSS feed with enclosure + { + 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', + 'info_dict': { + 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624', + 'ext': 'm4v', + 'upload_date': '20150228', + 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624', + } + }, + # google redirect + { + 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', + 'info_dict': { + 'id': 'cmQHVoWB5FY', + 'ext': 'mp4', + 'upload_date': '20130224', + 'uploader_id': 'TheVerge', + 'description': 're:^Chris Ziegler takes a look at the\.*', + 'uploader': 'The Verge', + 'title': 'First Firefox OS phones side-by-side', + }, + 'params': { + 'skip_download': False, + } + }, { 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html', 'md5': '85b90ccc9d73b4acd9138d3af4c27f89', @@ -125,17 +220,6 @@ class GenericIE(InfoExtractor): 'skip_download': True, # m3u8 download }, }, - # Direct link to a video - { - 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4', - 'md5': '67d406c2bcb6af27fa886f31aa934bbe', - 'info_dict': { - 'id': 'trailer', - 'ext': 'mp4', - 'title': 'trailer', - 'upload_date': '20100513', - } - }, # ooyala video { 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219', @@ -160,22 +244,6 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Ooyala'], }, - # google redirect - { - 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', - 'info_dict': { - 'id': 'cmQHVoWB5FY', - 'ext': 'mp4', - 'upload_date': '20130224', - 'uploader_id': 'TheVerge', - 'description': 're:^Chris Ziegler takes a look at the\.*', - 'uploader': 'The Verge', - 'title': 'First Firefox OS phones side-by-side', - }, - 'params': { - 'skip_download': False, - } - }, # embed.ly video { 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/', @@ -225,6 +293,15 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, + # TVC embed + { + 'url': 'http://sch1298sz.mskobr.ru/dou_edu/karamel_ki/filial_galleries/video/iframe_src_http_tvc_ru_video_iframe_id_55304_isplay_false_acc_video_id_channel_brand_id_11_show_episodes_episode_id_32307_frameb/', + 'info_dict': { + 'id': '55304', + 'ext': 'mp4', + 'title': 'Дошкольное воспитание', + }, + }, # SportBox embed { 'url': 'http://www.vestifinance.ru/articles/25753', @@ -407,16 +484,6 @@ class GenericIE(InfoExtractor): 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com', } }, - # RSS feed - { - 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml', - 'info_dict': { - 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml', - 'title': 'Zero Punctuation', - 'description': 're:.*groundbreaking video review series.*' - }, - 'playlist_mincount': 11, - }, # Multiple brightcove videos # https://github.com/rg3/youtube-dl/issues/2283 { @@ -470,21 +537,6 @@ class GenericIE(InfoExtractor): 'uploader': 'thoughtworks.wistia.com', }, }, - # Direct download with broken HEAD - { - 'url': 'http://ai-radio.org:8000/radio.opus', - 'info_dict': { - 'id': 'radio', - 'ext': 'opus', - 'title': 'radio', - }, - 'params': { - 'skip_download': True, # infinite live stream - }, - 'expected_warnings': [ - r'501.*Not Implemented' - ], - }, # Soundcloud embed { 'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/', @@ -516,21 +568,6 @@ class GenericIE(InfoExtractor): }, 'playlist_mincount': 2, }, - # Direct link with incorrect MIME type - { - 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', - 'md5': '4ccbebe5f36706d85221f204d7eb5913', - 'info_dict': { - 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', - 'id': '5_Lennart_Poettering_-_Systemd', - 'ext': 'webm', - 'title': '5_Lennart_Poettering_-_Systemd', - 'upload_date': '20141120', - }, - 'expected_warnings': [ - 'URL could be a direct video link, returning it as such.' - ] - }, # Cinchcast embed { 'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/', @@ -689,16 +726,6 @@ class GenericIE(InfoExtractor): 'age_limit': 0, }, }, - # RSS feed with enclosure - { - 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', - 'info_dict': { - 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624', - 'ext': 'm4v', - 'upload_date': '20150228', - 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624', - } - }, # Crooks and Liars embed { 'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists', @@ -773,6 +800,18 @@ class GenericIE(InfoExtractor): # rtmpe downloads 'skip_download': True, } + }, + # Brightcove URL in single quotes + { + 'url': 'http://www.sportsnet.ca/baseball/mlb/sn-presents-russell-martin-world-citizen/', + 'md5': '4ae374f1f8b91c889c4b9203c8c752af', + 'info_dict': { + 'id': '4255764656001', + 'ext': 'mp4', + 'title': 'SN Presents: Russell Martin, World Citizen', + 'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.', + 'uploader': 'Rogers Sportsnet', + }, } ] @@ -894,7 +933,7 @@ class GenericIE(InfoExtractor): force_videoid = smuggled_data['force_videoid'] video_id = force_videoid else: - video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0] + video_id = compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) self.to_screen('%s: Requesting header' % video_id) @@ -916,7 +955,9 @@ class GenericIE(InfoExtractor): full_response = None if head_response is False: - full_response = self._request_webpage(url, video_id) + request = compat_urllib_request.Request(url) + request.add_header('Accept-Encoding', '*') + full_response = self._request_webpage(request, video_id) head_response = full_response # Check for direct link to a video @@ -927,7 +968,7 @@ class GenericIE(InfoExtractor): head_response.headers.get('Last-Modified')) return { 'id': video_id, - 'title': os.path.splitext(url_basename(url))[0], + 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), 'direct': True, 'formats': [{ 'format_id': m.group('format_id'), @@ -941,7 +982,17 @@ class GenericIE(InfoExtractor): self._downloader.report_warning('Falling back on generic information extractor.') if not full_response: - full_response = self._request_webpage(url, video_id) + request = compat_urllib_request.Request(url) + # Some webservers may serve compressed content of rather big size (e.g. gzipped flac) + # making it impossible to download only chunk of the file (yet we need only 512kB to + # test whether it's HTML or not). According to youtube-dl default Accept-Encoding + # that will always result in downloading the whole file that is not desirable. + # Therefore for extraction pass we have to override Accept-Encoding to any in order + # to accept raw bytes and being able to download only a chunk. + # It may probably better to solve this by checking Content-Type for application/octet-stream + # after HEAD request finishes, but not sure if we can rely on this. + request.add_header('Accept-Encoding', '*') + full_response = self._request_webpage(request, video_id) # Maybe it's a direct link to a video? # Be careful not to download the whole thing! @@ -953,7 +1004,7 @@ class GenericIE(InfoExtractor): head_response.headers.get('Last-Modified')) return { 'id': video_id, - 'title': os.path.splitext(url_basename(url))[0], + 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), 'direct': True, 'url': url, 'upload_date': upload_date, @@ -1033,7 +1084,7 @@ class GenericIE(InfoExtractor): # Look for embedded rtl.nl player matches = re.findall( - r']+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"', webpage) if matches: return _playlist_from_matches(matches, ie='RtlNl') @@ -1261,11 +1312,27 @@ class GenericIE(InfoExtractor): if rutv_url: return self.url_result(rutv_url, 'RUTV') + # Look for embedded TVC player + tvc_url = TVCIE._extract_url(webpage) + if tvc_url: + return self.url_result(tvc_url, 'TVC') + # Look for embedded SportBox player sportbox_urls = SportBoxEmbedIE._extract_urls(webpage) if sportbox_urls: return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed') + # Look for embedded PornHub player + pornhub_url = PornHubIE._extract_url(webpage) + if pornhub_url: + return self.url_result(pornhub_url, 'PornHub') + + # Look for embedded Tvigle player + mobj = re.search( + r']+?src=(["\'])(?P(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'Tvigle') + # Look for embedded TED player mobj = re.search( r']+?src=(["\'])(?Phttps?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage) diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py index fe5d95e2c..d692ea79a 100644 --- a/youtube_dl/extractor/imgur.py +++ b/youtube_dl/extractor/imgur.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( int_or_none, js_to_json, @@ -12,7 +13,7 @@ from ..utils import ( class ImgurIE(InfoExtractor): - _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?P[a-zA-Z0-9]+)(?:\.mp4|\.gifv)?' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?P[a-zA-Z0-9]+)' _TESTS = [{ 'url': 'https://i.imgur.com/A61SaA1.gifv', @@ -34,7 +35,8 @@ class ImgurIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage( + compat_urlparse.urljoin(url, video_id), video_id) width = int_or_none(self._search_regex( r'[^?#]+)' + _VALID_URL = r'https?://play\.iprima\.cz/(?:[^/]+/)*(?P[^?#]+)' _TESTS = [{ 'url': 'http://play.iprima.cz/particka/particka-92', @@ -23,7 +24,7 @@ class IPrimaIE(InfoExtractor): 'id': '39152', 'ext': 'flv', 'title': 'Partička (92)', - 'description': 'md5:3740fda51464da35a2d4d0670b8e4fd6', + 'description': 'md5:74e9617e51bca67c3ecfb2c6f9766f45', 'thumbnail': 'http://play.iprima.cz/sites/default/files/image_crops/image_620x349/3/491483_particka-92_image_620x349.jpg', }, 'params': { @@ -35,13 +36,14 @@ class IPrimaIE(InfoExtractor): 'id': '9718337', 'ext': 'flv', 'title': 'Tchibo Partička - Jarní móda', - 'description': 'md5:589f8f59f414220621ff8882eb3ce7be', 'thumbnail': 're:^http:.*\.jpg$', }, 'params': { 'skip_download': True, # requires rtmpdump }, - 'skip': 'Do not have permission to access this page', + }, { + 'url': 'http://play.iprima.cz/zpravy-ftv-prima-2752015', + 'only_matching': True, }] def _real_extract(self, url): @@ -102,8 +104,10 @@ class IPrimaIE(InfoExtractor): return { 'id': real_id, - 'title': self._og_search_title(webpage), + 'title': remove_end(self._og_search_title(webpage), ' | Prima PLAY'), 'thumbnail': self._og_search_thumbnail(webpage), 'formats': formats, - 'description': self._og_search_description(webpage), + 'description': self._search_regex( + r']+itemprop="description"[^>]*>([^<]+)', + webpage, 'description', default=None), } diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py new file mode 100644 index 000000000..9106dd074 --- /dev/null +++ b/youtube_dl/extractor/iqiyi.py @@ -0,0 +1,296 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import hashlib +import math +import os.path +import random +import re +import time +import uuid +import zlib + +from .common import InfoExtractor +from ..compat import compat_urllib_parse +from ..utils import ( + ExtractorError, + url_basename, +) + + +class IqiyiIE(InfoExtractor): + IE_NAME = 'iqiyi' + + _VALID_URL = r'http://(?:www\.)iqiyi.com/v_.+?\.html' + + _TESTS = [{ + 'url': 'http://www.iqiyi.com/v_19rrojlavg.html', + 'md5': '2cb594dc2781e6c941a110d8f358118b', + 'info_dict': { + 'id': '9c1fb1b99d192b21c559e5a1a2cb3c73', + 'title': '美国德州空中惊现奇异云团 酷似UFO', + 'ext': 'f4v', + } + }, { + 'url': 'http://www.iqiyi.com/v_19rrhnnclk.html', + 'info_dict': { + 'id': 'e3f585b550a280af23c98b6cb2be19fb', + 'title': '名侦探柯南第752集', + }, + 'playlist': [{ + 'md5': '7e49376fecaffa115d951634917fe105', + 'info_dict': { + 'id': 'e3f585b550a280af23c98b6cb2be19fb_part1', + 'ext': 'f4v', + 'title': '名侦探柯南第752集', + }, + }, { + 'md5': '41b75ba13bb7ac0e411131f92bc4f6ca', + 'info_dict': { + 'id': 'e3f585b550a280af23c98b6cb2be19fb_part2', + 'ext': 'f4v', + 'title': '名侦探柯南第752集', + }, + }, { + 'md5': '0cee1dd0a3d46a83e71e2badeae2aab0', + 'info_dict': { + 'id': 'e3f585b550a280af23c98b6cb2be19fb_part3', + 'ext': 'f4v', + 'title': '名侦探柯南第752集', + }, + }, { + 'md5': '4f8ad72373b0c491b582e7c196b0b1f9', + 'info_dict': { + 'id': 'e3f585b550a280af23c98b6cb2be19fb_part4', + 'ext': 'f4v', + 'title': '名侦探柯南第752集', + }, + }, { + 'md5': 'd89ad028bcfad282918e8098e811711d', + 'info_dict': { + 'id': 'e3f585b550a280af23c98b6cb2be19fb_part5', + 'ext': 'f4v', + 'title': '名侦探柯南第752集', + }, + }, { + 'md5': '9cb1e5c95da25dff0660c32ae50903b7', + 'info_dict': { + 'id': 'e3f585b550a280af23c98b6cb2be19fb_part6', + 'ext': 'f4v', + 'title': '名侦探柯南第752集', + }, + }, { + 'md5': '155116e0ff1867bbc9b98df294faabc9', + 'info_dict': { + 'id': 'e3f585b550a280af23c98b6cb2be19fb_part7', + 'ext': 'f4v', + 'title': '名侦探柯南第752集', + }, + }, { + 'md5': '53f5db77622ae14fa493ed2a278a082b', + 'info_dict': { + 'id': 'e3f585b550a280af23c98b6cb2be19fb_part8', + 'ext': 'f4v', + 'title': '名侦探柯南第752集', + }, + }], + }] + + _FORMATS_MAP = [ + ('1', 'h6'), + ('2', 'h5'), + ('3', 'h4'), + ('4', 'h3'), + ('5', 'h2'), + ('10', 'h1'), + ] + + def construct_video_urls(self, data, video_id, _uuid): + def do_xor(x, y): + a = y % 3 + if a == 1: + return x ^ 121 + if a == 2: + return x ^ 72 + return x ^ 103 + + def get_encode_code(l): + a = 0 + b = l.split('-') + c = len(b) + s = '' + for i in range(c - 1, -1, -1): + a = do_xor(int(b[c - i - 1], 16), i) + s += chr(a) + return s[::-1] + + def get_path_key(x, format_id, segment_index): + mg = ')(*&^flash@#$%a' + tm = self._download_json( + 'http://data.video.qiyi.com/t?tn=' + str(random.random()), video_id, + note='Download path key of segment %d for format %s' % (segment_index + 1, format_id) + )['t'] + t = str(int(math.floor(int(tm) / (600.0)))) + return hashlib.md5((t + mg + x).encode('utf8')).hexdigest() + + video_urls_dict = {} + for format_item in data['vp']['tkl'][0]['vs']: + if 0 < int(format_item['bid']) <= 10: + format_id = self.get_format(format_item['bid']) + else: + continue + + video_urls = [] + + video_urls_info = format_item['fs'] + if not format_item['fs'][0]['l'].startswith('/'): + t = get_encode_code(format_item['fs'][0]['l']) + if t.endswith('mp4'): + video_urls_info = format_item['flvs'] + + for segment_index, segment in enumerate(video_urls_info): + vl = segment['l'] + if not vl.startswith('/'): + vl = get_encode_code(vl) + key = get_path_key( + vl.split('/')[-1].split('.')[0], format_id, segment_index) + filesize = segment['b'] + base_url = data['vp']['du'].split('/') + base_url.insert(-1, key) + base_url = '/'.join(base_url) + param = { + 'su': _uuid, + 'qyid': uuid.uuid4().hex, + 'client': '', + 'z': '', + 'bt': '', + 'ct': '', + 'tn': str(int(time.time())) + } + api_video_url = base_url + vl + '?' + \ + compat_urllib_parse.urlencode(param) + js = self._download_json( + api_video_url, video_id, + note='Download video info of segment %d for format %s' % (segment_index + 1, format_id)) + video_url = js['l'] + video_urls.append( + (video_url, filesize)) + + video_urls_dict[format_id] = video_urls + return video_urls_dict + + def get_format(self, bid): + matched_format_ids = [_format_id for _bid, _format_id in self._FORMATS_MAP if _bid == str(bid)] + return matched_format_ids[0] if len(matched_format_ids) else None + + def get_bid(self, format_id): + matched_bids = [_bid for _bid, _format_id in self._FORMATS_MAP if _format_id == format_id] + return matched_bids[0] if len(matched_bids) else None + + def get_raw_data(self, tvid, video_id, enc_key, _uuid): + tm = str(int(time.time())) + param = { + 'key': 'fvip', + 'src': hashlib.md5(b'youtube-dl').hexdigest(), + 'tvId': tvid, + 'vid': video_id, + 'vinfo': 1, + 'tm': tm, + 'enc': hashlib.md5( + (enc_key + tm + tvid).encode('utf8')).hexdigest(), + 'qyid': _uuid, + 'tn': random.random(), + 'um': 0, + 'authkey': hashlib.md5( + (tm + tvid).encode('utf8')).hexdigest() + } + + api_url = 'http://cache.video.qiyi.com/vms' + '?' + \ + compat_urllib_parse.urlencode(param) + raw_data = self._download_json(api_url, video_id) + return raw_data + + def get_enc_key(self, swf_url, video_id): + filename, _ = os.path.splitext(url_basename(swf_url)) + enc_key_json = self._downloader.cache.load('iqiyi-enc-key', filename) + if enc_key_json is not None: + return enc_key_json[0] + + req = self._request_webpage( + swf_url, video_id, note='download swf content') + cn = req.read() + cn = zlib.decompress(cn[8:]) + pt = re.compile(b'MixerRemote\x08(?P.+?)\$&vv') + enc_key = self._search_regex(pt, cn, 'enc_key').decode('utf8') + + self._downloader.cache.store('iqiyi-enc-key', filename, [enc_key]) + + return enc_key + + def _real_extract(self, url): + webpage = self._download_webpage( + url, 'temp_id', note='download video page') + tvid = self._search_regex( + r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid') + video_id = self._search_regex( + r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id') + swf_url = self._search_regex( + r'(http://[^\'"]+MainPlayer[^.]+\.swf)', webpage, 'swf player URL') + _uuid = uuid.uuid4().hex + + enc_key = self.get_enc_key(swf_url, video_id) + + raw_data = self.get_raw_data(tvid, video_id, enc_key, _uuid) + + if raw_data['code'] != 'A000000': + raise ExtractorError('Unable to load data. Error code: ' + raw_data['code']) + + if not raw_data['data']['vp']['tkl']: + raise ExtractorError('No support iQiqy VIP video') + + data = raw_data['data'] + + title = data['vi']['vn'] + + # generate video_urls_dict + video_urls_dict = self.construct_video_urls( + data, video_id, _uuid) + + # construct info + entries = [] + for format_id in video_urls_dict: + video_urls = video_urls_dict[format_id] + for i, video_url_info in enumerate(video_urls): + if len(entries) < i + 1: + entries.append({'formats': []}) + entries[i]['formats'].append( + { + 'url': video_url_info[0], + 'filesize': video_url_info[-1], + 'format_id': format_id, + 'preference': int(self.get_bid(format_id)) + } + ) + + for i in range(len(entries)): + self._sort_formats(entries[i]['formats']) + entries[i].update( + { + 'id': '%s_part%d' % (video_id, i + 1), + 'title': title, + } + ) + + if len(entries) > 1: + info = { + '_type': 'multi_video', + 'id': video_id, + 'title': title, + 'entries': entries, + } + else: + info = entries[0] + info['id'] = video_id + info['title'] = title + + return info diff --git a/youtube_dl/extractor/izlesene.py b/youtube_dl/extractor/izlesene.py index 99a1361f8..bc226fa67 100644 --- a/youtube_dl/extractor/izlesene.py +++ b/youtube_dl/extractor/izlesene.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote from ..utils import ( determine_ext, float_or_none, @@ -30,7 +31,7 @@ class IzleseneIE(InfoExtractor): 'description': 'md5:253753e2655dde93f59f74b572454f6d', 'thumbnail': 're:^http://.*\.jpg', 'uploader_id': 'pelikzzle', - 'timestamp': 1404302298, + 'timestamp': int, 'upload_date': '20140702', 'duration': 95.395, 'age_limit': 0, @@ -46,7 +47,7 @@ class IzleseneIE(InfoExtractor): 'description': 'Tarkan Dortmund 2006 Konseri', 'thumbnail': 're:^http://.*\.jpg', 'uploader_id': 'parlayankiz', - 'timestamp': 1163322193, + 'timestamp': int, 'upload_date': '20061112', 'duration': 253.666, 'age_limit': 0, @@ -67,9 +68,9 @@ class IzleseneIE(InfoExtractor): uploader = self._html_search_regex( r"adduserUsername\s*=\s*'([^']+)';", - webpage, 'uploader', fatal=False, default='') + webpage, 'uploader', fatal=False) timestamp = parse_iso8601(self._html_search_meta( - 'uploadDate', webpage, 'upload date', fatal=False)) + 'uploadDate', webpage, 'upload date')) duration = float_or_none(self._html_search_regex( r'"videoduration"\s*:\s*"([^"]+)"', @@ -86,8 +87,7 @@ class IzleseneIE(InfoExtractor): # Might be empty for some videos. streams = self._html_search_regex( - r'"qualitylevel"\s*:\s*"([^"]+)"', - webpage, 'streams', fatal=False, default='') + r'"qualitylevel"\s*:\s*"([^"]+)"', webpage, 'streams', default='') formats = [] if streams: @@ -95,15 +95,15 @@ class IzleseneIE(InfoExtractor): quality, url = re.search(r'\[(\w+)\](.+)', stream).groups() formats.append({ 'format_id': '%sp' % quality if quality else 'sd', - 'url': url, + 'url': compat_urllib_parse_unquote(url), 'ext': ext, }) else: stream_url = self._search_regex( - r'"streamurl"\s?:\s?"([^"]+)"', webpage, 'stream URL') + r'"streamurl"\s*:\s*"([^"]+)"', webpage, 'stream URL') formats.append({ 'format_id': 'sd', - 'url': stream_url, + 'url': compat_urllib_parse_unquote(stream_url), 'ext': ext, }) diff --git a/youtube_dl/extractor/karrierevideos.py b/youtube_dl/extractor/karrierevideos.py new file mode 100644 index 000000000..bed94bc93 --- /dev/null +++ b/youtube_dl/extractor/karrierevideos.py @@ -0,0 +1,96 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + fix_xml_ampersands, + float_or_none, + xpath_with_ns, + xpath_text, +) + + +class KarriereVideosIE(InfoExtractor): + _VALID_URL = r'http://(?:www\.)?karrierevideos\.at(?:/[^/]+)+/(?P[^/]+)' + _TESTS = [{ + 'url': 'http://www.karrierevideos.at/berufsvideos/mittlere-hoehere-schulen/altenpflegerin', + 'info_dict': { + 'id': '32c91', + 'ext': 'flv', + 'title': 'AltenpflegerIn', + 'description': 'md5:dbadd1259fde2159a9b28667cb664ae2', + 'thumbnail': 're:^http://.*\.png', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + # broken ampersands + 'url': 'http://www.karrierevideos.at/orientierung/vaeterkarenz-und-neue-chancen-fuer-muetter-baby-was-nun', + 'info_dict': { + 'id': '5sniu', + 'ext': 'flv', + 'title': 'Väterkarenz und neue Chancen für Mütter - "Baby - was nun?"', + 'description': 'md5:97092c6ad1fd7d38e9d6a5fdeb2bcc33', + 'thumbnail': 're:^http://.*\.png', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = (self._html_search_meta('title', webpage, default=None) or + self._search_regex(r'

([^<]+)

')) + + video_id = self._search_regex( + r'/config/video/(.+?)\.xml', webpage, 'video id') + playlist = self._download_xml( + 'http://www.karrierevideos.at/player-playlist.xml.php?p=%s' % video_id, + video_id, transform_source=fix_xml_ampersands) + + NS_MAP = { + 'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats' + } + + def ns(path): + return xpath_with_ns(path, NS_MAP) + + item = playlist.find('./tracklist/item') + video_file = xpath_text( + item, ns('./jwplayer:file'), 'video url', fatal=True) + streamer = xpath_text( + item, ns('./jwplayer:streamer'), 'streamer', fatal=True) + + uploader = xpath_text( + item, ns('./jwplayer:author'), 'uploader') + duration = float_or_none( + xpath_text(item, ns('./jwplayer:duration'), 'duration')) + + description = self._html_search_regex( + r'(?s)
(.+?)
', + webpage, 'description') + + thumbnail = self._html_search_meta( + 'thumbnail', webpage, 'thumbnail') + if thumbnail: + thumbnail = compat_urlparse.urljoin(url, thumbnail) + + return { + 'id': video_id, + 'url': streamer.replace('rtmpt', 'rtmp'), + 'play_path': 'mp4:%s' % video_file, + 'ext': 'flv', + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'duration': duration, + } diff --git a/youtube_dl/extractor/kickstarter.py b/youtube_dl/extractor/kickstarter.py index 7d4b57056..1d391e69f 100644 --- a/youtube_dl/extractor/kickstarter.py +++ b/youtube_dl/extractor/kickstarter.py @@ -28,6 +28,14 @@ class KickStarterIE(InfoExtractor): 'uploader': 'Pebble Technology', 'title': 'Pebble iOS Notifications', } + }, { + 'url': 'https://www.kickstarter.com/projects/1420158244/power-drive-2000/widget/video.html', + 'info_dict': { + 'id': '1420158244', + 'ext': 'mp4', + 'title': 'Power Drive 2000', + }, + 'expected_warnings': ['OpenGraph description'], }] def _real_extract(self, url): @@ -48,10 +56,15 @@ class KickStarterIE(InfoExtractor): 'title': title, } + thumbnail = self._og_search_thumbnail(webpage, default=None) + if thumbnail is None: + thumbnail = self._html_search_regex( + r']+class="[^"]+\s*poster\s*[^"]+"[^>]+src="([^"]+)"', + webpage, 'thumbnail image', fatal=False) return { 'id': video_id, 'url': video_url, 'title': title, 'description': self._og_search_description(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), + 'thumbnail': thumbnail, } diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/letv.py index 1484ac0d2..da896caf1 100644 --- a/youtube_dl/extractor/letv.py +++ b/youtube_dl/extractor/letv.py @@ -50,9 +50,7 @@ class LetvIE(InfoExtractor): 'title': '与龙共舞 完整版', 'description': 'md5:7506a5eeb1722bb9d4068f85024e3986', }, - 'params': { - 'cn_verification_proxy': 'http://proxy.uku.im:8888' - }, + 'skip': 'Only available in China', }] @staticmethod diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index d8897eb90..7091f3335 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -20,7 +20,6 @@ class MiTeleIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/', - 'md5': '6a75fe9d0d3275bead0cb683c616fddb', 'info_dict': { 'id': '0fce117d', 'ext': 'mp4', @@ -29,6 +28,10 @@ class MiTeleIE(InfoExtractor): 'display_id': 'programa-144', 'duration': 2913, }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }] def _real_extract(self, url): @@ -56,12 +59,14 @@ class MiTeleIE(InfoExtractor): episode, transform_source=strip_jsonp ) + formats = self._extract_m3u8_formats( + token_info['tokenizedUrl'], episode, ext='mp4') return { 'id': embed_data['videoId'], 'display_id': episode, 'title': info_el.find('title').text, - 'url': token_info['tokenizedUrl'], + 'formats': formats, 'description': get_element_by_attribute('class', 'text', webpage), 'thumbnail': info_el.find('thumb').text, 'duration': parse_duration(info_el.find('duration').text), diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index c10405f04..925967753 100644 --- a/youtube_dl/extractor/naver.py +++ b/youtube_dl/extractor/naver.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..compat import ( compat_urllib_parse, + compat_urlparse, ) from ..utils import ( ExtractorError, @@ -16,7 +17,7 @@ from ..utils import ( class NaverIE(InfoExtractor): _VALID_URL = r'https?://(?:m\.)?tvcast\.naver\.com/v/(?P\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://tvcast.naver.com/v/81652', 'info_dict': { 'id': '81652', @@ -25,7 +26,18 @@ class NaverIE(InfoExtractor): 'description': '합격불변의 법칙 메가스터디 | 메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.', 'upload_date': '20130903', }, - } + }, { + 'url': 'http://tvcast.naver.com/v/395837', + 'md5': '638ed4c12012c458fefcddfd01f173cd', + 'info_dict': { + 'id': '395837', + 'ext': 'mp4', + 'title': '9년이 지나도 아픈 기억, 전효성의 아버지', + 'description': 'md5:5bf200dcbf4b66eb1b350d1eb9c753f7', + 'upload_date': '20150519', + }, + 'skip': 'Georestricted', + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -35,7 +47,7 @@ class NaverIE(InfoExtractor): webpage) if m_id is None: m_error = re.search( - r'(?s)
\s*(?:)?\s*

(?P.+?)

\s*
', + r'(?s)
\s*(?:)?\s*

(?P.+?)

\s*
', webpage) if m_error: raise ExtractorError(clean_html(m_error.group('msg')), expected=True) @@ -58,14 +70,18 @@ class NaverIE(InfoExtractor): formats = [] for format_el in urls.findall('EncodingOptions/EncodingOption'): domain = format_el.find('Domain').text + uri = format_el.find('uri').text f = { - 'url': domain + format_el.find('uri').text, + 'url': compat_urlparse.urljoin(domain, uri), 'ext': 'mp4', 'width': int(format_el.find('width').text), 'height': int(format_el.find('height').text), } if domain.startswith('rtmp'): + # urlparse does not support custom schemes + # https://bugs.python.org/issue18828 f.update({ + 'url': domain + uri, 'ext': 'flv', 'rtmp_protocol': '1', # rtmpt }) diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index 862b706bf..944096e1c 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -22,6 +22,18 @@ class NBAIE(InfoExtractor): }, { 'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/', 'only_matching': True, + }, { + 'url': 'http://watch.nba.com/nba/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba', + 'info_dict': { + 'id': '0041400301-cle-atl-recap.nba', + 'ext': 'mp4', + 'title': 'NBA GAME TIME | Video: Hawks vs. Cavaliers Game 1', + 'description': 'md5:8094c3498d35a9bd6b1a8c396a071b4d', + 'duration': 228, + }, + 'params': { + 'skip_download': True, + } }] def _real_extract(self, url): @@ -35,8 +47,12 @@ class NBAIE(InfoExtractor): self._og_search_title(webpage, default=shortened_video_id), ' : NBA.com') description = self._og_search_description(webpage) - duration = parse_duration( - self._html_search_meta('duration', webpage, 'duration')) + duration_str = self._html_search_meta( + 'duration', webpage, 'duration', default=None) + if not duration_str: + duration_str = self._html_search_regex( + r'Duration:\s*(\d+:\d+)', webpage, 'duration', fatal=False) + duration = parse_duration(duration_str) return { 'id': shortened_video_id, diff --git a/youtube_dl/extractor/nextmedia.py b/youtube_dl/extractor/nextmedia.py index 02dba4ef6..d1b7cff4c 100644 --- a/youtube_dl/extractor/nextmedia.py +++ b/youtube_dl/extractor/nextmedia.py @@ -89,8 +89,8 @@ class NextMediaActionNewsIE(NextMediaIE): return self._extract_from_nextmedia_page(news_id, url, article_page) -class AppleDailyRealtimeNewsIE(NextMediaIE): - _VALID_URL = r'http://(www|ent).appledaily.com.tw/(realtimenews|enews)/[^/]+/[^/]+/(?P\d+)/(?P\d+)(/.*)?' +class AppleDailyIE(NextMediaIE): + _VALID_URL = r'http://(www|ent).appledaily.com.tw/(?:animation|appledaily|enews|realtimenews)/[^/]+/[^/]+/(?P\d+)/(?P\d+)(/.*)?' _TESTS = [{ 'url': 'http://ent.appledaily.com.tw/enews/article/entertainment/20150128/36354694', 'md5': 'a843ab23d150977cc55ef94f1e2c1e4d', @@ -99,7 +99,7 @@ class AppleDailyRealtimeNewsIE(NextMediaIE): 'ext': 'mp4', 'title': '周亭羽走過摩鐵陰霾2男陪吃 九把刀孤寒看醫生', 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'md5:b23787119933404ce515c6356a8c355c', + 'description': 'md5:2acd430e59956dc47cd7f67cb3c003f4', 'upload_date': '20150128', } }, { @@ -110,26 +110,10 @@ class AppleDailyRealtimeNewsIE(NextMediaIE): 'ext': 'mp4', 'title': '不滿被踩腳 山東兩大媽一路打下車', 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'md5:2648aaf6fc4f401f6de35a91d111aa1d', + 'description': 'md5:175b4260c1d7c085993474217e4ab1b4', 'upload_date': '20150128', } - }] - - _URL_PATTERN = r'\{url: \'(.+)\'\}' - - def _fetch_title(self, page): - return self._html_search_regex(r'

([^<>]+)

', page, 'news title') - - def _fetch_thumbnail(self, page): - return self._html_search_regex(r"setInitialImage\(\'([^']+)'\)", page, 'video thumbnail', fatal=False) - - def _fetch_timestamp(self, page): - return None - - -class AppleDailyAnimationNewsIE(AppleDailyRealtimeNewsIE): - _VALID_URL = 'http://www.appledaily.com.tw/animation/[^/]+/[^/]+/(?P\d+)/(?P\d+)(/.*)?' - _TESTS = [{ + }, { 'url': 'http://www.appledaily.com.tw/animation/realtimenews/new/20150128/5003671', 'md5': '03df296d95dedc2d5886debbb80cb43f', 'info_dict': { @@ -154,10 +138,22 @@ class AppleDailyAnimationNewsIE(AppleDailyRealtimeNewsIE): 'expected_warnings': [ 'video thumbnail', ] + }, { + 'url': 'http://www.appledaily.com.tw/appledaily/article/supplement/20140417/35770334/', + 'only_matching': True, }] + _URL_PATTERN = r'\{url: \'(.+)\'\}' + def _fetch_title(self, page): - return self._html_search_meta('description', page, 'news title') + return (self._html_search_regex(r'

([^<>]+)

', page, 'news title', default=None) or + self._html_search_meta('description', page, 'news title')) + + def _fetch_thumbnail(self, page): + return self._html_search_regex(r"setInitialImage\(\'([^']+)'\)", page, 'video thumbnail', fatal=False) + + def _fetch_timestamp(self, page): + return None def _fetch_description(self, page): return self._html_search_meta('description', page, 'news description') diff --git a/youtube_dl/extractor/nfl.py b/youtube_dl/extractor/nfl.py index 2684dd250..dc54634a5 100644 --- a/youtube_dl/extractor/nfl.py +++ b/youtube_dl/extractor/nfl.py @@ -19,7 +19,7 @@ class NFLIE(InfoExtractor): _VALID_URL = r'''(?x)https?:// (?P(?:www\.)?(?:nfl\.com|.*?\.clubs\.nfl\.com))/ (?:.+?/)* - (?P(?:\d[a-z]{2}\d{13}|\w{8}\-(?:\w{4}\-){3}\w{12}))''' + (?P(?:[a-z0-9]{16}|\w{8}\-(?:\w{4}\-){3}\w{12}))''' _TESTS = [ { 'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights', @@ -58,6 +58,10 @@ class NFLIE(InfoExtractor): 'upload_date': '20150202', }, }, + { + 'url': 'http://www.nfl.com/videos/nfl-network-top-ten/09000d5d810a6bd4/Top-10-Gutsiest-Performances-Jack-Youngblood', + 'only_matching': True, + } ] @staticmethod diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py index 664dc81d4..5bbd2dcf6 100644 --- a/youtube_dl/extractor/noco.py +++ b/youtube_dl/extractor/noco.py @@ -166,6 +166,10 @@ class NocoIE(InfoExtractor): self._sort_formats(formats) timestamp = parse_iso8601(show.get('online_date_start_utc'), ' ') + + if timestamp is not None and timestamp < 0: + timestamp = None + uploader = show.get('partner_name') uploader_id = show.get('partner_key') duration = float_or_none(show.get('duration_ms'), 1000) diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py new file mode 100644 index 000000000..3f9c776ef --- /dev/null +++ b/youtube_dl/extractor/nova.py @@ -0,0 +1,179 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + unified_strdate, +) + + +class NovaIE(InfoExtractor): + IE_DESC = 'TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz' + _VALID_URL = 'http://(?:[^.]+\.)?(?Ptv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P[^/]+?)(?:\.html|/|$)' + _TESTS = [{ + 'url': 'http://tvnoviny.nova.cz/clanek/novinky/co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou.html?utm_source=tvnoviny&utm_medium=cpfooter&utm_campaign=novaplus', + 'info_dict': { + 'id': '1608920', + 'display_id': 'co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou', + 'ext': 'flv', + 'title': 'Duel: Michal Hrdlička a Petr Suchoň', + 'description': 'md5:d0cc509858eee1b1374111c588c6f5d5', + 'thumbnail': 're:^https?://.*\.(?:jpg)', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + 'url': 'http://tn.nova.cz/clanek/tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci.html#player_13260', + 'md5': '1dd7b9d5ea27bc361f110cd855a19bd3', + 'info_dict': { + 'id': '1757139', + 'display_id': 'tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci', + 'ext': 'mp4', + 'title': 'Podzemní nemocnice v pražské Krči', + 'description': 'md5:f0a42dd239c26f61c28f19e62d20ef53', + 'thumbnail': 're:^https?://.*\.(?:jpg)', + } + }, { + 'url': 'http://novaplus.nova.cz/porad/policie-modrava/video/5591-policie-modrava-15-dil-blondynka-na-hrbitove', + 'info_dict': { + 'id': '1756825', + 'display_id': '5591-policie-modrava-15-dil-blondynka-na-hrbitove', + 'ext': 'flv', + 'title': 'Policie Modrava - 15. díl - Blondýnka na hřbitově', + 'description': 'md5:dc24e50be5908df83348e50d1431295e', # Make sure this description is clean of html tags + 'thumbnail': 're:^https?://.*\.(?:jpg)', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + 'url': 'http://novaplus.nova.cz/porad/televizni-noviny/video/5585-televizni-noviny-30-5-2015/', + 'info_dict': { + 'id': '1756858', + 'ext': 'flv', + 'title': 'Televizní noviny - 30. 5. 2015', + 'thumbnail': 're:^https?://.*\.(?:jpg)', + 'upload_date': '20150530', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + 'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html', + 'info_dict': { + 'id': '1753621', + 'ext': 'mp4', + 'title': 'Zaklínač 3: Divoký hon', + 'description': 're:.*Pokud se stejně jako my nemůžete.*', + 'thumbnail': 're:https?://.*\.jpg(\?.*)?', + 'upload_date': '20150521', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + 'url': 'http://sport.tn.nova.cz/clanek/sport/hokej/nhl/zivot-jde-dal-hodnotil-po-vyrazeni-z-playoff-jiri-sekac.html', + 'only_matching': True, + }, { + 'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html', + 'only_matching': True, + }, { + 'url': 'http://doma.nova.cz/clanek/zdravi/prijdte-se-zapsat-do-registru-kostni-drene-jiz-ve-stredu-3-cervna.html', + 'only_matching': True, + }, { + 'url': 'http://prask.nova.cz/clanek/novinky/co-si-na-sobe-nase-hvezdy-nechaly-pojistit.html', + 'only_matching': True, + }, { + 'url': 'http://tv.nova.cz/clanek/novinky/zivot-je-zivot-bondovsky-trailer.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('id') + site = mobj.group('site') + + webpage = self._download_webpage(url, display_id) + + video_id = self._search_regex( + [r"(?:media|video_id)\s*:\s*'(\d+)'", + r'media=(\d+)', + r'id="article_video_(\d+)"', + r'id="player_(\d+)"'], + webpage, 'video id') + + config_url = self._search_regex( + r'src="(http://tn\.nova\.cz/bin/player/videojs/config\.php\?[^"]+)"', + webpage, 'config url', default=None) + + if not config_url: + DEFAULT_SITE_ID = '23000' + SITES = { + 'tvnoviny': DEFAULT_SITE_ID, + 'novaplus': DEFAULT_SITE_ID, + 'vymena': DEFAULT_SITE_ID, + 'krasna': DEFAULT_SITE_ID, + 'fanda': '30', + 'tn': '30', + 'doma': '30', + } + + site_id = self._search_regex( + r'site=(\d+)', webpage, 'site id', default=None) or SITES.get(site, DEFAULT_SITE_ID) + + config_url = ('http://tn.nova.cz/bin/player/videojs/config.php?site=%s&media=%s&jsVar=vjsconfig' + % (site_id, video_id)) + + config = self._download_json( + config_url, display_id, + 'Downloading config JSON', + transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1]) + + mediafile = config['mediafile'] + video_url = mediafile['src'] + + m = re.search(r'^(?Prtmpe?://[^/]+/(?P[^/]+?))/&*(?P.+)$', video_url) + if m: + formats = [{ + 'url': m.group('url'), + 'app': m.group('app'), + 'play_path': m.group('playpath'), + 'player_path': 'http://tvnoviny.nova.cz/static/shared/app/videojs/video-js.swf', + 'ext': 'flv', + }] + else: + formats = [{ + 'url': video_url, + }] + self._sort_formats(formats) + + title = mediafile.get('meta', {}).get('title') or self._og_search_title(webpage) + description = clean_html(self._og_search_description(webpage, default=None)) + thumbnail = config.get('poster') + + if site == 'novaplus': + upload_date = unified_strdate(self._search_regex( + r'(\d{1,2}-\d{1,2}-\d{4})$', display_id, 'upload date', default=None)) + elif site == 'fanda': + upload_date = unified_strdate(self._search_regex( + r'(\d{1,2}\.\d{1,2}\.\d{4})', webpage, 'upload date', default=None)) + else: + upload_date = None + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'upload_date': upload_date, + 'thumbnail': thumbnail, + 'formats': formats, + } diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py new file mode 100644 index 000000000..173e46cd8 --- /dev/null +++ b/youtube_dl/extractor/nowtv.py @@ -0,0 +1,192 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + parse_iso8601, + parse_duration, + remove_start, +) + + +class NowTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?nowtv\.de/(?Prtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P.+?)/player' + + _TESTS = [{ + # rtl + 'url': 'http://www.nowtv.de/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/player', + 'info_dict': { + 'id': '203519', + 'display_id': 'bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit', + 'ext': 'mp4', + 'title': 'Die neuen Bauern und eine Hochzeit', + 'description': 'md5:e234e1ed6d63cf06be5c070442612e7e', + 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1432580700, + 'upload_date': '20150525', + 'duration': 2786, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # rtl2 + 'url': 'http://www.nowtv.de/rtl2/berlin-tag-nacht/berlin-tag-nacht-folge-934/player', + 'info_dict': { + 'id': '203481', + 'display_id': 'berlin-tag-nacht/berlin-tag-nacht-folge-934', + 'ext': 'mp4', + 'title': 'Berlin - Tag & Nacht (Folge 934)', + 'description': 'md5:c85e88c2e36c552dfe63433bc9506dd0', + 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1432666800, + 'upload_date': '20150526', + 'duration': 2641, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # rtlnitro + 'url': 'http://www.nowtv.de/rtlnitro/alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00/player', + 'info_dict': { + 'id': '165780', + 'display_id': 'alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00', + 'ext': 'mp4', + 'title': 'Hals- und Beinbruch', + 'description': 'md5:b50d248efffe244e6f56737f0911ca57', + 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1432415400, + 'upload_date': '20150523', + 'duration': 2742, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # superrtl + 'url': 'http://www.nowtv.de/superrtl/medicopter-117/angst/player', + 'info_dict': { + 'id': '99205', + 'display_id': 'medicopter-117/angst', + 'ext': 'mp4', + 'title': 'Angst!', + 'description': 'md5:30cbc4c0b73ec98bcd73c9f2a8c17c4e', + 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1222632900, + 'upload_date': '20080928', + 'duration': 3025, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # ntv + 'url': 'http://www.nowtv.de/ntv/ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch/player', + 'info_dict': { + 'id': '203521', + 'display_id': 'ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch', + 'ext': 'mp4', + 'title': 'Thema u.a.: Der erste Blick: Die Apple Watch', + 'description': 'md5:4312b6c9d839ffe7d8caf03865a531af', + 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1432751700, + 'upload_date': '20150527', + 'duration': 1083, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # vox + 'url': 'http://www.nowtv.de/vox/der-hundeprofi/buero-fall-chihuahua-joel/player', + 'info_dict': { + 'id': '128953', + 'display_id': 'der-hundeprofi/buero-fall-chihuahua-joel', + 'ext': 'mp4', + 'title': "Büro-Fall / Chihuahua 'Joel'", + 'description': 'md5:e62cb6bf7c3cc669179d4f1eb279ad8d', + 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1432408200, + 'upload_date': '20150523', + 'duration': 3092, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('id') + station = mobj.group('station') + + info = self._download_json( + 'https://api.nowtv.de/v3/movies/%s?fields=*,format,files' % display_id, + display_id) + + video_id = compat_str(info['id']) + + files = info['files'] + if not files: + if info.get('geoblocked', False): + raise ExtractorError( + 'Video %s is not available from your location due to geo restriction' % video_id, + expected=True) + if not info.get('free', True): + raise ExtractorError( + 'Video %s is not available for free' % video_id, expected=True) + + f = info.get('format', {}) + station = f.get('station') or station + + STATIONS = { + 'rtl': 'rtlnow', + 'rtl2': 'rtl2now', + 'vox': 'voxnow', + 'nitro': 'rtlnitronow', + 'ntv': 'n-tvnow', + 'superrtl': 'superrtlnow' + } + + formats = [] + for item in files['items']: + item_path = remove_start(item['path'], '/') + tbr = int_or_none(item['bitrate']) + m3u8_url = 'http://hls.fra.%s.de/hls-vod-enc/%s.m3u8' % (STATIONS[station], item_path) + m3u8_url = m3u8_url.replace('now/', 'now/videos/') + formats.append({ + 'url': m3u8_url, + 'format_id': '%s-%sk' % (item['id'], tbr), + 'ext': 'mp4', + 'tbr': tbr, + }) + self._sort_formats(formats) + + title = info['title'] + description = info.get('articleLong') or info.get('articleShort') + timestamp = parse_iso8601(info.get('broadcastStartDate'), ' ') + duration = parse_duration(info.get('duration')) + thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo') + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + } diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index fbc521d1a..6c7149fe3 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_urllib_parse from ..utils import ( unified_strdate, int_or_none, @@ -11,8 +12,9 @@ from ..utils import ( class OdnoklassnikiIE(InfoExtractor): - _VALID_URL = r'https?://(?:odnoklassniki|ok)\.ru/(?:video|web-api/video/moviePlayer)/(?P\d+)' + _VALID_URL = r'https?://(?:odnoklassniki|ok)\.ru/(?:video|web-api/video/moviePlayer)/(?P[\d-]+)' _TESTS = [{ + # metadata in JSON 'url': 'http://ok.ru/video/20079905452', 'md5': '8e24ad2da6f387948e7a7d44eb8668fe', 'info_dict': { @@ -20,11 +22,22 @@ class OdnoklassnikiIE(InfoExtractor): 'ext': 'mp4', 'title': 'Культура меняет нас (прекрасный ролик!))', 'duration': 100, - 'upload_date': '20141207', 'uploader_id': '330537914540', 'uploader': 'Виталий Добровольский', 'like_count': int, - 'age_limit': 0, + }, + }, { + # metadataUrl + 'url': 'http://ok.ru/video/63567059965189-0', + 'md5': '9676cf86eff5391d35dea675d224e131', + 'info_dict': { + 'id': '63567059965189-0', + 'ext': 'mp4', + 'title': 'Девушка без комплексов ...', + 'duration': 191, + 'uploader_id': '534380003155', + 'uploader': 'Андрей Мещанинов', + 'like_count': int, }, }, { 'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452', @@ -34,14 +47,23 @@ class OdnoklassnikiIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage( + 'http://ok.ru/video/%s' % video_id, video_id) player = self._parse_json( unescapeHTML(self._search_regex( r'data-attributes="([^"]+)"', webpage, 'player')), video_id) - metadata = self._parse_json(player['flashvars']['metadata'], video_id) + flashvars = player['flashvars'] + + metadata = flashvars.get('metadata') + if metadata: + metadata = self._parse_json(metadata, video_id) + else: + metadata = self._download_json( + compat_urllib_parse.unquote(flashvars['metadataUrl']), + video_id, 'Downloading metadata JSON') movie = metadata['movie'] title = movie['title'] @@ -53,11 +75,11 @@ class OdnoklassnikiIE(InfoExtractor): uploader = author.get('name') upload_date = unified_strdate(self._html_search_meta( - 'ya:ovs:upload_date', webpage, 'upload date')) + 'ya:ovs:upload_date', webpage, 'upload date', default=None)) age_limit = None adult = self._html_search_meta( - 'ya:ovs:adult', webpage, 'age limit') + 'ya:ovs:adult', webpage, 'age limit', default=None) if adult: age_limit = 18 if adult == 'true' else 0 diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py index f179ea200..6cdc2638b 100644 --- a/youtube_dl/extractor/patreon.py +++ b/youtube_dl/extractor/patreon.py @@ -87,7 +87,7 @@ class PatreonIE(InfoExtractor): r'
', webpage, 'attachment URL', default=None) embed = self._html_search_regex( - r'
\s*