From: Sergey M․ Date: Fri, 19 Jun 2015 17:00:00 +0000 (+0600) Subject: Merge branch 'pinkbike' of https://github.com/misterhat/youtube-dl into misterhat... X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=commitdiff_plain;h=c8e337450b63c3e3d93d1d6badf9475d0757690f;hp=680f9744c4e010ad5111c7711c58c341d5ba24dd;p=youtube-dl Merge branch 'pinkbike' of https://github.com/misterhat/youtube-dl into misterhat-pinkbike --- diff --git a/AUTHORS b/AUTHORS index ebed7ebb3..889d599a2 100644 --- a/AUTHORS +++ b/AUTHORS @@ -125,3 +125,6 @@ Roman Le Négrate Matthias Küch Julian Richen Ping O. +Mister Hat +Peter Ding +jackyzy823 diff --git a/README.md b/README.md index e51bb5343..5f3a08f5a 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ which means you can modify it, redistribute it or use it however you like. -i, --ignore-errors Continue on download errors, for example to skip unavailable videos in a playlist --abort-on-error Abort downloading of further videos (in the playlist or the command line) if an error occurs --dump-user-agent Display the current browser identification - --list-extractors List all supported extractors and the URLs they would handle + --list-extractors List all supported extractors --extractor-descriptions Output descriptions of all supported extractors --default-search PREFIX Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dl "large apple". Use the value "auto" to let youtube-dl guess ("auto_warning" to emit a warning when guessing). "error" just throws an error. The @@ -168,7 +168,7 @@ which means you can modify it, redistribute it or use it however you like. --no-progress Do not print progress bar --console-title Display progress in console titlebar -v, --verbose Print various debugging information - --dump-pages Print downloaded pages to debug problems (very verbose) + --dump-pages Print downloaded pages encoded using base64 to debug problems (very verbose) --write-pages Write downloaded intermediary pages to files in the current directory to debug problems --print-traffic Display sent and read HTTP traffic -C, --call-home Contact the youtube-dl server for debugging @@ -220,10 +220,10 @@ which means you can modify it, redistribute it or use it however you like. --embed-thumbnail Embed thumbnail in the audio as cover art --add-metadata Write metadata to the video file --metadata-from-title FORMAT Parse additional metadata like song title / artist from the video title. The format syntax is the same as --output, the parsed - parameters replace existing values. Additional templates: %(album), %(artist). Example: --metadata-from-title "%(artist)s - + parameters replace existing values. Additional templates: %(album)s, %(artist)s. Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like "Coldplay - Paradise" --xattrs Write metadata to the video file's xattrs (using dublin core and xdg standards) - --fixup POLICY Automatically correct known faults of the file. One of never (do nothing), warn (only emit a warning), detect_or_warn(the default; + --fixup POLICY Automatically correct known faults of the file. 
One of never (do nothing), warn (only emit a warning), detect_or_warn (the default; fix file if we can, warn otherwise) --prefer-avconv Prefer avconv over ffmpeg for running the postprocessors (default) --prefer-ffmpeg Prefer ffmpeg over avconv for running the postprocessors diff --git a/docs/supportedsites.md b/docs/supportedsites.md index a4879bd9a..220e52b98 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -10,6 +10,7 @@ - **56.com** - **5min** - **8tracks** + - **91porn** - **9gag** - **abc.net.au** - **Abc7News** @@ -26,8 +27,7 @@ - **anitube.se** - **AnySex** - **Aparat** - - **AppleDailyAnimationNews** - - **AppleDailyRealtimeNews** + - **AppleDaily** - **AppleTrailers** - **archive.org**: archive.org videos - **ARD** @@ -120,6 +120,8 @@ - **divxstage**: DivxStage - **Dotsub** - **DouyuTV** + - **dramafever** + - **dramafever:series** - **DRBonanza** - **Dropbox** - **DrTuber** @@ -152,8 +154,8 @@ - **fc2** - **fernsehkritik.tv** - **fernsehkritik.tv:postecke** - - **Firedrive** - **Firstpost** + - **FiveTV** - **Flickr** - **Folketinget**: Folketinget (ft.dk; Danish parliament) - **FootyRoom** @@ -218,6 +220,7 @@ - **instagram:user**: Instagram user profile - **InternetVideoArchive** - **IPrima** + - **iqiyi** - **ivi**: ivi.ru - **ivi:compilation**: ivi.ru compilations - **Izlesene** @@ -230,6 +233,7 @@ - **KanalPlay**: Kanal 5/9/11 Play - **Kankan** - **Karaoketv** + - **KarriereVideos** - **keek** - **KeezMovies** - **KhanAcademy** @@ -320,8 +324,10 @@ - **Noco** - **Normalboots** - **NosVideo** + - **Nova**: TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz - **novamov**: NovaMov - **Nowness** + - **NowTV** - **nowvideo**: NowVideo - **npo.nl** - **npo.nl:live** @@ -393,7 +399,6 @@ - **Rte** - **rtl.nl**: rtl.nl and rtlxl.nl - **RTL2** - - **RTLnow** - **RTP** - **RTS**: RTS.ch - **rtve.es:alacarta**: RTVE a la carta @@ -406,6 +411,7 @@ - **rutube:movie**: Rutube movies - **rutube:person**: Rutube person videos - **RUTV**: RUTV.RU + - **Ruutu** - **safari**: safaribooksonline.com online video - **safari:course**: safaribooksonline.com online courses - **Sandia**: Sandia National Laboratories @@ -431,8 +437,9 @@ - **smotri:community**: Smotri.com community videos - **smotri:user**: Smotri.com user videos - **Snotr** - - **Sockshare** - **Sohu** + - **soompi** + - **soompi:show** - **soundcloud** - **soundcloud:playlist** - **soundcloud:set** @@ -507,6 +514,7 @@ - **Trilulilu** - **TruTube** - **Tube8** + - **TubiTv** - **Tudou** - **Tumblr** - **TuneIn** @@ -516,6 +524,8 @@ - **TV2** - **TV2Article** - **TV4**: tv4.se and tv4play.se + - **TVC** + - **TVCArticle** - **tvigle**: Интернет-телевидение Tvigle.ru - **tvp.pl** - **tvp.pl:Series** @@ -564,6 +574,7 @@ - **vier:videos** - **Viewster** - **viki** + - **viki:channel** - **vimeo** - **vimeo:album** - **vimeo:channel** diff --git a/test/test_aes.py b/test/test_aes.py index 4dc7de7b5..315a3f5ae 100644 --- a/test/test_aes.py +++ b/test/test_aes.py @@ -39,7 +39,7 @@ class TestAES(unittest.TestCase): encrypted = base64.b64encode( intlist_to_bytes(self.iv[:8]) + b'\x17\x15\x93\xab\x8d\x80V\xcdV\xe0\t\xcdo\xc2\xa5\xd8ksM\r\xe27N\xae' - ) + ).decode('utf-8') decrypted = (aes_decrypt_text(encrypted, password, 16)) self.assertEqual(decrypted, self.secret_msg) @@ -47,7 +47,7 @@ class TestAES(unittest.TestCase): encrypted = base64.b64encode( intlist_to_bytes(self.iv[:8]) + b'\x0b\xe6\xa4\xd9z\x0e\xb8\xb9\xd0\xd4i_\x85\x1d\x99\x98_\xe5\x80\xe7.\xbf\xa5\x83' - ) + ).decode('utf-8') decrypted = 
(aes_decrypt_text(encrypted, password, 32)) self.assertEqual(decrypted, self.secret_msg) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 891ee620b..c4e3adb67 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -266,7 +266,7 @@ class TestNRKSubtitles(BaseTestSubtitles): self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['no'])) - self.assertEqual(md5(subtitles['no']), '1d221e6458c95c5494dcd38e6a1f129a') + self.assertEqual(md5(subtitles['no']), '544fa917d3197fcbee64634559221cc2') class TestRaiSubtitles(BaseTestSubtitles): diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index d1953c18f..6e4b6f566 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -49,6 +49,7 @@ from .utils import ( ExtractorError, format_bytes, formatSeconds, + HEADRequest, locked_file, make_HTTPS_handler, MaxDownloadsReached, @@ -118,7 +119,7 @@ class YoutubeDL(object): username: Username for authentication purposes. password: Password for authentication purposes. - videopassword: Password for acces a video. + videopassword: Password for accessing a video. usenetrc: Use netrc for authentication instead. verbose: Print additional info to stdout. quiet: Do not print messages to stdout. @@ -923,8 +924,9 @@ class YoutubeDL(object): if f.get('vcodec') != 'none' and f.get('acodec') != 'none'] if audiovideo_formats: return audiovideo_formats[format_idx] - # for audio only urls, select the best/worst audio format - elif all(f.get('acodec') != 'none' for f in available_formats): + # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format + elif (all(f.get('acodec') != 'none' for f in available_formats) or + all(f.get('vcodec') != 'none' for f in available_formats)): return available_formats[format_idx] elif format_spec == 'bestaudio': audio_formats = [ @@ -1014,13 +1016,13 @@ class YoutubeDL(object): info_dict['display_id'] = info_dict['id'] if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None: - # Working around negative timestamps in Windows - # (see http://bugs.python.org/issue1646728) - if info_dict['timestamp'] < 0 and os.name == 'nt': - info_dict['timestamp'] = 0 - upload_date = datetime.datetime.utcfromtimestamp( - info_dict['timestamp']) - info_dict['upload_date'] = upload_date.strftime('%Y%m%d') + # Working around out-of-range timestamp values (e.g. 
negative ones on Windows, + # see http://bugs.python.org/issue1646728) + try: + upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp']) + info_dict['upload_date'] = upload_date.strftime('%Y%m%d') + except (ValueError, OverflowError, OSError): + pass if self.params.get('listsubtitles', False): if 'automatic_captions' in info_dict: @@ -1031,12 +1033,6 @@ class YoutubeDL(object): info_dict['id'], info_dict.get('subtitles'), info_dict.get('automatic_captions')) - # This extractors handle format selection themselves - if info_dict['extractor'] in ['Youku']: - if download: - self.process_info(info_dict) - return info_dict - # We now pick which formats have to be downloaded if info_dict.get('formats') is None: # There's only one format available @@ -1047,6 +1043,8 @@ class YoutubeDL(object): if not formats: raise ExtractorError('No video formats found!') + formats_dict = {} + # We check that all the formats have the format and format_id fields for i, format in enumerate(formats): if 'url' not in format: @@ -1054,6 +1052,18 @@ class YoutubeDL(object): if format.get('format_id') is None: format['format_id'] = compat_str(i) + format_id = format['format_id'] + if format_id not in formats_dict: + formats_dict[format_id] = [] + formats_dict[format_id].append(format) + + # Make sure all formats have unique format_id + for format_id, ambiguous_formats in formats_dict.items(): + if len(ambiguous_formats) > 1: + for i, format in enumerate(ambiguous_formats): + format['format_id'] = '%s-%d' % (format_id, i) + + for i, format in enumerate(formats): if format.get('format') is None: format['format'] = '{id} - {res}{note}'.format( id=format['format_id'], @@ -1706,7 +1716,8 @@ class YoutubeDL(object): if req_is_string: req = url_escaped else: - req = compat_urllib_request.Request( + req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request + req = req_type( url_escaped, data=req.data, headers=req.headers, origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 80bec39da..6c548d8e9 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -112,6 +112,10 @@ from .dfb import DFBIE from .dhm import DHMIE from .dotsub import DotsubIE from .douyutv import DouyuTVIE +from .dramafever import ( + DramaFeverIE, + DramaFeverSeriesIE, +) from .dreisat import DreiSatIE from .drbonanza import DRBonanzaIE from .drtuber import DrTuberIE @@ -149,10 +153,10 @@ from .extremetube import ExtremeTubeIE from .facebook import FacebookIE from .faz import FazIE from .fc2 import FC2IE -from .firedrive import FiredriveIE from .firstpost import FirstpostIE from .firsttv import FirstTVIE from .fivemin import FiveMinIE +from .fivetv import FiveTVIE from .fktv import ( FKTVIE, FKTVPosteckeIE, @@ -230,6 +234,7 @@ from .infoq import InfoQIE from .instagram import InstagramIE, InstagramUserIE from .internetvideoarchive import InternetVideoArchiveIE from .iprima import IPrimaIE +from .iqiyi import IqiyiIE from .ivi import ( IviIE, IviCompilationIE @@ -353,8 +358,10 @@ from .ninegag import NineGagIE from .noco import NocoIE from .normalboots import NormalbootsIE from .nosvideo import NosVideoIE +from .nova import NovaIE from .novamov import NovaMovIE from .nowness import NownessIE +from .nowtv import NowTVIE from .nowvideo import NowVideoIE from .npo import ( NPOIE, @@ -402,6 +409,7 @@ from .playfm import PlayFMIE from .playvid import PlayvidIE from .playwire import 
PlaywireIE from .podomatic import PodomaticIE +from .porn91 import Porn91IE from .pornhd import PornHdIE from .pornhub import ( PornHubIE, @@ -439,7 +447,6 @@ from .roxwel import RoxwelIE from .rtbf import RTBFIE from .rte import RteIE from .rtlnl import RtlNlIE -from .rtlnow import RTLnowIE from .rtl2 import RTL2IE from .rtp import RTPIE from .rts import RTSIE @@ -453,6 +460,7 @@ from .rutube import ( RutubePersonIE, ) from .rutv import RUTVIE +from .ruutu import RuutuIE from .sandia import SandiaIE from .safari import ( SafariIE, @@ -481,8 +489,11 @@ from .smotri import ( SmotriBroadcastIE, ) from .snotr import SnotrIE -from .sockshare import SockshareIE from .sohu import SohuIE +from .soompi import ( + SoompiIE, + SoompiShowIE, +) from .soundcloud import ( SoundcloudIE, SoundcloudSetIE, @@ -568,6 +579,7 @@ from .traileraddict import TrailerAddictIE from .trilulilu import TriluliluIE from .trutube import TruTubeIE from .tube8 import Tube8IE +from .tubitv import TubiTvIE from .tudou import TudouIE from .tumblr import TumblrIE from .tunein import TuneInIE @@ -578,6 +590,10 @@ from .tv2 import ( TV2ArticleIE, ) from .tv4 import TV4IE +from .tvc import ( + TVCIE, + TVCArticleIE, +) from .tvigle import TvigleIE from .tvp import TvpIE, TvpSeriesIE from .tvplay import TVPlayIE diff --git a/youtube_dl/extractor/aftonbladet.py b/youtube_dl/extractor/aftonbladet.py index a117502bc..e0518cf26 100644 --- a/youtube_dl/extractor/aftonbladet.py +++ b/youtube_dl/extractor/aftonbladet.py @@ -6,11 +6,11 @@ from ..utils import int_or_none class AftonbladetIE(InfoExtractor): - _VALID_URL = r'http://tv\.aftonbladet\.se/webbtv.+?(?P<id>article[0-9]+)\.ab(?:$|[?#])' + _VALID_URL = r'http://tv\.aftonbladet\.se/abtv/articles/(?P<id>[0-9]+)' _TEST = { - 'url': 'http://tv.aftonbladet.se/webbtv/nyheter/vetenskap/rymden/article36015.ab', + 'url': 'http://tv.aftonbladet.se/abtv/articles/36015', 'info_dict': { - 'id': 'article36015', + 'id': '36015', 'ext': 'mp4', 'title': 'Vulkanutbrott i rymden - nu släpper NASA bilderna', 'description': 'Jupiters måne mest aktiv av alla himlakroppar', @@ -25,8 +25,9 @@ class AftonbladetIE(InfoExtractor): # find internal video meta data meta_url = 'http://aftonbladet-play.drlib.aptoma.no/video/%s.json' - internal_meta_id = self._html_search_regex( - r'data-aptomaId="([\w\d]+)"', webpage, 'internal_meta_id') + player_config = self._parse_json(self._html_search_regex( + r'data-player-config="([^"]+)"', webpage, 'player config'), video_id) + internal_meta_id = player_config['videoId'] internal_meta_url = meta_url % internal_meta_id internal_meta_json = self._download_json( internal_meta_url, video_id, 'Downloading video meta data') diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index 249bc6bbd..0305f88b5 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -129,6 +129,20 @@ class BBCCoUkIE(InfoExtractor): 'skip_download': True, }, 'skip': 'geolocation', + }, { + 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition', + 'info_dict': { + 'id': 'b05zmgw1', + 'ext': 'flv', + 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.', + 'title': 'Royal Academy Summer Exhibition', + 'duration': 3540, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'geolocation', }, { 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4', 'only_matching': True,
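[Editor's note: the format_id de-duplication pass added to youtube_dl/YoutubeDL.py earlier in this diff is self-contained enough to read in isolation. A minimal standalone sketch, using the same names as the diff on a hypothetical sample `formats` list:]

    # Formats sharing a format_id get an index suffix, so every
    # format_id ends up unique (mirrors the diff above).
    formats = [{'format_id': 'hd'}, {'format_id': 'hd'}, {'format_id': 'sd'}]

    formats_dict = {}
    for fmt in formats:
        formats_dict.setdefault(fmt['format_id'], []).append(fmt)

    for format_id, ambiguous_formats in formats_dict.items():
        if len(ambiguous_formats) > 1:
            for i, fmt in enumerate(ambiguous_formats):
                fmt['format_id'] = '%s-%d' % (format_id, i)

    print([f['format_id'] for f in formats])  # ['hd-0', 'hd-1', 'sd']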
@@ -267,7 +281,7 @@ class BBCCoUkIE(InfoExtractor): programme_id, 'Downloading media selection XML') except ExtractorError as ee: if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: - media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().encode('utf-8')) + media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().decode('utf-8')) else: raise @@ -362,7 +376,7 @@ class BBCCoUkIE(InfoExtractor): formats, subtitles = self._download_media_selector(programme_id) title = self._og_search_title(webpage) description = self._search_regex( - r'<p class="subtitle">([^<]+)</p>', + r'<p class="medium-description">([^<]+)</p>
', webpage, 'description', fatal=False) else: programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 7ca835e31..bf60450c2 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -3,6 +3,8 @@ from __future__ import unicode_literals import re import itertools +import json +import xml.etree.ElementTree as ET from .common import InfoExtractor from ..utils import ( @@ -67,11 +69,19 @@ class BiliBiliIE(InfoExtractor): entries = [] - lq_doc = self._download_xml( + lq_page = self._download_webpage( 'http://interface.bilibili.com/v_cdn_play?appkey=1&cid=%s' % cid, video_id, note='Downloading LQ video info' ) + try: + err_info = json.loads(lq_page) + raise ExtractorError( + 'BiliBili said: ' + err_info['error_text'], expected=True) + except ValueError: + pass + + lq_doc = ET.fromstring(lq_page) lq_durls = lq_doc.findall('./durl') hq_doc = self._download_xml( @@ -80,9 +90,11 @@ class BiliBiliIE(InfoExtractor): note='Downloading HQ video info', fatal=False, ) - hq_durls = hq_doc.findall('./durl') if hq_doc is not False else itertools.repeat(None) - - assert len(lq_durls) == len(hq_durls) + if hq_doc is not False: + hq_durls = hq_doc.findall('./durl') + assert len(lq_durls) == len(hq_durls) + else: + hq_durls = itertools.repeat(None) i = 1 for lq_durl, hq_durl in zip(lq_durls, hq_durls): @@ -93,7 +105,7 @@ class BiliBiliIE(InfoExtractor): 'filesize': int_or_none( lq_durl.find('./size'), get_attr='text'), }] - if hq_durl: + if hq_durl is not None: formats.append({ 'format_id': 'hq', 'quality': 2, diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 4f60d5366..d768f99e6 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -156,6 +156,28 @@ class BrightcoveIE(InfoExtractor): linkBase = find_param('linkBaseURL') if linkBase is not None: params['linkBaseURL'] = linkBase + return cls._make_brightcove_url(params) + + @classmethod + def _build_brighcove_url_from_js(cls, object_js): + # The layout of JS is as follows: + # customBC.createVideo = function (width, height, playerID, playerKey, videoPlayer, VideoRandomID) { + # // build Brightcove XML + # } + m = re.search( + r'''(?x)customBC.\createVideo\( + .*? 
# skipping width and height + ["\'](?P<playerID>\d+)["\']\s*,\s* # playerID + ["\'](?P<playerKey>AQ[^"\']{48})[^"\']*["\']\s*,\s* # playerKey begins with AQ and is 50 characters + # in length, however it's appended to itself + # in places, so truncate + ["\'](?P<videoPlayer>\d+)["\'] # @videoPlayer + ''', object_js) + if m: + return cls._make_brightcove_url(m.groupdict()) + + @classmethod + def _make_brightcove_url(cls, params): data = compat_urllib_parse.urlencode(params) return cls._FEDERATED_URL_TEMPLATE % data @@ -172,7 +194,7 @@ class BrightcoveIE(InfoExtractor): """Return a list of all Brightcove URLs from the webpage """ url_m = re.search( r'<meta\s+property="og:video"\s+content="(https?://(?:secure|c)\.brightcove.com/[^"]+)"', webpage) @@ -188,7 +210,12 @@ class BrightcoveIE(InfoExtractor): matches = re.findall( r'''(?sx)<object (?: [^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1 | [^>]*?>\s*<param\s+name="movie"\s+value="https?://[^/]*brightcove\.com/ ).+?>\s*</object>''', webpage) - return list(filter(None, [cls._build_brighcove_url(m) for m in matches])) + if matches: + return list(filter(None, [cls._build_brighcove_url(m) for m in matches])) + + return list(filter(None, [ + cls._build_brighcove_url_from_js(custom_bc) + for custom_bc in re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)])) def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index 1ceb9d8d9..75fffb156 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -4,12 +4,13 @@ from .common import InfoExtractor class CBSIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?cbs\.com/shows/[^/]+/(?:video|artist)/(?P<id>[^/]+)/.*' + _VALID_URL = r'https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/(?:video|artist)|colbertlateshow\.com/(?:video|podcasts))/[^/]+/(?P<id>[^/]+)' _TESTS = [{ 'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', 'info_dict': { 'id': '4JUVEwq3wUT7', + 'display_id': 'connect-chat-feat-garth-brooks', 'ext': 'flv', 'title': 'Connect Chat feat. Garth Brooks', 'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!', @@ -24,6 +25,7 @@ class CBSIE(InfoExtractor): 'url': 'http://www.cbs.com/shows/liveonletterman/artist/221752/st-vincent/', 'info_dict': { 'id': 'WWF_5KqY3PK1', + 'display_id': 'st-vincent', 'ext': 'flv', 'title': 'Live on Letterman - St. Vincent', 'description': 'Live On Letterman: St.
Vincent in concert from New York\'s Ed Sullivan Theater on Tuesday, July 16, 2014.', @@ -34,12 +36,23 @@ class CBSIE(InfoExtractor): 'skip_download': True, }, '_skip': 'Blocked outside the US', + }, { + 'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/', + 'only_matching': True, + }, { + 'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/', + 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) real_id = self._search_regex( - r"video\.settings\.pid\s*=\s*'([^']+)';", + [r"video\.settings\.pid\s*=\s*'([^']+)';", r"cbsplayer\.pid\s*=\s*'([^']+)';"], webpage, 'real video ID') - return self.url_result('theplatform:%s' % real_id) + return { + '_type': 'url_transparent', + 'ie_key': 'ThePlatform', + 'url': 'theplatform:%s' % real_id, + 'display_id': display_id, + } diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index cf0a7551b..c949a4814 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -60,6 +60,17 @@ class CinemassacreIE(InfoExtractor): 'uploader_id': 'Cinemassacre', 'title': 'AVGN: McKids', } + }, + { + 'url': 'http://cinemassacre.com/2015/05/25/mario-kart-64-nintendo-64-james-mike-mondays/', + 'md5': '1376908e49572389e7b06251a53cdd08', + 'info_dict': { + 'id': 'Cinemassacre-555779690c440', + 'ext': 'mp4', + 'description': 'Let’s Play Mario Kart 64 !! Mario Kart 64 is a classic go-kart racing game released for the Nintendo 64 (N64). Today James & Mike do 4 player Battle Mode with Kyle and Bootsy!', + 'title': 'Mario Kart 64 (Nintendo 64) James & Mike Mondays', + 'upload_date': '20150525', + } } ] @@ -72,7 +83,7 @@ class CinemassacreIE(InfoExtractor): playerdata_url = self._search_regex( [ - r'src="(http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=.+?)"', + r'src="(http://(?:player2\.screenwavemedia\.com|player\.screenwavemedia\.com/play)/[a-zA-Z]+\.php\?[^"]*\bid=.+?)"', r'<iframe[^>]+src="((?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"', ], webpage, 'player data URL', default=None) diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py index 3145b3051..5dd69bff7 100644 --- a/youtube_dl/extractor/cnet.py +++ b/youtube_dl/extractor/cnet.py @@ -11,7 +11,7 @@ from ..utils import ( class CNETIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?cnet\.com/videos/(?P<id>[^/]+)/' - _TEST = { + _TESTS = [{ 'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/', 'info_dict': { 'id': '56f4ea68-bd21-4852-b08c-4de5b8354c60', @@ -25,7 +25,20 @@ ... 'params': { 'skip_download': 'requires rtmpdump', } - } + }, { + 'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/', + 'info_dict': { + 'id': '56527b93-d25d-44e3-b738-f989ce2e49ba', + 'ext': 'flv', + 'description': 'Khail and Ashley wonder what other civic woes can be solved by self-tweeting objects, investigate a new kind of VR camera and watch an origami robot self-assemble, walk, climb, dig and dissolve.
#TDPothole', + 'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40', + 'uploader': 'Ashley Esqueda', + 'title': 'Whiny potholes tweet at local government when hit by cars (Tomorrow Daily 187)', + }, + 'params': { + 'skip_download': True, # requires rtmpdump + }, + }] def _real_extract(self, url): display_id = self._match_id(url) @@ -42,7 +55,7 @@ class CNETIE(InfoExtractor): raise ExtractorError('Cannot find video data') mpx_account = data['config']['players']['default']['mpx_account'] - vid = vdata['files']['rtmp'] + vid = vdata['files'].get('rtmp', vdata['files']['hds']) tp_link = 'http://link.theplatform.com/s/%s/%s' % (mpx_account, vid) video_id = vdata['id'] diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index cecf917ff..49e4dc710 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -846,7 +846,7 @@ class InfoExtractor(object): def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, entry_protocol='m3u8', preference=None, - m3u8_id=None): + m3u8_id=None, note=None, errnote=None): formats = [{ 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])), @@ -865,8 +865,8 @@ class InfoExtractor(object): m3u8_doc = self._download_webpage( m3u8_url, video_id, - note='Downloading m3u8 information', - errnote='Failed to download m3u8 information') + note=note or 'Downloading m3u8 information', + errnote=errnote or 'Failed to download m3u8 information') last_info = None last_media = None kv_rex = re.compile( diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 1c77df47e..41f0c736d 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -76,8 +76,8 @@ class CrunchyrollIE(InfoExtractor): self._login() def _decrypt_subtitles(self, data, iv, id): - data = bytes_to_intlist(data) - iv = bytes_to_intlist(iv) + data = bytes_to_intlist(base64.b64decode(data.encode('utf-8'))) + iv = bytes_to_intlist(base64.b64decode(iv.encode('utf-8'))) id = int(id) def obfuscate_key_aux(count, modulo, start): @@ -179,6 +179,16 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text return output + def _extract_subtitles(self, subtitle): + sub_root = xml.etree.ElementTree.fromstring(subtitle) + return [{ + 'ext': 'srt', + 'data': self._convert_subtitles_to_srt(sub_root), + }, { + 'ext': 'ass', + 'data': self._convert_subtitles_to_ass(sub_root), + }] + def _get_subtitles(self, video_id, webpage): subtitles = {} for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage): sub_page = self._download_webpage( 'http://www.crunchyroll.com/xml/?req=RpcApiSubtitle_GetXml&subtitle_script_id=' + sub_id, video_id, note='Downloading subtitles for ' + sub_name) id = self._search_regex(r'id=\'([0-9]+)', sub_page, 'subtitle_id', fatal=False) iv = self._search_regex(r'<iv>([^<]+)', sub_page, 'subtitle_iv', fatal=False) data = self._search_regex(r'<data>([^<]+)', sub_page, 'subtitle_data', fatal=False) if not id or not iv or not data: continue - id = int(id) - iv = base64.b64decode(iv) - data = base64.b64decode(data) subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8') lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False) if not lang_code: continue - sub_root = xml.etree.ElementTree.fromstring(subtitle) - subtitles[lang_code] = [ - { - 'ext': 'srt', - 'data': self._convert_subtitles_to_srt(sub_root), - }, - { - 'ext': 'ass', - 'data': self._convert_subtitles_to_ass(sub_root), - }, - ] + subtitles[lang_code] = self._extract_subtitles(subtitle) return subtitles def _real_extract(self, url): diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index db10b8d00..70aa4333c 100644 ---
a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -225,7 +225,7 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): class DailymotionUserIE(DailymotionPlaylistIE): IE_NAME = 'dailymotion:user' - _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?:old/)?user/(?P<user>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?:(?:old/)?user/)?(?P<user>[^/]+)$' _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s' _TESTS = [{ 'url': 'https://www.dailymotion.com/user/nqtv', @@ -239,7 +239,8 @@ class DailymotionUserIE(DailymotionPlaylistIE): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) user = mobj.group('user') - webpage = self._download_webpage(url, user) + webpage = self._download_webpage( + 'https://www.dailymotion.com/user/%s' % user, user) full_user = unescapeHTML(self._html_search_regex( r'<a class="nav-image" title="([^"]+)" href="/%s">' % re.escape(user), webpage, 'user')) diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py index d3e667528..d6723ecf2 100644 --- a/youtube_dl/extractor/discovery.py +++ b/youtube_dl/extractor/discovery.py @@ -2,19 +2,19 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + parse_duration, parse_iso8601, - int_or_none, ) +from ..compat import compat_str class DiscoveryIE(InfoExtractor): _VALID_URL = r'http://www\.discovery\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9_\-]*)(?:\.htm)?' - _TEST = { + _TESTS = [{ 'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mission-impossible-outtakes.htm', - 'md5': '3c69d77d9b0d82bfd5e5932a60f26504', 'info_dict': { - 'id': 'mission-impossible-outtakes', - 'ext': 'flv', + 'id': '20769', + 'ext': 'mp4', 'title': 'Mission Impossible Outtakes', 'description': ('Watch Jamie Hyneman and Adam Savage practice being' ' each other -- to the point of confusing Jamie\'s dog -- and ' @@ -24,22 +24,36 @@ class DiscoveryIE(InfoExtractor): 'timestamp': 1303099200, 'upload_date': '20110418', }, - } + 'params': { + 'skip_download': True, # requires ffmpeg + } + }, { + 'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mythbusters-the-simpsons', + 'info_dict': { + 'id': 'mythbusters-the-simpsons', + 'title': 'MythBusters: The Simpsons', + }, + 'playlist_count': 9, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + info = self._download_json(url + '?flat=1', video_id) - info = self._parse_json(self._search_regex( - r'(?s)<script type="application/ld\+json">(.*?)</script>', - webpage, 'video info'), video_id) + video_title = info.get('playlist_title') or info.get('video_title') - return { - 'id': video_id, - 'title': info['name'], - 'url': info['contentURL'], - 'description': info.get('description'), - 'thumbnail': info.get('thumbnailUrl'), - 'timestamp': parse_iso8601(info.get('uploadDate')), - 'duration': int_or_none(info.get('duration')), - } + entries = [{ + 'id': compat_str(video_info['id']), + 'formats': self._extract_m3u8_formats( + video_info['src'], video_id, ext='mp4', + note='Download m3u8 information for video %d' % (idx + 1)), + 'title': video_info['title'], + 'description': video_info.get('description'), + 'duration': parse_duration(video_info.get('video_length')), + 'webpage_url': video_info.get('href'), + 'thumbnail': video_info.get('thumbnailURL'), + 'alt_title': video_info.get('secondary_title'), + 'timestamp': parse_iso8601(video_info.get('publishedDate')), + } for idx, video_info in enumerate(info['playlist'])] + + return self.playlist_result(entries, video_id,
video_title) diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py new file mode 100644 index 000000000..ca41a3abf --- /dev/null +++ b/youtube_dl/extractor/dramafever.py @@ -0,0 +1,197 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import itertools + +from .common import InfoExtractor +from ..compat import ( + compat_HTTPError, + compat_urllib_parse, + compat_urllib_request, + compat_urlparse, +) +from ..utils import ( + ExtractorError, + clean_html, + determine_ext, + int_or_none, + parse_iso8601, +) + + +class DramaFeverBaseIE(InfoExtractor): + _LOGIN_URL = 'https://www.dramafever.com/accounts/login/' + _NETRC_MACHINE = 'dramafever' + + def _real_initialize(self): + self._login() + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + + login_form = { + 'username': username, + 'password': password, + } + + request = compat_urllib_request.Request( + self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) + response = self._download_webpage( + request, None, 'Logging in as %s' % username) + + if all(logout_pattern not in response + for logout_pattern in ['href="/accounts/logout/"', '>Log out<']): + error = self._html_search_regex( + r'(?s)class="hidden-xs prompt"[^>]*>(.+?)<', + response, 'error message', default=None) + if error: + raise ExtractorError('Unable to login: %s' % error, expected=True) + raise ExtractorError('Unable to log in') + + +class DramaFeverIE(DramaFeverBaseIE): + IE_NAME = 'dramafever' + _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P<id>[0-9]+/[0-9]+)(?:/|$)' + _TEST = { + 'url': 'http://www.dramafever.com/drama/4512/1/Cooking_with_Shin/', + 'info_dict': { + 'id': '4512.1', + 'ext': 'flv', + 'title': 'Cooking with Shin 4512.1', + 'description': 'md5:a8eec7942e1664a6896fcd5e1287bfd0', + 'thumbnail': 're:^https?://.*\.jpg', + 'timestamp': 1404336058, + 'upload_date': '20140702', + 'duration': 343, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url).replace('/', '.') + + try: + feed = self._download_json( + 'http://www.dramafever.com/amp/episode/feed.json?guid=%s' % video_id, + video_id, 'Downloading episode JSON')['channel']['item'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError): + raise ExtractorError( + 'Currently unavailable in your country.', expected=True) + raise + + media_group = feed.get('media-group', {}) + + formats = [] + for media_content in media_group['media-content']: + src = media_content.get('@attributes', {}).get('url') + if not src: + continue + ext = determine_ext(src) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + src, video_id, f4m_id='hds')) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + src, video_id, 'mp4', m3u8_id='hls')) + else: + formats.append({ + 'url': src, + }) + self._sort_formats(formats) + + title = media_group.get('media-title') + description = media_group.get('media-description') + duration = int_or_none(media_group['media-content'][0].get('@attributes', {}).get('duration')) + thumbnail = self._proto_relative_url( + media_group.get('media-thumbnail', {}).get('@attributes', {}).get('url')) + timestamp = parse_iso8601(feed.get('pubDate'), ' ') + + subtitles = {} + for media_subtitle in media_group.get('media-subTitle', []): + lang = media_subtitle.get('@attributes', {}).get('lang') + href = media_subtitle.get('@attributes', {}).get('href') + if not lang or not href: + continue + subtitles[lang] = [{ + 'ext': 'ttml',
'url': href, + }] + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + } + + +class DramaFeverSeriesIE(DramaFeverBaseIE): + IE_NAME = 'dramafever:series' + _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P<id>[0-9]+)(?:/(?:(?!\d+(?:/|$)).+)?)?$' + _TESTS = [{ + 'url': 'http://www.dramafever.com/drama/4512/Cooking_with_Shin/', + 'info_dict': { + 'id': '4512', + 'title': 'Cooking with Shin', + 'description': 'md5:84a3f26e3cdc3fb7f500211b3593b5c1', + }, + 'playlist_count': 4, + }, { + 'url': 'http://www.dramafever.com/drama/124/IRIS/', + 'info_dict': { + 'id': '124', + 'title': 'IRIS', + 'description': 'md5:b3a30e587cf20c59bd1c01ec0ee1b862', + }, + 'playlist_count': 20, + }] + + _CONSUMER_SECRET = 'DA59dtVXYLxajktV' + _PAGE_SIZE = 60 # max is 60 (see http://api.drama9.com/#get--api-4-episode-series-) + + def _get_consumer_secret(self, video_id): + mainjs = self._download_webpage( + 'http://www.dramafever.com/static/51afe95/df2014/scripts/main.js', + video_id, 'Downloading main.js', fatal=False) + if not mainjs: + return self._CONSUMER_SECRET + return self._search_regex( + r"var\s+cs\s*=\s*'([^']+)'", mainjs, + 'consumer secret', default=self._CONSUMER_SECRET) + + def _real_extract(self, url): + series_id = self._match_id(url) + + consumer_secret = self._get_consumer_secret(series_id) + + series = self._download_json( + 'http://www.dramafever.com/api/4/series/query/?cs=%s&series_id=%s' + % (consumer_secret, series_id), + series_id, 'Downloading series JSON')['series'][series_id] + + title = clean_html(series['name']) + description = clean_html(series.get('description') or series.get('description_short')) + + entries = [] + for page_num in itertools.count(1): + episodes = self._download_json( + 'http://www.dramafever.com/api/4/episode/series/?cs=%s&series_id=%s&page_size=%d&page_number=%d' + % (consumer_secret, series_id, self._PAGE_SIZE, page_num), + series_id, 'Downloading episodes JSON page #%d' % page_num) + for episode in episodes.get('value', []): + episode_url = episode.get('episode_url') + if not episode_url: + continue + entries.append(self.url_result( + compat_urlparse.urljoin(url, episode_url), + 'DramaFever', episode.get('guid'))) + if page_num == episodes['num_pages']: + break + + return self.playlist_result(entries, series_id, title, description) diff --git a/youtube_dl/extractor/empflix.py b/youtube_dl/extractor/empflix.py index 9a5a8f4bb..4827022e0 100644 --- a/youtube_dl/extractor/empflix.py +++ b/youtube_dl/extractor/empflix.py @@ -26,6 +26,6 @@ class EMPFlixIE(TNAFlixIE): }, { 'url': 'http://www.empflix.com/videos/[AROMA][ARMD-718]-Aoi-Yoshino-Sawa-25826.html', - 'matching_only': True, + 'only_matching': True, } ] diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 937b28fcc..82dc27bc6 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -50,7 +50,10 @@ class FacebookIE(InfoExtractor): 'id': '274175099429670', 'ext': 'mp4', 'title': 'Facebook video #274175099429670', - } + }, + 'expected_warnings': [ + 'title' + ] }, { 'url': 'https://www.facebook.com/video.php?v=10204634152394104', 'only_matching': True, @@ -149,12 +152,12 @@ class FacebookIE(InfoExtractor): raise ExtractorError('Cannot find video formats') video_title = self._html_search_regex( - r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, 'title', - fatal=False) + r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage, 'title', + default=None) if not video_title: video_title = self._html_search_regex( r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>', - webpage, 'alternative title', default=None) + webpage, 'alternative title', fatal=False) video_title = limit_length(video_title, 80) if not video_title: video_title = 'Facebook video #%s' % video_id diff --git a/youtube_dl/extractor/firedrive.py b/youtube_dl/extractor/firedrive.py deleted file mode 100644 index 3191116d9..000000000 --- a/youtube_dl/extractor/firedrive.py +++ /dev/null @@ -1,80 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, - compat_urllib_request, -) -from ..utils import ( - ExtractorError, -) - - -class FiredriveIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?firedrive\.com/' + \ - '(?:file|embed)/(?P<id>[0-9a-zA-Z]+)' - _FILE_DELETED_REGEX = r'<div class="removed_file_image">
' - - _TESTS = [{ - 'url': 'https://www.firedrive.com/file/FEB892FA160EBD01', - 'md5': 'd5d4252f80ebeab4dc2d5ceaed1b7970', - 'info_dict': { - 'id': 'FEB892FA160EBD01', - 'ext': 'flv', - 'title': 'bbb_theora_486kbit.flv', - 'thumbnail': 're:^http://.*\.jpg$', - }, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - url = 'http://firedrive.com/file/%s' % video_id - webpage = self._download_webpage(url, video_id) - - if re.search(self._FILE_DELETED_REGEX, webpage) is not None: - raise ExtractorError('Video %s does not exist' % video_id, - expected=True) - - fields = dict(re.findall(r'''(?x)<input\s+ - type="hidden"\s+ - name="([^"]+)"\s+ - value="([^"]*)" - ''', webpage)) - - post = compat_urllib_parse.urlencode(fields) - req = compat_urllib_request.Request(url, post) - req.add_header('Content-type', 'application/x-www-form-urlencoded') - - webpage = self._download_webpage(req, video_id, - 'Downloading video page') - - title = self._search_regex(r'<div class="external_title_left">(.+)</div>', - webpage, 'title') - thumbnail = self._search_regex(r'image:\s?"(//[^\"]+)', webpage, - 'thumbnail', fatal=False) - if thumbnail is not None: - thumbnail = 'http:' + thumbnail - - ext = self._search_regex(r'type:\s?\'([^\']+)\',', - webpage, 'extension', fatal=False) - video_url = self._search_regex( - r'file:\s?loadURL\(\'(http[^\']+)\'\),', webpage, 'file url') - - formats = [{ - 'format_id': 'sd', - 'url': video_url, - 'ext': ext, - }] - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'formats': formats, - } diff --git a/youtube_dl/extractor/fivetv.py b/youtube_dl/extractor/fivetv.py new file mode 100644 index 000000000..13fbc4da2 --- /dev/null +++ b/youtube_dl/extractor/fivetv.py @@ -0,0 +1,88 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import int_or_none + + +class FiveTVIE(InfoExtractor): + _VALID_URL = r'''(?x) + http:// + (?:www\.)?5-tv\.ru/ + (?: + (?:[^/]+/)+(?P<id>\d+)| + (?P<path>[^/?#]+)(?:[/?#])? + ) + ''' + + _TESTS = [{ + 'url': 'http://5-tv.ru/news/96814/', + 'md5': 'bbff554ad415ecf5416a2f48c22d9283', + 'info_dict': { + 'id': '96814', + 'ext': 'mp4', + 'title': 'Россияне выбрали имя для общенациональной платежной системы', + 'description': 'md5:a8aa13e2b7ad36789e9f77a74b6de660', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 180, + }, + }, { + 'url': 'http://5-tv.ru/video/1021729/', + 'info_dict': { + 'id': '1021729', + 'ext': 'mp4', + 'title': '3D принтер', + 'description': 'md5:d76c736d29ef7ec5c0cf7d7c65ffcb41', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 180, + }, + }, { + 'url': 'http://www.5-tv.ru/glavnoe/#itemDetails', + 'info_dict': { + 'id': 'glavnoe', + 'ext': 'mp4', + 'title': 'Итоги недели с 8 по 14 июня 2015 года', + 'thumbnail': 're:^https?://.*\.jpg$', + }, + }, { + 'url': 'http://www.5-tv.ru/glavnoe/broadcasts/508645/', + 'only_matching': True, + }, { + 'url': 'http://5-tv.ru/films/1507502/', + 'only_matching': True, + }, { + 'url': 'http://5-tv.ru/programs/broadcast/508713/', + 'only_matching': True, + }, { + 'url': 'http://5-tv.ru/angel/', + 'only_matching': True, + }, { + 'url': 'http://www.5-tv.ru/schedule/?iframe=true&width=900&height=450', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') or mobj.group('path') + + webpage = self._download_webpage(url, video_id) + + video_url = self._search_regex( + r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"', + webpage, 'video url') + + title = self._og_search_title(webpage, default=None) or self._search_regex( + r'<title>([^<]+)</title>', webpage, 'title') + duration = int_or_none(self._og_search_property( + 'video:duration', webpage, 'duration', default=None)) + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'description': self._og_search_description(webpage, default=None), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'duration': duration, + } diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index edf555b29..db0bbec1e 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -60,7 +60,7 @@ class FranceTVBaseInfoExtractor(InfoExtractor): continue video_url_parsed = compat_urllib_parse_urlparse(video_url) f4m_url = self._download_webpage( - 'http://hdfauth.francetv.fr/esi/urltokengen2.html?url=%s' % video_url_parsed.path, + 'http://hdfauth.francetv.fr/esi/TA?url=%s' % video_url_parsed.path, video_id, 'Downloading f4m manifest
token', fatal=False) if f4m_url: formats.extend(self._extract_f4m_formats(f4m_url, video_id, 1, format_id)) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 9a7b0d25d..f6b984300 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -9,6 +9,8 @@ from .common import InfoExtractor from .youtube import YoutubeIE from ..compat import ( compat_urllib_parse, + compat_urllib_parse_unquote, + compat_urllib_request, compat_urlparse, compat_xml_parse_error, ) @@ -32,6 +34,7 @@ from .brightcove import BrightcoveIE from .nbc import NBCSportsVPlayerIE from .ooyala import OoyalaIE from .rutv import RUTVIE +from .tvc import TVCIE from .sportbox import SportBoxEmbedIE from .smotri import SmotriIE from .condenast import CondeNastIE @@ -39,6 +42,7 @@ from .udn import UDNEmbedIE from .senateisvp import SenateISVPIE from .bliptv import BlipTVIE from .svt import SVTIE +from .pornhub import PornHubIE class GenericIE(InfoExtractor): @@ -46,6 +50,97 @@ class GenericIE(InfoExtractor): _VALID_URL = r'.*' IE_NAME = 'generic' _TESTS = [ + # Direct link to a video + { + 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4', + 'md5': '67d406c2bcb6af27fa886f31aa934bbe', + 'info_dict': { + 'id': 'trailer', + 'ext': 'mp4', + 'title': 'trailer', + 'upload_date': '20100513', + } + }, + # Direct link to media delivered compressed (until Accept-Encoding is *) + { + 'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac', + 'md5': '128c42e68b13950268b648275386fc74', + 'info_dict': { + 'id': 'FictionJunction-Parallel_Hearts', + 'ext': 'flac', + 'title': 'FictionJunction-Parallel_Hearts', + 'upload_date': '20140522', + }, + 'expected_warnings': [ + 'URL could be a direct video link, returning it as such.' + ] + }, + # Direct download with broken HEAD + { + 'url': 'http://ai-radio.org:8000/radio.opus', + 'info_dict': { + 'id': 'radio', + 'ext': 'opus', + 'title': 'radio', + }, + 'params': { + 'skip_download': True, # infinite live stream + }, + 'expected_warnings': [ + r'501.*Not Implemented' + ], + }, + # Direct link with incorrect MIME type + { + 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', + 'md5': '4ccbebe5f36706d85221f204d7eb5913', + 'info_dict': { + 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', + 'id': '5_Lennart_Poettering_-_Systemd', + 'ext': 'webm', + 'title': '5_Lennart_Poettering_-_Systemd', + 'upload_date': '20141120', + }, + 'expected_warnings': [ + 'URL could be a direct video link, returning it as such.' 
+ ] + }, + # RSS feed + { + 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml', + 'info_dict': { + 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml', + 'title': 'Zero Punctuation', + 'description': 're:.*groundbreaking video review series.*' + }, + 'playlist_mincount': 11, + }, + # RSS feed with enclosure + { + 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', + 'info_dict': { + 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624', + 'ext': 'm4v', + 'upload_date': '20150228', + 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624', + } + }, + # google redirect + { + 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', + 'info_dict': { + 'id': 'cmQHVoWB5FY', + 'ext': 'mp4', + 'upload_date': '20130224', + 'uploader_id': 'TheVerge', + 'description': 're:^Chris Ziegler takes a look at the\.*', + 'uploader': 'The Verge', + 'title': 'First Firefox OS phones side-by-side', + }, + 'params': { + 'skip_download': False, + } + }, { 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html', 'md5': '85b90ccc9d73b4acd9138d3af4c27f89', @@ -125,17 +220,6 @@ class GenericIE(InfoExtractor): 'skip_download': True, # m3u8 download }, }, - # Direct link to a video - { - 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4', - 'md5': '67d406c2bcb6af27fa886f31aa934bbe', - 'info_dict': { - 'id': 'trailer', - 'ext': 'mp4', - 'title': 'trailer', - 'upload_date': '20100513', - } - }, # ooyala video { 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219', @@ -160,22 +244,6 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Ooyala'], }, - # google redirect - { - 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', - 'info_dict': { - 'id': 'cmQHVoWB5FY', - 'ext': 'mp4', - 'upload_date': '20130224', - 'uploader_id': 'TheVerge', - 'description': 're:^Chris Ziegler takes a look at the\.*', - 'uploader': 'The Verge', - 'title': 'First Firefox OS phones side-by-side', - }, - 'params': { - 'skip_download': False, - } - }, # embed.ly video { 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/', @@ -225,6 +293,15 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, + # TVC embed + { + 'url': 'http://sch1298sz.mskobr.ru/dou_edu/karamel_ki/filial_galleries/video/iframe_src_http_tvc_ru_video_iframe_id_55304_isplay_false_acc_video_id_channel_brand_id_11_show_episodes_episode_id_32307_frameb/', + 'info_dict': { + 'id': '55304', + 'ext': 'mp4', + 'title': 'Дошкольное воспитание', + }, + }, # SportBox embed { 'url': 'http://www.vestifinance.ru/articles/25753', @@ -407,16 +484,6 @@ class GenericIE(InfoExtractor): 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com', } }, - # RSS feed - { - 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml', - 'info_dict': { - 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml', - 'title': 'Zero Punctuation', - 'description': 're:.*groundbreaking video review series.*' - }, - 'playlist_mincount': 11, - }, # Multiple brightcove videos # https://github.com/rg3/youtube-dl/issues/2283 { @@ -470,21 +537,6 @@ class GenericIE(InfoExtractor): 'uploader': 
'thoughtworks.wistia.com', }, }, - # Direct download with broken HEAD - { - 'url': 'http://ai-radio.org:8000/radio.opus', - 'info_dict': { - 'id': 'radio', - 'ext': 'opus', - 'title': 'radio', - }, - 'params': { - 'skip_download': True, # infinite live stream - }, - 'expected_warnings': [ - r'501.*Not Implemented' - ], - }, # Soundcloud embed { 'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/', @@ -516,21 +568,6 @@ class GenericIE(InfoExtractor): }, 'playlist_mincount': 2, }, - # Direct link with incorrect MIME type - { - 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', - 'md5': '4ccbebe5f36706d85221f204d7eb5913', - 'info_dict': { - 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', - 'id': '5_Lennart_Poettering_-_Systemd', - 'ext': 'webm', - 'title': '5_Lennart_Poettering_-_Systemd', - 'upload_date': '20141120', - }, - 'expected_warnings': [ - 'URL could be a direct video link, returning it as such.' - ] - }, # Cinchcast embed { 'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/', @@ -689,16 +726,6 @@ class GenericIE(InfoExtractor): 'age_limit': 0, }, }, - # RSS feed with enclosure - { - 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', - 'info_dict': { - 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624', - 'ext': 'm4v', - 'upload_date': '20150228', - 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624', - } - }, # Crooks and Liars embed { 'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists', @@ -773,6 +800,18 @@ class GenericIE(InfoExtractor): # rtmpe downloads 'skip_download': True, } + }, + # Brightcove URL in single quotes + { + 'url': 'http://www.sportsnet.ca/baseball/mlb/sn-presents-russell-martin-world-citizen/', + 'md5': '4ae374f1f8b91c889c4b9203c8c752af', + 'info_dict': { + 'id': '4255764656001', + 'ext': 'mp4', + 'title': 'SN Presents: Russell Martin, World Citizen', + 'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. 
Written and narrated by Stephen Brunt.', + 'uploader': 'Rogers Sportsnet', + }, } ] @@ -894,7 +933,7 @@ class GenericIE(InfoExtractor): force_videoid = smuggled_data['force_videoid'] video_id = force_videoid else: - video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0] + video_id = compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) self.to_screen('%s: Requesting header' % video_id) @@ -916,7 +955,9 @@ class GenericIE(InfoExtractor): full_response = None if head_response is False: - full_response = self._request_webpage(url, video_id) + request = compat_urllib_request.Request(url) + request.add_header('Accept-Encoding', '*') + full_response = self._request_webpage(request, video_id) head_response = full_response # Check for direct link to a video @@ -927,7 +968,7 @@ class GenericIE(InfoExtractor): head_response.headers.get('Last-Modified')) return { 'id': video_id, - 'title': os.path.splitext(url_basename(url))[0], + 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), 'direct': True, 'formats': [{ 'format_id': m.group('format_id'), @@ -941,7 +982,17 @@ class GenericIE(InfoExtractor): self._downloader.report_warning('Falling back on generic information extractor.') if not full_response: - full_response = self._request_webpage(url, video_id) + request = compat_urllib_request.Request(url) + # Some webservers may serve compressed content of rather big size (e.g. gzipped flac) + # making it impossible to download only chunk of the file (yet we need only 512kB to + # test whether it's HTML or not). According to youtube-dl default Accept-Encoding + # that will always result in downloading the whole file that is not desirable. + # Therefore for extraction pass we have to override Accept-Encoding to any in order + # to accept raw bytes and being able to download only a chunk. + # It may probably better to solve this by checking Content-Type for application/octet-stream + # after HEAD request finishes, but not sure if we can rely on this. + request.add_header('Accept-Encoding', '*') + full_response = self._request_webpage(request, video_id) # Maybe it's a direct link to a video? # Be careful not to download the whole thing! 
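[Editor's note: the Accept-Encoding override that generic.py gains above is easy to miss in the flattened diff. A hedged standalone sketch of the same probe pattern, written against the Python 3 stdlib rather than youtube-dl's compat wrappers; probe_content_type is a hypothetical helper, not part of the patch:]

    # Try HEAD first; if the server rejects it (the broken-HEAD case tested
    # above), fall back to GET advertising 'Accept-Encoding: *' so a gzipped
    # full body is not forced on us when we only want to sniff the first bytes.
    import urllib.error
    import urllib.request

    def probe_content_type(url):
        head = urllib.request.Request(url, method='HEAD')
        try:
            with urllib.request.urlopen(head) as resp:
                return resp.headers.get('Content-Type', '')
        except urllib.error.HTTPError:
            get = urllib.request.Request(url, headers={'Accept-Encoding': '*'})
            with urllib.request.urlopen(get) as resp:
                resp.read(512)  # first 512 bytes suffice to tell HTML from media
                return resp.headers.get('Content-Type', '')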
@@ -953,7 +1004,7 @@ class GenericIE(InfoExtractor): head_response.headers.get('Last-Modified')) return { 'id': video_id, - 'title': os.path.splitext(url_basename(url))[0], + 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), 'direct': True, 'url': url, 'upload_date': upload_date, @@ -1033,7 +1084,7 @@ class GenericIE(InfoExtractor): # Look for embedded rtl.nl player matches = re.findall( - r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+video_embed[^"]+)"', webpage) + r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"', webpage) if matches: return _playlist_from_matches(matches, ie='RtlNl') @@ -1261,11 +1312,27 @@ class GenericIE(InfoExtractor): if rutv_url: return self.url_result(rutv_url, 'RUTV') + # Look for embedded TVC player + tvc_url = TVCIE._extract_url(webpage) + if tvc_url: + return self.url_result(tvc_url, 'TVC') + # Look for embedded SportBox player sportbox_urls = SportBoxEmbedIE._extract_urls(webpage) if sportbox_urls: return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed') + # Look for embedded PornHub player + pornhub_url = PornHubIE._extract_url(webpage) + if pornhub_url: + return self.url_result(pornhub_url, 'PornHub') + + # Look for embedded Tvigle player + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'Tvigle') + # Look for embedded TED player mobj = re.search( r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage) diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py index fe5d95e2c..d692ea79a 100644 --- a/youtube_dl/extractor/imgur.py +++ b/youtube_dl/extractor/imgur.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( int_or_none, js_to_json, @@ -12,7 +13,7 @@ from ..utils import ( class ImgurIE(InfoExtractor): - _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?P<id>[a-zA-Z0-9]+)(?:\.mp4|\.gifv)?'
+ _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?P<id>[a-zA-Z0-9]+)' _TESTS = [{ 'url': 'https://i.imgur.com/A61SaA1.gifv', @@ -34,7 +35,8 @@ class ImgurIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage( + compat_urlparse.urljoin(url, video_id), video_id) width = int_or_none(self._search_regex( r'<param name="width" value="([0-9]+)"', webpage, 'width', fatal=False)) diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py --- a/youtube_dl/extractor/iprima.py +++ b/youtube_dl/extractor/iprima.py @@ class IPrimaIE(InfoExtractor): - _VALID_URL = r'http://play\.iprima\.cz/(?:[^/]+/)*(?P<id>[^?#]+)' + _VALID_URL = r'https?://play\.iprima\.cz/(?:[^/]+/)*(?P<id>[^?#]+)' _TESTS = [{ 'url': 'http://play.iprima.cz/particka/particka-92', @@ -23,7 +24,7 @@ class IPrimaIE(InfoExtractor): 'id': '39152', 'ext': 'flv', 'title': 'Partička (92)', - 'description': 'md5:3740fda51464da35a2d4d0670b8e4fd6', + 'description': 'md5:74e9617e51bca67c3ecfb2c6f9766f45', 'thumbnail': 'http://play.iprima.cz/sites/default/files/image_crops/image_620x349/3/491483_particka-92_image_620x349.jpg', }, 'params': { 'skip_download': True, # requires rtmpdump }, @@ -35,13 +36,14 @@ 'id': '9718337', 'ext': 'flv', 'title': 'Tchibo Partička - Jarní móda', - 'description': 'md5:589f8f59f414220621ff8882eb3ce7be', 'thumbnail': 're:^http:.*\.jpg$', }, 'params': { 'skip_download': True, # requires rtmpdump }, - 'skip': 'Do not have permission to access this page', + }, { + 'url': 'http://play.iprima.cz/zpravy-ftv-prima-2752015', + 'only_matching': True, }] def _real_extract(self, url): @@ -102,8 +104,10 @@ class IPrimaIE(InfoExtractor): return { 'id': real_id, - 'title': self._og_search_title(webpage), + 'title': remove_end(self._og_search_title(webpage), ' | Prima PLAY'), 'thumbnail': self._og_search_thumbnail(webpage), 'formats': formats, - 'description': self._og_search_description(webpage), + 'description': self._search_regex( + r'<p[^>]+itemprop="description"[^>]*>([^<]+)', + webpage, 'description', default=None), } diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py new file mode 100644 index 000000000..9106dd074 --- /dev/null +++ b/youtube_dl/extractor/iqiyi.py @@ -0,0 +1,296 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import hashlib +import math +import os.path +import random +import re +import time +import uuid +import zlib + +from .common import InfoExtractor +from ..compat import compat_urllib_parse +from ..utils import ( + ExtractorError, + url_basename, +) + + +class IqiyiIE(InfoExtractor): + IE_NAME = 'iqiyi' + + _VALID_URL = r'http://(?:www\.)iqiyi.com/v_.+?\.html' + + _TESTS = [{ + 'url': 'http://www.iqiyi.com/v_19rrojlavg.html', + 'md5': '2cb594dc2781e6c941a110d8f358118b', + 'info_dict': { + 'id': '9c1fb1b99d192b21c559e5a1a2cb3c73', + 'title': '美国德州空中惊现奇异云团 酷似UFO', + 'ext': 'f4v', + } + }, { + 'url': 'http://www.iqiyi.com/v_19rrhnnclk.html', + 'info_dict': { + 'id': 'e3f585b550a280af23c98b6cb2be19fb', + 'title': '名侦探柯南第752集', + }, + 'playlist': [{ + 'md5': '7e49376fecaffa115d951634917fe105', + 'info_dict': { + 'id': 'e3f585b550a280af23c98b6cb2be19fb_part1', + 'ext': 'f4v', + 'title': '名侦探柯南第752集', + }, + }, { + 'md5': '41b75ba13bb7ac0e411131f92bc4f6ca', + 'info_dict': { + 'id': 'e3f585b550a280af23c98b6cb2be19fb_part2', + 'ext': 'f4v', + 'title': '名侦探柯南第752集', + }, + }, { + 'md5': '0cee1dd0a3d46a83e71e2badeae2aab0', + 'info_dict': { + 'id': 'e3f585b550a280af23c98b6cb2be19fb_part3', + 'ext': 'f4v', + 'title': '名侦探柯南第752集', + }, + }, { + 'md5': '4f8ad72373b0c491b582e7c196b0b1f9', + 'info_dict': { + 'id': 'e3f585b550a280af23c98b6cb2be19fb_part4', + 'ext': 'f4v', + 'title': '名侦探柯南第752集', + }, + }, { + 'md5': 'd89ad028bcfad282918e8098e811711d', + 'info_dict': { + 'id':
diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py
new file mode 100644
index 000000000..9106dd074
--- /dev/null
+++ b/youtube_dl/extractor/iqiyi.py
@@ -0,0 +1,296 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import hashlib
+import math
+import os.path
+import random
+import re
+import time
+import uuid
+import zlib
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse
+from ..utils import (
+    ExtractorError,
+    url_basename,
+)
+
+
+class IqiyiIE(InfoExtractor):
+    IE_NAME = 'iqiyi'
+
+    _VALID_URL = r'http://(?:www\.)?iqiyi\.com/v_.+?\.html'
+
+    _TESTS = [{
+        'url': 'http://www.iqiyi.com/v_19rrojlavg.html',
+        'md5': '2cb594dc2781e6c941a110d8f358118b',
+        'info_dict': {
+            'id': '9c1fb1b99d192b21c559e5a1a2cb3c73',
+            'title': '美国德州空中惊现奇异云团 酷似UFO',
+            'ext': 'f4v',
+        }
+    }, {
+        'url': 'http://www.iqiyi.com/v_19rrhnnclk.html',
+        'info_dict': {
+            'id': 'e3f585b550a280af23c98b6cb2be19fb',
+            'title': '名侦探柯南第752集',
+        },
+        'playlist': [{
+            'md5': '7e49376fecaffa115d951634917fe105',
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part1',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'md5': '41b75ba13bb7ac0e411131f92bc4f6ca',
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part2',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'md5': '0cee1dd0a3d46a83e71e2badeae2aab0',
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part3',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'md5': '4f8ad72373b0c491b582e7c196b0b1f9',
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part4',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'md5': 'd89ad028bcfad282918e8098e811711d',
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part5',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'md5': '9cb1e5c95da25dff0660c32ae50903b7',
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part6',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'md5': '155116e0ff1867bbc9b98df294faabc9',
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part7',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'md5': '53f5db77622ae14fa493ed2a278a082b',
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part8',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }],
+    }]
+
+    _FORMATS_MAP = [
+        ('1', 'h6'),
+        ('2', 'h5'),
+        ('3', 'h4'),
+        ('4', 'h3'),
+        ('5', 'h2'),
+        ('10', 'h1'),
+    ]
+    def construct_video_urls(self, data, video_id, _uuid):
+        def do_xor(x, y):
+            a = y % 3
+            if a == 1:
+                return x ^ 121
+            if a == 2:
+                return x ^ 72
+            return x ^ 103
+
+        def get_encode_code(l):
+            a = 0
+            b = l.split('-')
+            c = len(b)
+            s = ''
+            for i in range(c - 1, -1, -1):
+                a = do_xor(int(b[c - i - 1], 16), i)
+                s += chr(a)
+            return s[::-1]
+
+        def get_path_key(x, format_id, segment_index):
+            mg = ')(*&^flash@#$%a'
+            tm = self._download_json(
+                'http://data.video.qiyi.com/t?tn=' + str(random.random()), video_id,
+                note='Download path key of segment %d for format %s' % (segment_index + 1, format_id)
+            )['t']
+            t = str(int(math.floor(int(tm) / (600.0))))
+            return hashlib.md5((t + mg + x).encode('utf8')).hexdigest()
+
+        video_urls_dict = {}
+        for format_item in data['vp']['tkl'][0]['vs']:
+            if 0 < int(format_item['bid']) <= 10:
+                format_id = self.get_format(format_item['bid'])
+            else:
+                continue
+
+            video_urls = []
+
+            video_urls_info = format_item['fs']
+            if not format_item['fs'][0]['l'].startswith('/'):
+                t = get_encode_code(format_item['fs'][0]['l'])
+                if t.endswith('mp4'):
+                    video_urls_info = format_item['flvs']
+
+            for segment_index, segment in enumerate(video_urls_info):
+                vl = segment['l']
+                if not vl.startswith('/'):
+                    vl = get_encode_code(vl)
+                key = get_path_key(
+                    vl.split('/')[-1].split('.')[0], format_id, segment_index)
+                filesize = segment['b']
+                base_url = data['vp']['du'].split('/')
+                base_url.insert(-1, key)
+                base_url = '/'.join(base_url)
+                param = {
+                    'su': _uuid,
+                    'qyid': uuid.uuid4().hex,
+                    'client': '',
+                    'z': '',
+                    'bt': '',
+                    'ct': '',
+                    'tn': str(int(time.time()))
+                }
+                api_video_url = base_url + vl + '?' + \
+                    compat_urllib_parse.urlencode(param)
+                js = self._download_json(
+                    api_video_url, video_id,
+                    note='Download video info of segment %d for format %s' % (segment_index + 1, format_id))
+                video_url = js['l']
+                video_urls.append(
+                    (video_url, filesize))
+
+            video_urls_dict[format_id] = video_urls
+        return video_urls_dict
+
+    def get_format(self, bid):
+        matched_format_ids = [_format_id for _bid, _format_id in self._FORMATS_MAP if _bid == str(bid)]
+        return matched_format_ids[0] if len(matched_format_ids) else None
+
+    def get_bid(self, format_id):
+        matched_bids = [_bid for _bid, _format_id in self._FORMATS_MAP if _format_id == format_id]
+        return matched_bids[0] if len(matched_bids) else None
+
+    def get_raw_data(self, tvid, video_id, enc_key, _uuid):
+        tm = str(int(time.time()))
+        param = {
+            'key': 'fvip',
+            'src': hashlib.md5(b'youtube-dl').hexdigest(),
+            'tvId': tvid,
+            'vid': video_id,
+            'vinfo': 1,
+            'tm': tm,
+            'enc': hashlib.md5(
+                (enc_key + tm + tvid).encode('utf8')).hexdigest(),
+            'qyid': _uuid,
+            'tn': random.random(),
+            'um': 0,
+            'authkey': hashlib.md5(
+                (tm + tvid).encode('utf8')).hexdigest()
+        }
+
+        api_url = 'http://cache.video.qiyi.com/vms' + '?' + \
+            compat_urllib_parse.urlencode(param)
+        raw_data = self._download_json(api_url, video_id)
+        return raw_data
+
+    def get_enc_key(self, swf_url, video_id):
+        filename, _ = os.path.splitext(url_basename(swf_url))
+        enc_key_json = self._downloader.cache.load('iqiyi-enc-key', filename)
+        if enc_key_json is not None:
+            return enc_key_json[0]
+
+        req = self._request_webpage(
+            swf_url, video_id, note='download swf content')
+        cn = req.read()
+        cn = zlib.decompress(cn[8:])
+        pt = re.compile(b'MixerRemote\x08(?P<enc_key>.+?)\$&vv')
+        enc_key = self._search_regex(pt, cn, 'enc_key').decode('utf8')
+
+        self._downloader.cache.store('iqiyi-enc-key', filename, [enc_key])
+
+        return enc_key
+    def _real_extract(self, url):
+        webpage = self._download_webpage(
+            url, 'temp_id', note='download video page')
+        tvid = self._search_regex(
+            r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid')
+        video_id = self._search_regex(
+            r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id')
+        swf_url = self._search_regex(
+            r'(http://[^\'"]+MainPlayer[^.]+\.swf)', webpage, 'swf player URL')
+        _uuid = uuid.uuid4().hex
+
+        enc_key = self.get_enc_key(swf_url, video_id)
+
+        raw_data = self.get_raw_data(tvid, video_id, enc_key, _uuid)
+
+        if raw_data['code'] != 'A000000':
+            raise ExtractorError('Unable to load data. Error code: ' + raw_data['code'])
+
+        if not raw_data['data']['vp']['tkl']:
+            raise ExtractorError('iQiyi VIP videos are not supported')
+
+        data = raw_data['data']
+
+        title = data['vi']['vn']
+
+        # generate video_urls_dict
+        video_urls_dict = self.construct_video_urls(
+            data, video_id, _uuid)
+
+        # construct info
+        entries = []
+        for format_id in video_urls_dict:
+            video_urls = video_urls_dict[format_id]
+            for i, video_url_info in enumerate(video_urls):
+                if len(entries) < i + 1:
+                    entries.append({'formats': []})
+                entries[i]['formats'].append(
+                    {
+                        'url': video_url_info[0],
+                        'filesize': video_url_info[-1],
+                        'format_id': format_id,
+                        'preference': int(self.get_bid(format_id))
+                    }
+                )
+
+        for i in range(len(entries)):
+            self._sort_formats(entries[i]['formats'])
+            entries[i].update(
+                {
+                    'id': '%s_part%d' % (video_id, i + 1),
+                    'title': title,
+                }
+            )
+
+        if len(entries) > 1:
+            info = {
+                '_type': 'multi_video',
+                'id': video_id,
+                'title': title,
+                'entries': entries,
+            }
+        else:
+            info = entries[0]
+            info['id'] = video_id
+            info['title'] = title
+
+        return info
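Note for reviewers: get_encode_code in the new extractor undoes a light obfuscation of segment paths — the server sends dash-separated hex bytes, each XORed with 103/121/72 according to its position modulo 3, in reversed order. A standalone round trip (the encode helper and sample path are illustrative, derived by inverting the decoder; they are not part of the extractor):

    def do_xor(x, y):
        a = y % 3
        if a == 1:
            return x ^ 121
        if a == 2:
            return x ^ 72
        return x ^ 103

    def get_encode_code(l):
        b = l.split('-')
        c = len(b)
        s = ''
        for i in range(c - 1, -1, -1):
            s += chr(do_xor(int(b[c - i - 1], 16), i))
        return s[::-1]

    def encode(plain):
        # inverse transform: XOR each byte with its positional key,
        # hex-encode, and reverse the chunk order
        return '-'.join(reversed(
            ['%x' % do_xor(ord(ch), j) for j, ch in enumerate(plain)]))

    print(encode('/path/video.f4v'))                   # made-up sample path
    print(get_encode_code(encode('/path/video.f4v')))  # -> '/path/video.f4v'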
diff --git a/youtube_dl/extractor/izlesene.py b/youtube_dl/extractor/izlesene.py
index 99a1361f8..bc226fa67 100644
--- a/youtube_dl/extractor/izlesene.py
+++ b/youtube_dl/extractor/izlesene.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
 from ..utils import (
     determine_ext,
     float_or_none,
@@ -30,7 +31,7 @@ class IzleseneIE(InfoExtractor):
             'description': 'md5:253753e2655dde93f59f74b572454f6d',
             'thumbnail': 're:^http://.*\.jpg',
             'uploader_id': 'pelikzzle',
-            'timestamp': 1404302298,
+            'timestamp': int,
             'upload_date': '20140702',
             'duration': 95.395,
             'age_limit': 0,
@@ -46,7 +47,7 @@ class IzleseneIE(InfoExtractor):
             'description': 'Tarkan Dortmund 2006 Konseri',
             'thumbnail': 're:^http://.*\.jpg',
             'uploader_id': 'parlayankiz',
-            'timestamp': 1163322193,
+            'timestamp': int,
             'upload_date': '20061112',
             'duration': 253.666,
             'age_limit': 0,
@@ -67,9 +68,9 @@ class IzleseneIE(InfoExtractor):
 
         uploader = self._html_search_regex(
             r"adduserUsername\s*=\s*'([^']+)';",
-            webpage, 'uploader', fatal=False, default='')
+            webpage, 'uploader', fatal=False)
         timestamp = parse_iso8601(self._html_search_meta(
-            'uploadDate', webpage, 'upload date', fatal=False))
+            'uploadDate', webpage, 'upload date'))
 
         duration = float_or_none(self._html_search_regex(
             r'"videoduration"\s*:\s*"([^"]+)"',
@@ -86,8 +87,7 @@ class IzleseneIE(InfoExtractor):
 
         # Might be empty for some videos.
         streams = self._html_search_regex(
-            r'"qualitylevel"\s*:\s*"([^"]+)"',
-            webpage, 'streams', fatal=False, default='')
+            r'"qualitylevel"\s*:\s*"([^"]+)"', webpage, 'streams', default='')
 
         formats = []
         if streams:
@@ -95,15 +95,15 @@ class IzleseneIE(InfoExtractor):
                 quality, url = re.search(r'\[(\w+)\](.+)', stream).groups()
                 formats.append({
                     'format_id': '%sp' % quality if quality else 'sd',
-                    'url': url,
+                    'url': compat_urllib_parse_unquote(url),
                     'ext': ext,
                 })
         else:
             stream_url = self._search_regex(
-                r'"streamurl"\s?:\s?"([^"]+)"', webpage, 'stream URL')
+                r'"streamurl"\s*:\s*"([^"]+)"', webpage, 'stream URL')
             formats.append({
                 'format_id': 'sd',
-                'url': stream_url,
+                'url': compat_urllib_parse_unquote(stream_url),
                 'ext': ext,
             })
diff --git a/youtube_dl/extractor/kickstarter.py b/youtube_dl/extractor/kickstarter.py
index 7d4b57056..1d391e69f 100644
--- a/youtube_dl/extractor/kickstarter.py
+++ b/youtube_dl/extractor/kickstarter.py
@@ -28,6 +28,14 @@ class KickStarterIE(InfoExtractor):
             'uploader': 'Pebble Technology',
             'title': 'Pebble iOS Notifications',
         }
+    }, {
+        'url': 'https://www.kickstarter.com/projects/1420158244/power-drive-2000/widget/video.html',
+        'info_dict': {
+            'id': '1420158244',
+            'ext': 'mp4',
+            'title': 'Power Drive 2000',
+        },
+        'expected_warnings': ['OpenGraph description'],
     }]
 
     def _real_extract(self, url):
@@ -48,10 +56,15 @@ class KickStarterIE(InfoExtractor):
                 'title': title,
             }
 
+        thumbnail = self._og_search_thumbnail(webpage, default=None)
+        if thumbnail is None:
+            thumbnail = self._html_search_regex(
+                r'<img[^>]+class="[^"]+\s*poster\s*[^"]+"[^>]+src="([^"]+)"',
+                webpage, 'thumbnail image', fatal=False)
         return {
             'id': video_id,
             'url': video_url,
             'title': title,
             'description': self._og_search_description(webpage),
-            'thumbnail': self._og_search_thumbnail(webpage),
+            'thumbnail': thumbnail,
         }
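Note for reviewers: the lifenews.py change below swaps the hand-rolled TITLE_SUFFIX stripping for the remove_end helper the diff imports from youtube_dl.utils. A rough equivalent of that helper (a sketch, not the exact library source; the sample headline is made up):

    def remove_end(s, end):
        return s[:-len(end)] if s is not None and s.endswith(end) else s

    title = 'Some headline - Первый по срочным новостям — LIFE | NEWS'
    print(remove_end(title, ' - Первый по срочным новостям — LIFE | NEWS'))
    # -> 'Some headline'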
diff --git a/youtube_dl/extractor/lifenews.py b/youtube_dl/extractor/lifenews.py
index 42cb6e35f..f8cbca7b3 100644
--- a/youtube_dl/extractor/lifenews.py
+++ b/youtube_dl/extractor/lifenews.py
@@ -8,6 +8,7 @@ from ..compat import compat_urlparse
 from ..utils import (
     determine_ext,
     int_or_none,
+    remove_end,
     unified_strdate,
     ExtractorError,
 )
@@ -39,7 +40,6 @@ class LifeNewsIE(InfoExtractor):
             'title': 'В Сети появилось видео захвата «Правым сектором» колхозных полей ',
             'description': 'Жители двух поселков Днепропетровской области не простили радикалам угрозу лишения плодородных земель и пошли в лобовую. ',
             'upload_date': '20150402',
-            'uploader': 'embed.life.ru',
         }
     }, {
         'url': 'http://lifenews.ru/news/153461',
@@ -50,7 +50,6 @@ class LifeNewsIE(InfoExtractor):
             'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве',
             'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.',
             'upload_date': '20150505',
-            'uploader': 'embed.life.ru',
         }
     }, {
         'url': 'http://lifenews.ru/video/13035',
@@ -72,20 +71,20 @@ class LifeNewsIE(InfoExtractor):
         if not videos and not iframe_link:
             raise ExtractorError('No media links available for %s' % video_id)
 
-        title = self._og_search_title(webpage)
-        TITLE_SUFFIX = ' - Первый по срочным новостям — LIFE | NEWS'
-        if title.endswith(TITLE_SUFFIX):
-            title = title[:-len(TITLE_SUFFIX)]
+        title = remove_end(
+            self._og_search_title(webpage),
+            ' - Первый по срочным новостям — LIFE | NEWS')
 
         description = self._og_search_description(webpage)
 
         view_count = self._html_search_regex(
             r'<div class=\'views\'>\s*(\d+)\s*</div>', webpage, 'view count', fatal=False)
         comment_count = self._html_search_regex(
-            r'<div class=\'comments\'>\s*<span class=\'counter\'>\s*(\d+)\s*</span>', webpage, 'comment count', fatal=False)
+            r'=\'commentCount\'[^>]*>\s*(\d+)\s*<',
+            webpage, 'comment count', fatal=False)
 
         upload_date = self._html_search_regex(
-            r'<time datetime=\'([^\']+)\'', webpage, 'upload date', fatal=False)
', webpage, 'attachment URL', default=None)
         embed = self._html_search_regex(
-            r'\s*