Merge remote-tracking branch 'yasoob/master'
author Philipp Hagemeister <phihag@phihag.de>
Thu, 11 Jul 2013 10:02:16 +0000 (12:02 +0200)
committer Philipp Hagemeister <phihag@phihag.de>
Thu, 11 Jul 2013 10:02:16 +0000 (12:02 +0200)
12 files changed:
devscripts/youtube_genalgo.py
test/test_youtube_sig.py
youtube_dl/YoutubeDL.py
youtube_dl/extractor/__init__.py
youtube_dl/extractor/brightcove.py
youtube_dl/extractor/dotsub.py
youtube_dl/extractor/gamespot.py
youtube_dl/extractor/generic.py
youtube_dl/extractor/steam.py
youtube_dl/extractor/veoh.py [new file with mode: 0644]
youtube_dl/extractor/youtube.py
youtube_dl/version.py

index c3d69e6f445af3846081af3bdea0f8c9a2a7063c..150c88d1754c4cfcb6f79b20ef559406f0dc2937 100644 (file)
@@ -20,9 +20,9 @@ tests = [
     # 84
     ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?>.<",
      "<.>?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOIUYTREWe098765432rmnbvcxzasdfghjklpoiuyt1"),
-    # 83
+    # 83 - vfl26ng3K 2013/07/10
     ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<",
-     "D.>/?;}[{=+_)(*&^%$#!MNBVCXeAS<FGHJKLPOIUYTREWZ0987654321mnbvcxzasdfghjklpoiuytrQ"),
+     "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>"),
     # 82
     ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.<",
      "Q>/?;}[{=+-(*<^%$#@!MNBVCXZASDFGHKLPOIUY8REWT0q&7654321mnbvcxzasdfghjklpoiuytrew9"),
index e87b6259bef3841e900ae0c9d98c03d6c30e0f20..e7660424484ef2d4cb43410ab1478d79734870f5 100755 (executable)
@@ -45,7 +45,7 @@ class TestYoutubeSig(unittest.TestCase):
 
     def test_83(self):
         wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<"
-        right = "D.>/?;}[{=+_)(*&^%$#!MNBVCXeAS<FGHJKLPOIUYTREWZ0987654321mnbvcxzasdfghjklpoiuytrQ"
+        right = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>"
         self.assertEqual(sig(wrong), right)
 
     def test_82(self):
index d3281fed25c8a6b7d2476b662167691c93322c3a..e2470611512698745181dcf45cebbebc67acdb65 100644 (file)
@@ -348,6 +348,7 @@ class YoutubeDL(object):
 
         result_type = ie_result.get('_type', 'video') # If not given we suppose it's a video, support the default old system
         if result_type == 'video':
+            ie_result.update(extra_info)
             if 'playlist' not in ie_result:
                 # It isn't part of a playlist
                 ie_result['playlist'] = None
index d2a71a6f10d9aa6125348d722064ccf7fc89ce92..4b67f333b03776a759724c7e0bf52791cf608179 100644 (file)
@@ -59,6 +59,7 @@ from .tumblr import TumblrIE
 from .tutv import TutvIE
 from .ustream import UstreamIE
 from .vbox7 import Vbox7IE
+from .veoh import VeohIE
 from .vevo import VevoIE
 from .vimeo import VimeoIE
 from .vine import VineIE
index f85acbb5db3dcb8e68b1e6e19f4eb91095fa6cfd..68ee5292b61157b5bccbd78619a3da758fdf93dc 100644 (file)
@@ -1,28 +1,80 @@
 import re
 import json
+import xml.etree.ElementTree
 
 from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_parse,
+)
 
 class BrightcoveIE(InfoExtractor):
-    _VALID_URL = r'http://.*brightcove\.com/.*\?(?P<query>.*videoPlayer=(?P<id>\d*).*)'
+    _VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*\?(?P<query>.*)'
+    _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s'
+    _PLAYLIST_URL_TEMPLATE = 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=%s'
+    
+    # There is a test for Brightcove in GenericIE; that way we test both the download
+    # and the detection of videos, and we don't have to find a URL that is always valid
+
+    @classmethod
+    def _build_brighcove_url(cls, object_str):
+        """
+        Build a Brightcove URL from an XML string containing
+        <object class="BrightcoveExperience">{params}</object>
+        """
+        object_doc = xml.etree.ElementTree.fromstring(object_str)
+        assert u'BrightcoveExperience' in object_doc.attrib['class']
+        params = {'flashID': object_doc.attrib['id'],
+                  'playerID': object_doc.find('./param[@name="playerID"]').attrib['value'],
+                  }
+        playerKey = object_doc.find('./param[@name="playerKey"]')
+        # Not all pages define this value
+        if playerKey is not None:
+            params['playerKey'] = playerKey.attrib['value']
+        videoPlayer = object_doc.find('./param[@name="@videoPlayer"]')
+        if videoPlayer is not None:
+            params['@videoPlayer'] = videoPlayer.attrib['value']
+        data = compat_urllib_parse.urlencode(params)
+        return cls._FEDERATED_URL_TEMPLATE % data
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         query = mobj.group('query')
-        video_id = mobj.group('id')
 
-        request_url = 'http://c.brightcove.com/services/viewer/htmlFederated?%s' % query
+        m_video_id = re.search(r'videoPlayer=(\d+)', query)
+        if m_video_id is not None:
+            video_id = m_video_id.group(1)
+            return self._get_video_info(video_id, query)
+        else:
+            player_key = self._search_regex(r'playerKey=(.+?)(&|$)', query, 'playlist_id')
+            return self._get_playlist_info(player_key)
+
+    def _get_video_info(self, video_id, query):
+        request_url = self._FEDERATED_URL_TEMPLATE % query
         webpage = self._download_webpage(request_url, video_id)
 
         self.report_extraction(video_id)
         info = self._search_regex(r'var experienceJSON = ({.*?});', webpage, 'json')
         info = json.loads(info)['data']
         video_info = info['programmedContent']['videoPlayer']['mediaDTO']
+
+        return self._extract_video_info(video_info)
+
+    def _get_playlist_info(self, player_key):
+        playlist_info = self._download_webpage(self._PLAYLIST_URL_TEMPLATE % player_key,
+                                               player_key, u'Downloading playlist information')
+
+        playlist_info = json.loads(playlist_info)['videoList']
+        videos = [self._extract_video_info(video_info) for video_info in playlist_info['mediaCollectionDTO']['videoDTOs']]
+
+        return self.playlist_result(videos, playlist_id=playlist_info['id'],
+                                    playlist_title=playlist_info['mediaCollectionDTO']['displayName'])
+
+    def _extract_video_info(self, video_info):
         renditions = video_info['renditions']
         renditions = sorted(renditions, key=lambda r: r['size'])
         best_format = renditions[-1]
-        
-        return {'id': video_id,
+
+        return {'id': video_info['id'],
                 'title': video_info['displayName'],
                 'url': best_format['defaultURL'], 
                 'ext': 'mp4',
index 2afeaba072def6ebbad625484e184c26bc652853..0ee9a684eb4c66907634f9d4b603a46beec5a357 100644 (file)
@@ -1,5 +1,7 @@
 import re
 import json
+import time
+
 from .common import InfoExtractor
 
 
@@ -13,7 +15,8 @@ class DotsubIE(InfoExtractor):
             u"title": u"Pyramids of Waste (2010), AKA The Lightbulb Conspiracy - Planned obsolescence documentary",
             u"uploader": u"4v4l0n42",
             u'description': u'Pyramids of Waste (2010) also known as "The lightbulb conspiracy" is a documentary about how our economic system based on consumerism  and planned obsolescence is breaking our planet down.\r\n\r\nSolutions to this can be found at:\r\nhttp://robotswillstealyourjob.com\r\nhttp://www.federicopistono.org\r\n\r\nhttp://opensourceecology.org\r\nhttp://thezeitgeistmovement.com',
-            u'thumbnail': u'http://dotsub.com/media/aed3b8b2-1889-4df5-ae63-ad85f5572f27/p'
+            u'thumbnail': u'http://dotsub.com/media/aed3b8b2-1889-4df5-ae63-ad85f5572f27/p',
+            u'upload_date': u'20101213',
         }
     }
 
@@ -23,20 +26,16 @@ class DotsubIE(InfoExtractor):
         info_url = "https://dotsub.com/api/media/%s/metadata" %(video_id)
         webpage = self._download_webpage(info_url, video_id)
         info = json.loads(webpage)
-        video_url = info['mediaURI']
-        uploader = info['user']
-        description = info['description']
-        view_count = info['numberOfViews']
-        title = info['title']
-        thumbnail_url = info['screenshotURI']
-        ext = 'flv'
+        date = time.gmtime(info['dateCreated']/1000) # The timestamp is in milliseconds
+
         return [{
             'id':          video_id,
-            'url':         video_url,
-            'ext':         ext,
-            'title':       title,
-            'thumbnail':   thumbnail_url,
-            'description': description,
-            'uploader':    uploader,
-            'view_count':  view_count,
+            'url':         info['mediaURI'],
+            'ext':         'flv',
+            'title':       info['title'],
+            'thumbnail':   info['screenshotURI'],
+            'description': info['description'],
+            'uploader':    info['user'],
+            'view_count':  info['numberOfViews'],
+            'upload_date': u'%04i%02i%02i' % (date.tm_year, date.tm_mon, date.tm_mday),
         }]
index cec3b7ac863247e8ddc2c2add953372bd809eed4..7585b70618d1e4f92e8297fbf4d1397359a5224b 100644 (file)
@@ -4,14 +4,15 @@ import xml.etree.ElementTree
 from .common import InfoExtractor
 from ..utils import (
     unified_strdate,
+    compat_urllib_parse,
 )
 
 class GameSpotIE(InfoExtractor):
-    _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/([^/]+)/videos/([^/]+)-([^/d]+)/'
+    _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P<page_id>\d+)/?'
     _TEST = {
         u"url": u"http://www.gamespot.com/arma-iii/videos/arma-iii-community-guide-sitrep-i-6410818/",
         u"file": u"6410818.mp4",
-        u"md5": u"5569d64ca98db01f0177c934fe8c1e9b",
+        u"md5": u"b2a30deaa8654fcccd43713a6b6a4825",
         u"info_dict": {
             u"title": u"Arma III - Community Guide: SITREP I",
             u"upload_date": u"20130627", 
@@ -21,13 +22,22 @@ class GameSpotIE(InfoExtractor):
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group(3).split("-")[-1]
-        info_url = "http://www.gamespot.com/pages/video_player/xml.php?id="+str(video_id)
+        page_id = mobj.group('page_id')
+        webpage = self._download_webpage(url, page_id)
+        video_id = self._html_search_regex([r'"og:video" content=".*?\?id=(\d+)"',
+                                            r'http://www\.gamespot\.com/videoembed/(\d+)'],
+                                           webpage, 'video id')
+        data = compat_urllib_parse.urlencode({'id': video_id, 'newplayer': '1'})
+        info_url = 'http://www.gamespot.com/pages/video_player/xml.php?' + data
         info_xml = self._download_webpage(info_url, video_id)
         doc = xml.etree.ElementTree.fromstring(info_xml)
         clip_el = doc.find('./playList/clip')
 
-        video_url = clip_el.find('./URI').text
+        http_urls = [{'url': node.find('filePath').text,
+                      'rate': int(node.find('rate').text)}
+            for node in clip_el.find('./httpURI')]
+        best_quality = sorted(http_urls, key=lambda f: f['rate'])[-1]
+        video_url = best_quality['url']
         title = clip_el.find('./title').text
         ext = video_url.rpartition('.')[2]
         thumbnail_url = clip_el.find('./screenGrabURI').text
index 20bc533300aa38d5d8b2d6a13eefee44fe439f72..33790741f8184a9d5e8002f0c7afc1ea20c2a986 100644 (file)
@@ -1,3 +1,5 @@
+# encoding: utf-8
+
 import os
 import re
 
@@ -9,20 +11,34 @@ from ..utils import (
 
     ExtractorError,
 )
+from .brightcove import BrightcoveIE
 
 class GenericIE(InfoExtractor):
     IE_DESC = u'Generic downloader that works on some sites'
     _VALID_URL = r'.*'
     IE_NAME = u'generic'
-    _TEST = {
-        u'url': u'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
-        u'file': u'13601338388002.mp4',
-        u'md5': u'85b90ccc9d73b4acd9138d3af4c27f89',
-        u'info_dict': {
-            u"uploader": u"www.hodiho.fr", 
-            u"title": u"R\u00e9gis plante sa Jeep"
-        }
-    }
+    _TESTS = [
+        {
+            u'url': u'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
+            u'file': u'13601338388002.mp4',
+            u'md5': u'85b90ccc9d73b4acd9138d3af4c27f89',
+            u'info_dict': {
+                u"uploader": u"www.hodiho.fr", 
+                u"title": u"R\u00e9gis plante sa Jeep"
+            }
+        },
+        {
+            u'url': u'http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/',
+            u'file': u'2371591881001.mp4',
+            u'md5': u'9e80619e0a94663f0bdc849b4566af19',
+            u'note': u'Test Brightcove downloads and detection in GenericIE',
+            u'info_dict': {
+                u'title': u'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”',
+                u'uploader': u'8TV',
+                u'description': u'md5:a950cc4285c43e44d763d036710cd9cd',
+            }
+        },
+    ]
 
     def report_download_webpage(self, video_id):
         """Report webpage download."""
@@ -103,6 +119,13 @@ class GenericIE(InfoExtractor):
             raise ExtractorError(u'Invalid URL: %s' % url)
 
         self.report_extraction(video_id)
+        # Look for Brightcove:
+        m_brightcove = re.search(r'<object.+?class=".*?BrightcoveExperience.*?".+?</object>', webpage, re.DOTALL)
+        if m_brightcove is not None:
+            self.to_screen(u'Brightcove video detected.')
+            bc_url = BrightcoveIE._build_brighcove_url(m_brightcove.group())
+            return self.url_result(bc_url, 'Brightcove')
+
         # Start with something easy: JW Player in SWFObject
         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
         if mobj is None:
index ecac4ec40b48d34d7b4e849cb4af0a4ba565b3b9..91658f8925cac6199bda5f7aa05aa0a2a73e85e4 100644 (file)
@@ -23,14 +23,16 @@ class SteamIE(InfoExtractor):
                 u"file": u"81300.flv",
                 u"md5": u"f870007cee7065d7c76b88f0a45ecc07",
                 u"info_dict": {
-                        u"title": u"Terraria 1.1 Trailer"
+                        u"title": u"Terraria 1.1 Trailer",
+                        u'playlist_index': 1,
                 }
             },
             {
                 u"file": u"80859.flv",
                 u"md5": u"61aaf31a5c5c3041afb58fb83cbb5751",
                 u"info_dict": {
-                    u"title": u"Terraria Trailer"
+                    u"title": u"Terraria Trailer",
+                    u'playlist_index': 2,
                 }
             }
         ]
diff --git a/youtube_dl/extractor/veoh.py b/youtube_dl/extractor/veoh.py
new file mode 100644 (file)
index 0000000..00672c9
--- /dev/null
@@ -0,0 +1,47 @@
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+)
+
+class VeohIE(InfoExtractor):
+    _VALID_URL = r'http://www\.veoh\.com/watch/v(?P<id>\d*)'
+
+    _TEST = {
+        u'url': u'http://www.veoh.com/watch/v56314296nk7Zdmz3',
+        u'file': u'56314296.mp4',
+        u'md5': u'620e68e6a3cff80086df3348426c9ca3',
+        u'info_dict': {
+            u'title': u'Straight Backs Are Stronger',
+            u'uploader': u'LUMOback',
+            u'description': u'At LUMOback, we believe straight backs are stronger.  The LUMOback Posture & Movement Sensor:  It gently vibrates when you slouch, inspiring improved posture and mobility.  Use the app to track your data and improve your posture over time. ',
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        webpage = self._download_webpage(url, video_id)
+
+        m_youtube = re.search(r'http://www\.youtube\.com/v/(.*?)(\&|")', webpage)
+        if m_youtube is not None:
+            youtube_id = m_youtube.group(1)
+            self.to_screen(u'%s: detected Youtube video.' % video_id)
+            return self.url_result(youtube_id, 'Youtube')
+
+        self.report_extraction(video_id)
+        info = self._search_regex(r'videoDetailsJSON = \'({.*?})\';', webpage, 'info')
+        info = json.loads(info)
+        video_url =  info.get('fullPreviewHashHighPath') or info.get('fullPreviewHashLowPath')
+
+        return {'id': info['videoId'], 
+                'title': info['title'],
+                'ext': determine_ext(video_url),
+                'url': video_url,
+                'uploader': info['username'],
+                'thumbnail': info.get('highResImage') or info.get('medResImage'),
+                'description': info['description'],
+                'view_count': info['views'],
+                }
index 61b7b561f46e4dbe835704dd24d79c358c27278b..87f9994bac5f8ef5b46730b10664d0c15b52aa6c 100644 (file)
@@ -117,7 +117,19 @@ class YoutubeIE(InfoExtractor):
                 u"uploader": u"IconaPop",
                 u"uploader_id": u"IconaPop"
             }
-        }
+        },
+        {
+            u"url":  u"https://www.youtube.com/watch?v=07FYdnEawAQ",
+            u"file":  u"07FYdnEawAQ.mp4",
+            u"note": u"Test VEVO video with age protection (#956)",
+            u"info_dict": {
+                u"upload_date": u"20130703",
+                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
+                u"description": u"md5:64249768eec3bc4276236606ea996373",
+                u"uploader": u"justintimberlakeVEVO",
+                u"uploader_id": u"justintimberlakeVEVO"
+            }
+        },
     ]
 
 
@@ -178,7 +190,7 @@ class YoutubeIE(InfoExtractor):
         elif len(s) == 84:
             return s[83:36:-1] + s[2] + s[35:26:-1] + s[3] + s[25:3:-1] + s[26]
         elif len(s) == 83:
-            return s[52] + s[81:55:-1] + s[2] + s[54:52:-1] + s[82] + s[51:36:-1] + s[55] + s[35:2:-1] + s[36]
+            return s[:81]
         elif len(s) == 82:
             return s[36] + s[79:67:-1] + s[81] + s[66:40:-1] + s[33] + s[39:36:-1] + s[40] + s[35] + s[0] + s[67] + s[32:0:-1] + s[34]
 
@@ -410,15 +422,35 @@ class YoutubeIE(InfoExtractor):
 
         # Get video info
         self.report_video_info_webpage_download(video_id)
-        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
-            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
-                    % (video_id, el_type))
+        if re.search(r'player-age-gate-content">', video_webpage) is not None:
+            self.report_age_confirmation()
+            age_gate = True
+            # We simulate access to the video from www.youtube.com/v/{video_id};
+            # that page can be viewed without logging in to YouTube
+            data = compat_urllib_parse.urlencode({'video_id': video_id,
+                                                  'el': 'embedded',
+                                                  'gl': 'US',
+                                                  'hl': 'en',
+                                                  'eurl': 'https://youtube.googleapis.com/v/' + video_id,
+                                                  'asv': 3,
+                                                  'sts':'1588',
+                                                  })
+            video_info_url = 'https://www.youtube.com/get_video_info?' + data
             video_info_webpage = self._download_webpage(video_info_url, video_id,
                                     note=False,
                                     errnote='unable to download video info webpage')
             video_info = compat_parse_qs(video_info_webpage)
-            if 'token' in video_info:
-                break
+        else:
+            age_gate = False
+            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
+                video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
+                        % (video_id, el_type))
+                video_info_webpage = self._download_webpage(video_info_url, video_id,
+                                        note=False,
+                                        errnote='unable to download video info webpage')
+                video_info = compat_parse_qs(video_info_webpage)
+                if 'token' in video_info:
+                    break
         if 'token' not in video_info:
             if 'reason' in video_info:
                 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
@@ -535,6 +567,8 @@ class YoutubeIE(InfoExtractor):
             self.report_rtmp_download()
             video_url_list = [(None, video_info['conn'][0])]
         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
+            if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
+                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
             url_map = {}
             for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
                 url_data = compat_parse_qs(url_data_str)
@@ -545,9 +579,15 @@ class YoutubeIE(InfoExtractor):
                     elif 's' in url_data:
                         if self._downloader.params.get('verbose'):
                             s = url_data['s'][0]
-                            player = self._search_regex(r'html5player-(.+?)\.js', video_webpage,
-                                'html5 player', fatal=False)
-                            self.to_screen('encrypted signature length %d (%d.%d), itag %s, html5 player %s' %
+                            if age_gate:
+                                player_version = self._search_regex(r'ad3-(.+?)\.swf',
+                                    video_info['ad3_module'][0], 'flash player',
+                                    fatal=False)
+                                player = 'flash player %s' % player_version
+                            else:
+                                player = u'html5 player %s' % self._search_regex(r'html5player-(.+?)\.js', video_webpage,
+                                    'html5 player', fatal=False)
+                            self.to_screen('encrypted signature length %d (%d.%d), itag %s, %s' %
                                 (len(s), len(s.split('.')[0]), len(s.split('.')[1]), url_data['itag'][0], player))
                         signature = self._decrypt_signature(url_data['s'][0])
                         url += '&signature=' + signature
index e7a15714ac829e409ce8aae5a8c3a81e5b14409a..2f20826c24a316f575e004ae2f02adf93e942b9a 100644 (file)
@@ -1,2 +1,2 @@
 
-__version__ = '2013.07.08.1'
+__version__ = '2013.07.10'