Merge remote-tracking branch 'yasoob/master'
author Philipp Hagemeister <phihag@phihag.de>
Sat, 20 Jul 2013 09:23:56 +0000 (11:23 +0200)
committer Philipp Hagemeister <phihag@phihag.de>
Sat, 20 Jul 2013 09:23:56 +0000 (11:23 +0200)
13 files changed:
README.md
devscripts/youtube_genalgo.py
test/test_youtube_sig.py
youtube_dl/__init__.py
youtube_dl/extractor/__init__.py
youtube_dl/extractor/c56.py [new file with mode: 0644]
youtube_dl/extractor/instagram.py
youtube_dl/extractor/sina.py [new file with mode: 0644]
youtube_dl/extractor/ted.py
youtube_dl/extractor/weibo.py [new file with mode: 0644]
youtube_dl/extractor/youku.py
youtube_dl/extractor/youtube.py
youtube_dl/version.py

index b246d3c53317848166351daba7abba9eddbc5359..560bcdca185494cc096c7e1ec7f5a55cf0c34732 100644 (file)
--- a/README.md
+++ b/README.md
@@ -16,7 +16,9 @@ which means you can modify it, redistribute it or use it however you like.
 # OPTIONS
     -h, --help                 print this help text and exit
     --version                  print program version and exit
-    -U, --update               update this program to latest version
+    -U, --update               update this program to latest version. Make sure
+                               that you have sufficient permissions (run with
+                               sudo if needed)
     -i, --ignore-errors        continue on download errors
     --dump-user-agent          display the current browser identification
     --user-agent UA            specify a custom user agent
index 1fce316dd21fcc723f9d2a421a735ce3865dafa0..2b3879f0a5749bce22551c5a9d5a9950d2a1b665 100644 (file)
--- a/devscripts/youtube_genalgo.py
+++ b/devscripts/youtube_genalgo.py
@@ -5,6 +5,12 @@
 import sys
 
 tests = [
+    # 92 - vflQw-fB4 2013/07/17
+    ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'`~\"",
+     "mrtyuioplkjhgfdsazxcvbnq1234567890QWERTY}IOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]\"|:;"),
+    # 90
+    ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'`",
+     "mrtyuioplkjhgfdsazxcvbne1234567890QWER[YUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={`]}|"),
     # 88
     ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<",
      "J:|}][{=+-_)(*&;%$#@>MNBVCXZASDFGH^KLPOIUYTREWQ0987654321mnbvcxzasdfghrklpoiuytej"),
@@ -14,9 +20,9 @@ tests = [
     # 86 - vfl_ymO4Z 2013/06/27
     ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<",
      "ertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!/#$%^&*()_-+={[|};?@"),
-    # 85
+    # 85 - vflSAFCP9 2013/07/19
     ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?/>.<",
-     "{>/?;}[.=+-_)(*&^%$#@!MqBVCXZASDFwHJKLPOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytr"),
+     "ertyuiqplkjhgfdsazx$vbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#<%^&*()_-+={[};?/c"),
     # 84
     ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?>.<",
      "<.>?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOIUYTREWe098765432rmnbvcxzasdfghjklpoiuyt1"),
index b8e51f479b05a674ccf3368c7f4ef5180752b323..51b300532ddaf7b6c92c02b1844b519d4d444b8b 100644 (file)
--- a/test/test_youtube_sig.py
+++ b/test/test_youtube_sig.py
@@ -13,9 +13,14 @@ from helper import FakeYDL
 sig = YoutubeIE(FakeYDL())._decrypt_signature
 
 class TestYoutubeSig(unittest.TestCase):
-    def test_43_43(self):
-        wrong = '5AEEAE0EC39677BC65FD9021CCD115F1F2DBD5A59E4.C0B243A3E2DED6769199AF3461781E75122AE135135'
-        right = '931EA22157E1871643FA9519676DED253A342B0C.4E95A5DBD2F1F511DCC1209DF56CB77693CE0EAE'
+    def test_92(self):
+        wrong = "F9F9B6E6FD47029957AB911A964CC20D95A181A5D37A2DBEFD67D403DB0E8BE4F4910053E4E8A79.0B70B.0B80B8"
+        right = "69B6E6FD47029957AB911A9F4CC20D95A181A5D3.A2DBEFD67D403DB0E8BE4F4910053E4E8A7980B7"
+        self.assertEqual(sig(wrong), right)
+
+    def test_90(self):
+        wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'`"
+        right = "mrtyuioplkjhgfdsazxcvbne1234567890QWER[YUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={`]}|"
         self.assertEqual(sig(wrong), right)
 
     def test_88(self):
@@ -35,7 +40,7 @@ class TestYoutubeSig(unittest.TestCase):
 
     def test_85(self):
         wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?/>.<"
-        right = "{>/?;}[.=+-_)(*&^%$#@!MqBVCXZASDFwHJKLPOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytr"
+        right = "ertyuiqplkjhgfdsazx$vbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#<%^&*()_-+={[};?/c"
         self.assertEqual(sig(wrong), right)
 
     def test_84(self):
@@ -58,10 +63,5 @@ class TestYoutubeSig(unittest.TestCase):
         right = "urty8ioplkjhgfdsazxcvbqm1234567e90QWERTYUIOPLKHGFDSnZXCVBNM!@#$%^&*(-+={[};?/>."
         self.assertEqual(sig(wrong), right)
 
-    def test_92(self):
-        wrong = "F9F9B6E6FD47029957AB911A964CC20D95A181A5D37A2DBEFD67D403DB0E8BE4F4910053E4E8A79.0B70B.0B80B8"
-        right = "69B6E6FD47029957AB911A9F4CC20D95A181A5D3.A2DBEFD67D403DB0E8BE4F4910053E4E8A7980B7"
-        self.assertEqual(sig(wrong), right)
-
 if __name__ == '__main__':
     unittest.main()
index 31427f7c52fa56db4213fd6906b7e60d4c9345aa..eb23c53a570fa84eabd64ec7f83a2fc045a8727c 100644 (file)
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -129,7 +129,7 @@ def parseOpts(overrideArguments=None):
     general.add_option('-v', '--version',
             action='version', help='print program version and exit')
     general.add_option('-U', '--update',
-            action='store_true', dest='update_self', help='update this program to latest version')
+            action='store_true', dest='update_self', help='update this program to latest version. Make sure that you have sufficient permissions (run with sudo if needed)')
     general.add_option('-i', '--ignore-errors',
             action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
     general.add_option('--dump-user-agent',
index 70a2363e454f1b5b7fc83fd3551aa0b15630e24d..b4a1c20e9489042f6876b1c98fb0d1589dfcfce4 100644 (file)
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -52,6 +52,7 @@ from .pornotube import PornotubeIE
 from .rbmaradio import RBMARadioIE
 from .redtube import RedTubeIE
 from .ringtv import RingTVIE
+from .sina import SinaIE
 from .soundcloud import SoundcloudIE, SoundcloudSetIE
 from .spiegel import SpiegelIE
 from .stanfordoc import StanfordOpenClassroomIE
@@ -71,7 +72,9 @@ from .veoh import VeohIE
 from .vevo import VevoIE
 from .vimeo import VimeoIE
 from .vine import VineIE
+from .c56 import C56IE
 from .wat import WatIE
+from .weibo import WeiboIE
 from .wimp import WimpIE
 from .worldstarhiphop import WorldStarHipHopIE
 from .xhamster import XHamsterIE
diff --git a/youtube_dl/extractor/c56.py b/youtube_dl/extractor/c56.py
new file mode 100644 (file)
index 0000000..4c8a8af
--- /dev/null
+++ b/youtube_dl/extractor/c56.py
@@ -0,0 +1,36 @@
+# coding: utf-8
+
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import determine_ext
+
+class C56IE(InfoExtractor):
+    _VALID_URL = r'https?://((www|player)\.)?56\.com/(.+?/)?(v_|(play_album.+-))(?P<textid>.+?)\.(html|swf)'
+    IE_NAME = u'56.com'
+
+    _TEST ={
+        u'url': u'http://www.56.com/u39/v_OTM0NDA3MTY.html',
+        u'file': u'93440716.mp4',
+        u'md5': u'9dc07b5c8e978112a6441f9e75d2b59e',
+        u'info_dict': {
+            u'title': u'网事知多少 第32期:车怒',
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
+        text_id = mobj.group('textid')
+        info_page = self._download_webpage('http://vxml.56.com/json/%s/' % text_id,
+                                           text_id, u'Downloading video info')
+        info = json.loads(info_page)['info']
+        best_format = sorted(info['rfiles'], key=lambda f: int(f['filesize']))[-1]
+        video_url = best_format['url']
+
+        return {'id': info['vid'],
+                'title': info['Subject'],
+                'url': video_url,
+                'ext': determine_ext(video_url),
+                'thumbnail': info.get('bimg') or info.get('img'),
+                }
index f9ac8d5b45b72764a441cc2d520841414b742f55..ddc42882a436a216cbd24b0b28d03da89ec27b0d 100644 (file)
--- a/youtube_dl/extractor/instagram.py
+++ b/youtube_dl/extractor/instagram.py
@@ -10,7 +10,8 @@ class InstagramIE(InfoExtractor):
         u'md5': u'0d2da106a9d2631273e192b372806516',
         u'info_dict': {
             u"uploader_id": u"naomipq", 
-            u"title": u"Video by naomipq"
+            u"title": u"Video by naomipq",
+            u'description': u'md5:1f17f0ab29bd6fe2bfad705f58de3cb8',
         }
     }
 
@@ -18,20 +19,17 @@ class InstagramIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group(1)
         webpage = self._download_webpage(url, video_id)
-        html_title = self._html_search_regex(
-            r'<title>(.+?)</title>',
-            webpage, u'title', flags=re.DOTALL)
-        title = re.sub(u'(?: *\(Videos?\))? \u2022 Instagram$', '', html_title).strip()
-        uploader_id = self._html_search_regex(
-            r'<div class="media-user" id="media_user">.*?<h2><a href="[^"]*">([^<]*)</a></h2>',
-            webpage, u'uploader id', fatal=False, flags=re.DOTALL)
-        ext = 'mp4'
+        uploader_id = self._search_regex(r'"owner":{"username":"(.+?)"',
+            webpage, u'uploader id', fatal=False)
+        desc = self._search_regex(r'"caption":"(.*?)"', webpage, u'description',
+            fatal=False)
 
         return [{
             'id':        video_id,
             'url':       self._og_search_video_url(webpage),
-            'ext':       ext,
-            'title':     title,
+            'ext':       'mp4',
+            'title':     u'Video by %s' % uploader_id,
             'thumbnail': self._og_search_thumbnail(webpage),
-            'uploader_id' : uploader_id
+            'uploader_id' : uploader_id,
+            'description': desc,
         }]
diff --git a/youtube_dl/extractor/sina.py b/youtube_dl/extractor/sina.py
new file mode 100644 (file)
index 0000000..14b1c65
--- /dev/null
+++ b/youtube_dl/extractor/sina.py
@@ -0,0 +1,67 @@
+# coding: utf-8
+
+import re
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_request,
+    compat_urllib_parse,
+)
+
+
+class SinaIE(InfoExtractor):
+    _VALID_URL = r'''https?://(.*?\.)?video\.sina\.com\.cn/
+                        (
+                            (.+?/(((?P<pseudo_id>\d+).html)|(.*?(\#|(vid=))(?P<id>\d+?)($|&))))
+                            |
+                            # This is used by external sites like Weibo
+                            (api/sinawebApi/outplay.php/(?P<token>.+?)\.swf)
+                        )
+                  '''
+
+    _TEST = {
+        u'url': u'http://video.sina.com.cn/news/vlist/zt/chczlj2013/?opsubject_id=top12#110028898',
+        u'file': u'110028898.flv',
+        u'md5': u'd65dd22ddcf44e38ce2bf58a10c3e71f',
+        u'info_dict': {
+            u'title': u'《中国新闻》 朝鲜要求巴拿马立即释放被扣船员',
+        }
+    }
+
+    @classmethod
+    def suitable(cls, url):
+        return re.match(cls._VALID_URL, url, flags=re.VERBOSE) is not None
+
+    def _extract_video(self, video_id):
+        data = compat_urllib_parse.urlencode({'vid': video_id})
+        url_page = self._download_webpage('http://v.iask.com/v_play.php?%s' % data,
+            video_id, u'Downloading video url')
+        image_page = self._download_webpage(
+            'http://interface.video.sina.com.cn/interface/common/getVideoImage.php?%s' % data,
+            video_id, u'Downloading thumbnail info')
+        url_doc = xml.etree.ElementTree.fromstring(url_page.encode('utf-8'))
+
+        return {'id': video_id,
+                'url': url_doc.find('./durl/url').text,
+                'ext': 'flv',
+                'title': url_doc.find('./vname').text,
+                'thumbnail': image_page.split('=')[1],
+                }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
+        video_id = mobj.group('id')
+        if mobj.group('token') is not None:
+            # The video id is in the redirected url
+            self.to_screen(u'Getting video id')
+            request = compat_urllib_request.Request(url)
+            request.get_method = lambda: 'HEAD'
+            (_, urlh) = self._download_webpage_handle(request, 'NA', False)
+            return self._real_extract(urlh.geturl())
+        elif video_id is None:
+            pseudo_id = mobj.group('pseudo_id')
+            webpage = self._download_webpage(url, pseudo_id)
+            video_id = self._search_regex(r'vid:\'(\d+?)\'', webpage, u'video id')
+
+        return self._extract_video(video_id)
index 8b73b8340c40badad0023a53cc5b10b363e57b6a..4c11f7a03c37136c0c80677e55b66598c647edeb 100644 (file)
--- a/youtube_dl/extractor/ted.py
+++ b/youtube_dl/extractor/ted.py
@@ -67,7 +67,7 @@ class TEDIE(InfoExtractor):
         webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
         self.report_extraction(video_name)
         # If the url includes the language we get the title translated
-        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
+        title = self._html_search_regex(r'<span .*?id="altHeadline".+?>(?P<title>.*)</span>',
                                         webpage, 'title')
         json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                     webpage, 'json data')
diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py
new file mode 100644 (file)
index 0000000..0757495
--- /dev/null
+++ b/youtube_dl/extractor/weibo.py
@@ -0,0 +1,48 @@
+# coding: utf-8
+
+import re
+import json
+
+from .common import InfoExtractor
+
+class WeiboIE(InfoExtractor):
+    """
+    The videos in Weibo come from different sites, this IE just finds the link
+    to the external video and returns it.
+    """
+    _VALID_URL = r'https?://video\.weibo\.com/v/weishipin/t_(?P<id>.+?)\.htm'
+
+    _TEST = {
+        u'url': u'http://video.weibo.com/v/weishipin/t_zjUw2kZ.htm',
+        u'file': u'98322879.flv',
+        u'info_dict': {
+            u'title': u'魔声耳机最新广告“All Eyes On Us”',
+        },
+        u'note': u'Sina video',
+        u'params': {
+            u'skip_download': True,
+        },
+    }
+
+    # Additional example videos from different sites
+    # Youku: http://video.weibo.com/v/weishipin/t_zQGDWQ8.htm
+    # 56.com: http://video.weibo.com/v/weishipin/t_zQ44HxN.htm
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
+        video_id = mobj.group('id')
+        info_url = 'http://video.weibo.com/?s=v&a=play_list&format=json&mix_video_id=t_%s' % video_id
+        info_page = self._download_webpage(info_url, video_id)
+        info = json.loads(info_page)
+
+        videos_urls = map(lambda v: v['play_page_url'], info['result']['data'])
+        #Prefer sina video since they have thumbnails
+        videos_urls = sorted(videos_urls, key=lambda u: u'video.sina.com' in u)
+        player_url = videos_urls[-1]
+        m_sina = re.match(r'https?://video.sina.com.cn/v/b/(\d+)-\d+.html', player_url)
+        if m_sina is not None:
+            self.to_screen('Sina video detected')
+            sina_id = m_sina.group(1)
+            player_url = 'http://you.video.sina.com.cn/swf/quotePlayer.swf?vid=%s' % sina_id
+        return self.url_result(player_url)
+
index eb98298019c04334276688a7d9c6a5db8bd90664..996d384784cb827ed4baa3304b61782542ea0767 100644 (file)
--- a/youtube_dl/extractor/youku.py
+++ b/youtube_dl/extractor/youku.py
@@ -13,7 +13,7 @@ from ..utils import (
 
 
 class YoukuIE(InfoExtractor):
-    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
+    _VALID_URL =  r'(?:http://)?(v|player)\.youku\.com/(v_show/id_|player\.php/sid/)(?P<ID>[A-Za-z0-9]+)(\.html|/v.swf)'
     _TEST =   {
         u"url": u"http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html",
         u"file": u"XNDgyMDQ2NTQw_part00.flv",
index 580f39ee848064380fbc420bcf6eb2eaae4b26aa..12e8fc25d6f5c19f2eca21dae9b0b1caec508650 100644 (file)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -179,14 +179,18 @@ class YoutubeIE(InfoExtractor):
     def _decrypt_signature(self, s):
         """Turn the encrypted s field into a working signature"""
 
-        if len(s) == 88:
+        if len(s) == 92:
+            return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
+        elif len(s) == 90:
+            return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
+        elif len(s) == 88:
             return s[48] + s[81:67:-1] + s[82] + s[66:62:-1] + s[85] + s[61:48:-1] + s[67] + s[47:12:-1] + s[3] + s[11:3:-1] + s[2] + s[12]
         elif len(s) == 87:
             return s[62] + s[82:62:-1] + s[83] + s[61:52:-1] + s[0] + s[51:2:-1]
         elif len(s) == 86:
             return s[2:63] + s[82] + s[64:82] + s[63]
         elif len(s) == 85:
-            return s[76] + s[82:76:-1] + s[83] + s[75:60:-1] + s[0] + s[59:50:-1] + s[1] + s[49:2:-1]
+            return s[2:8] + s[0] + s[9:21] + s[65] + s[22:65] + s[84] + s[66:82] + s[21]
         elif len(s) == 84:
             return s[83:36:-1] + s[2] + s[35:26:-1] + s[3] + s[25:3:-1] + s[26]
         elif len(s) == 83:
@@ -195,8 +199,6 @@ class YoutubeIE(InfoExtractor):
             return s[36] + s[79:67:-1] + s[81] + s[66:40:-1] + s[33] + s[39:36:-1] + s[40] + s[35] + s[0] + s[67] + s[32:0:-1] + s[34]
         elif len(s) == 81:
             return s[6] + s[3:6] + s[33] + s[7:24] + s[0] + s[25:33] + s[2] + s[34:53] + s[24] + s[54:81]
-        elif len(s) == 92:
-            return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83];
 
         else:
             raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
index cba83a48828875873897fac4cee38f9d15af0f4d..32eb27dad67690f6f83cc9366db6a1ee0fcbab40 100644 (file)
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,2 +1,2 @@
 
-__version__ = '2013.07.17.1'
+__version__ = '2013.07.19'