Merge pull request #1956 from dstftw/master
author Philipp Hagemeister <phihag@phihag.de>
Fri, 13 Dec 2013 15:41:34 +0000 (07:41 -0800)
committer Philipp Hagemeister <phihag@phihag.de>
Fri, 13 Dec 2013 15:41:34 +0000 (07:41 -0800)
Fix typo in month name

17 files changed:
test/test_YoutubeDL.py
youtube_dl/YoutubeDL.py
youtube_dl/extractor/appletrailers.py
youtube_dl/extractor/clipsyndicate.py
youtube_dl/extractor/common.py
youtube_dl/extractor/dailymotion.py
youtube_dl/extractor/daum.py
youtube_dl/extractor/metacritic.py
youtube_dl/extractor/mixcloud.py
youtube_dl/extractor/mtv.py
youtube_dl/extractor/naver.py
youtube_dl/extractor/pornhub.py
youtube_dl/extractor/soundcloud.py
youtube_dl/extractor/vimeo.py
youtube_dl/extractor/zdf.py
youtube_dl/utils.py
youtube_dl/version.py

index 58cf9c313607020d1493b420f8b93e18ccccd474..3100c362aa6940d2c557dffb5cabb0f5564ef4a8 100644 (file)
@@ -7,6 +7,7 @@ import unittest
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 from test.helper import FakeYDL
+from youtube_dl import YoutubeDL
 
 
 class YDL(FakeYDL):
@@ -140,6 +141,20 @@ class TestFormatSelection(unittest.TestCase):
         self.assertEqual(test_dict['extractor'], 'Foo')
         self.assertEqual(test_dict['playlist'], 'funny videos')
 
+    def test_prepare_filename(self):
+        info = {
+            u'id': u'1234',
+            u'ext': u'mp4',
+            u'width': None,
+        }
+        def fname(templ):
+            ydl = YoutubeDL({'outtmpl': templ})
+            return ydl.prepare_filename(info)
+        self.assertEqual(fname(u'%(id)s.%(ext)s'), u'1234.mp4')
+        self.assertEqual(fname(u'%(id)s-%(width)s.%(ext)s'), u'1234-NA.mp4')
+        # Replace missing fields with 'NA'
+        self.assertEqual(fname(u'%(uploader_date)s-%(id)s.%(ext)s'), u'NA-1234.mp4')
+
 
 if __name__ == '__main__':
     unittest.main()
index 2dd7e490710eb81ae5300bbfa5ee03cce932040a..c77777ba04cc734ece8e3e12bdc34df72f325919 100644 (file)
@@ -3,6 +3,7 @@
 
 from __future__ import absolute_import
 
+import collections
 import errno
 import io
 import json
@@ -396,18 +397,17 @@ class YoutubeDL(object):
                 template_dict['playlist_index'] = u'%05d' % template_dict['playlist_index']
 
             sanitize = lambda k, v: sanitize_filename(
-                u'NA' if v is None else compat_str(v),
+                compat_str(v),
                 restricted=self.params.get('restrictfilenames'),
                 is_id=(k == u'id'))
             template_dict = dict((k, sanitize(k, v))
-                                 for k, v in template_dict.items())
+                                 for k, v in template_dict.items()
+                                 if v is not None)
+            template_dict = collections.defaultdict(lambda: u'NA', template_dict)
 
             tmpl = os.path.expanduser(self.params['outtmpl'])
             filename = tmpl % template_dict
             return filename
-        except KeyError as err:
-            self.report_error(u'Erroneous output template')
-            return None
         except ValueError as err:
             self.report_error(u'Error in output template: ' + str(err) + u' (encoding: ' + repr(preferredencoding()) + ')')
             return None
@@ -827,7 +827,7 @@ class YoutubeDL(object):
         if self.params.get('writethumbnail', False):
             if info_dict.get('thumbnail') is not None:
                 thumb_format = determine_ext(info_dict['thumbnail'], u'jpg')
-                thumb_filename = filename.rpartition('.')[0] + u'.' + thumb_format
+                thumb_filename = os.path.splitext(filename)[0] + u'.' + thumb_format
                 self.to_screen(u'[%s] %s: Downloading thumbnail ...' %
                                (info_dict['extractor'], info_dict['id']))
                 try:
index a527f10de250596e42f19f0957433e2a72fe5bbf..ef5644aa54fe28002dc4d8c76308941c264252e3 100644 (file)
@@ -1,5 +1,4 @@
 import re
-import xml.etree.ElementTree
 import json
 
 from .common import InfoExtractor
@@ -65,18 +64,18 @@ class AppleTrailersIE(InfoExtractor):
         uploader_id = mobj.group('company')
 
         playlist_url = compat_urlparse.urljoin(url, u'includes/playlists/itunes.inc')
-        playlist_snippet = self._download_webpage(playlist_url, movie)
-        playlist_cleaned = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', playlist_snippet)
-        playlist_cleaned = re.sub(r'<img ([^<]*?)>', r'<img \1/>', playlist_cleaned)
-        # The ' in the onClick attributes are not escaped, it couldn't be parsed
-        # with xml.etree.ElementTree.fromstring
-        # like: http://trailers.apple.com/trailers/wb/gravity/
-        def _clean_json(m):
-            return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
-        playlist_cleaned = re.sub(self._JSON_RE, _clean_json, playlist_cleaned)
-        playlist_html = u'<html>' + playlist_cleaned + u'</html>'
+        def fix_html(s):
+            s = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', s)
+            s = re.sub(r'<img ([^<]*?)>', r'<img \1/>', s)
+            # The ' in the onClick attributes are not escaped, it couldn't be parsed
+            # like: http://trailers.apple.com/trailers/wb/gravity/
+            def _clean_json(m):
+                return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
+            s = re.sub(self._JSON_RE, _clean_json, s)
+            s = u'<html>' + s + u'</html>'
+            return s
+        doc = self._download_xml(playlist_url, movie, transform_source=fix_html)
 
-        doc = xml.etree.ElementTree.fromstring(playlist_html)
         playlist = []
         for li in doc.findall('./div/ul/li'):
             on_click = li.find('.//a').attrib['onClick']
index d4fc869732a8ae15e60f0963ef3418abac1a9201..c60089ad353274adaa380671cee9d4e3ce2e2718 100644 (file)
@@ -1,9 +1,9 @@
 import re
-import xml.etree.ElementTree
 
 from .common import InfoExtractor
 from ..utils import (
     find_xpath_attr,
+    fix_xml_all_ampersand,
 )
 
 
@@ -30,12 +30,10 @@ class ClipsyndicateIE(InfoExtractor):
         # it includes a required token
         flvars = self._search_regex(r'flvars: "(.*?)"', js_player, u'flvars')
 
-        playlist_page = self._download_webpage(
+        pdoc = self._download_xml(
             'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars,
-            video_id, u'Downloading video info') 
-        # Fix broken xml
-        playlist_page = re.sub('&', '&amp;', playlist_page)
-        pdoc = xml.etree.ElementTree.fromstring(playlist_page.encode('utf-8'))
+            video_id, u'Downloading video info',
+            transform_source=fix_xml_all_ampersand) 
 
         track_doc = pdoc.find('trackList/track')
         def find_param(name):
index 534908a2b89af8db08f5d7ba0fc7f983c1199fa9..69a083b68aa3cee7d8ec2a6af9af7108a28bdf9b 100644 (file)
@@ -230,9 +230,12 @@ class InfoExtractor(object):
             return content
 
     def _download_xml(self, url_or_request, video_id,
-                      note=u'Downloading XML', errnote=u'Unable to download XML'):
+                      note=u'Downloading XML', errnote=u'Unable to download XML',
+                      transform_source=None):
         """Return the xml as an xml.etree.ElementTree.Element"""
         xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
+        if transform_source:
+            xml_string = transform_source(xml_string)
         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 
     def to_screen(self, msg):
index 3bd0b862c6551c8f40207f62db2daf964621db47..aea7e557e85457b0526812afb37c46f249e35826 100644 (file)
@@ -101,10 +101,6 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
             self.to_screen(u'Vevo video detected: %s' % vevo_id)
             return self.url_result(u'vevo:%s' % vevo_id, ie='Vevo')
 
-        video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
-                                             # Looking for official user
-                                             r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
-                                            webpage, 'video uploader', fatal=False)
         age_limit = self._rta_search(webpage)
 
         video_upload_date = None
@@ -147,13 +143,15 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
             self._list_available_subtitles(video_id, webpage)
             return
 
-        view_count = str_to_int(self._search_regex(
-            r'video_views_value[^>]+>([\d\.,]+)<', webpage, u'view count'))
+        view_count = self._search_regex(
+            r'video_views_count[^>]+>\s+([\d\.,]+)', webpage, u'view count', fatal=False)
+        if view_count is not None:
+            view_count = str_to_int(view_count)
 
         return {
             'id':       video_id,
             'formats': formats,
-            'uploader': video_uploader,
+            'uploader': info['owner_screenname'],
             'upload_date':  video_upload_date,
             'title':    self._og_search_title(webpage),
             'subtitles':    video_subtitles,
index d418ce4a8a29c122e811c96aac76d388c790b560..4876ecb4812710e2509eec8fc19f00dac60d2fde 100644 (file)
@@ -9,7 +9,7 @@ from ..utils import (
 
 
 class DaumIE(InfoExtractor):
-    _VALID_URL = r'https?://tvpot\.daum\.net/.*?clipid=(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/.*?clipid=(?P<id>\d+)'
     IE_NAME = u'daum.net'
 
     _TEST = {
index 6b95b4998852ac61d1061e0dcf6c3f442772fee2..e560c1d354d8b03a05133bf1458ce8d28b84b7bc 100644 (file)
@@ -1,8 +1,10 @@
 import re
-import xml.etree.ElementTree
 import operator
 
 from .common import InfoExtractor
+from ..utils import (
+    fix_xml_all_ampersand,
+)
 
 
 class MetacriticIE(InfoExtractor):
@@ -23,9 +25,8 @@ class MetacriticIE(InfoExtractor):
         video_id = mobj.group('id')
         webpage = self._download_webpage(url, video_id)
         # The xml is not well formatted, there are raw '&'
-        info_xml = self._download_webpage('http://www.metacritic.com/video_data?video=' + video_id,
-            video_id, u'Downloading info xml').replace('&', '&amp;')
-        info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
+        info = self._download_xml('http://www.metacritic.com/video_data?video=' + video_id,
+            video_id, u'Downloading info xml', transform_source=fix_xml_all_ampersand)
 
         clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id)
         formats = []
index 04fa3ac7ac7a3bc5ffa570d201ddfbb54762fc2b..125d81551c26ea67eff82f2d2189bd058d16b873 100644 (file)
@@ -37,6 +37,9 @@ class MixcloudIE(InfoExtractor):
 
         return None
 
+    def _get_url(self, template_url):
+        return self.check_urls(template_url % i for i in range(30))
+
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
 
@@ -52,13 +55,18 @@ class MixcloudIE(InfoExtractor):
         preview_url = self._search_regex(r'data-preview-url="(.+?)"', webpage, u'preview url')
         song_url = preview_url.replace('/previews/', '/cloudcasts/originals/')
         template_url = re.sub(r'(stream\d*)', 'stream%d', song_url)
-        final_song_url = self.check_urls(template_url % i for i in range(30))
+        final_song_url = self._get_url(template_url)
+        if final_song_url is None:
+            self.to_screen('Trying with m4a extension')
+            template_url = template_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/')
+            final_song_url = self._get_url(template_url)
+        if final_song_url is None:
+            raise ExtractorError(u'Unable to extract track url')
 
         return {
             'id': track_id,
             'title': info['name'],
             'url': final_song_url,
-            'ext': 'mp3',
             'description': info.get('description'),
             'thumbnail': info['pictures'].get('extra_large'),
             'uploader': info['user']['name'],
index 6b3feb560768f96c4d5b3bb3adc0989ecf1c1d4f..5b2bd96334e2eb1a090db2260f08ab9cc1a4b882 100644 (file)
@@ -82,8 +82,13 @@ class MTVServicesInfoExtractor(InfoExtractor):
     def _get_videos_info(self, uri):
         video_id = self._id_from_uri(uri)
         data = compat_urllib_parse.urlencode({'uri': uri})
-        idoc = self._download_xml(self._FEED_URL +'?' + data, video_id,
-                                         u'Downloading info')
+
+        def fix_ampersand(s):
+            """ Fix unencoded ampersand in XML """
+            return s.replace(u'& ', '&amp; ')
+        idoc = self._download_xml(
+            self._FEED_URL + '?' + data, video_id,
+            u'Downloading info', transform_source=fix_ampersand)
         return [self._get_video_info(item) for item in idoc.findall('.//item')]
 
 
index c012ec0cfacb2afea6b395c5c87509f53ed58614..4cab30631956b903682fc2de7aa5dd551bcdd4a3 100644 (file)
@@ -9,7 +9,7 @@ from ..utils import (
 
 
 class NaverIE(InfoExtractor):
-    _VALID_URL = r'https?://tvcast\.naver\.com/v/(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:m\.)?tvcast\.naver\.com/v/(?P<id>\d+)'
 
     _TEST = {
         u'url': u'http://tvcast.naver.com/v/81652',
index 8b3471919565d4c7044d51eb24e8ef01cc8e77fc..d9135c6b929765e87b13e58f6fd6af5567c55199 100644 (file)
@@ -12,7 +12,7 @@ from ..aes import (
 )
 
 class PornHubIE(InfoExtractor):
-    _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>pornhub\.com/view_video\.php\?viewkey=(?P<videoid>[0-9]+))'
+    _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>pornhub\.com/view_video\.php\?viewkey=(?P<videoid>[0-9a-f]+))'
     _TEST = {
         u'url': u'http://www.pornhub.com/view_video.php?viewkey=648719015',
         u'file': u'648719015.mp4',
index 5c026c0b8f914ed014b9dceb0816d741a7cfddb0..cbba4094bf14c1096cd5b5d6d657513d230655a5 100644 (file)
@@ -73,6 +73,19 @@ class SoundcloudIE(InfoExtractor):
                 u'upload_date': u'20131209',
             },
         },
+        # downloadable song
+        {
+            u'url': u'https://soundcloud.com/simgretina/just-your-problem-baby-1',
+            u'md5': u'56a8b69568acaa967b4c49f9d1d52d19',
+            u'info_dict': {
+                u'id': u'105614606',
+                u'ext': u'wav',
+                u'title': u'Just Your Problem Baby (Acapella)',
+                u'description': u'Vocals',
+                u'uploader': u'Sim Gretina',
+                u'upload_date': u'20130815',
+            },
+        },
     ]
 
     _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28'
@@ -99,7 +112,7 @@ class SoundcloudIE(InfoExtractor):
         thumbnail = info['artwork_url']
         if thumbnail is not None:
             thumbnail = thumbnail.replace('-large', '-t500x500')
-        ext = info.get('original_format', u'mp3')
+        ext = u'mp3'
         result = {
             'id': track_id,
             'uploader': info['user']['username'],
@@ -115,7 +128,7 @@ class SoundcloudIE(InfoExtractor):
                     track_id, self._CLIENT_ID))
             result['formats'] = [{
                 'format_id': 'download',
-                'ext': ext,
+                'ext': info.get('original_format', u'mp3'),
                 'url': format_url,
                 'vcodec': 'none',
             }]
index fb2bd225ab0b3c21b16b9a717475cfc42232d4e5..ea440952898a15f24019ba421c9efbe810ff1fec 100644 (file)
@@ -115,7 +115,7 @@ class VimeoIE(InfoExtractor):
     def _real_initialize(self):
         self._login()
 
-    def _real_extract(self, url, new_video=True):
+    def _real_extract(self, url):
         url, data = unsmuggle_url(url)
         headers = std_headers
         if data is not None:
@@ -151,8 +151,14 @@ class VimeoIE(InfoExtractor):
                 config = json.loads(config_json)
             except RegexNotFoundError:
                 # For pro videos or player.vimeo.com urls
-                config = self._search_regex([r' = {config:({.+?}),assets:', r'(?:c|b)=({.+?});'],
-                    webpage, u'info section', flags=re.DOTALL)
+                # We try to find out to which variable is assigned the config dic
+                m_variable_name = re.search('(\w)\.video\.id', webpage)
+                if m_variable_name is not None:
+                    config_re = r'%s=({.+?});' % re.escape(m_variable_name.group(1))
+                else:
+                    config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});']
+                config = self._search_regex(config_re, webpage, u'info section',
+                    flags=re.DOTALL)
                 config = json.loads(config)
         except Exception as e:
             if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
index 689f19735456e2a05defe8f2eb49c5b2f4848580..35ece354a6ecdf7ba5705184d4ceff22d57eb3d4 100644 (file)
@@ -73,14 +73,14 @@ class ZDFIE(InfoExtractor):
             try:
                 proto_pref = -PROTO_ORDER.index(format_m.group('proto'))
             except ValueError:
-                proto_pref = 999
+                proto_pref = -999
 
             quality = fnode.find('./quality').text
             QUALITY_ORDER = ['veryhigh', '300', 'high', 'med', 'low']
             try:
                 quality_pref = -QUALITY_ORDER.index(quality)
             except ValueError:
-                quality_pref = 999
+                quality_pref = -999
 
             abr = int(fnode.find('./audioBitrate').text) // 1000
             vbr = int(fnode.find('./videoBitrate').text) // 1000
index d9bf6c24c39d4d821025d510f8f14f2d66ba652b..4e8a84a56a5b717db73e986c91c2621920fcba47 100644 (file)
@@ -1057,3 +1057,8 @@ def month_by_name(name):
         return ENGLISH_NAMES.index(name) + 1
     except ValueError:
         return None
+
+
+def fix_xml_all_ampersand(xml_str):
+    """Replace all the '&' by '&amp;' in XML"""
+    return xml_str.replace(u'&', u'&amp;')
index 8906d6090dcdff38270aa1c97ec5d9087ac1543e..b9a52fcfab43ee9063395c605d69d9f95509339c 100644 (file)
@@ -1,2 +1,2 @@
 
-__version__ = '2013.12.09.4'
+__version__ = '2013.12.11.2'