Merge pull request #2939 from codesparkle/upload-date-fix
author: Sergey M. <dstftw@gmail.com>
Tue, 20 May 2014 12:53:28 +0000 (19:53 +0700)
committer: Sergey M. <dstftw@gmail.com>
Tue, 20 May 2014 12:53:28 +0000 (19:53 +0700)
No longer erroneously calculate upload_date within some extractors

youtube_dl/extractor/bandcamp.py
youtube_dl/extractor/generic.py
youtube_dl/extractor/ndr.py
youtube_dl/extractor/nowness.py
youtube_dl/version.py

index 929aafdff3e848af3295eacf1520ec0ec0334966..dcbbdef4346c36c789e49531df1dc602bc35255b 100644 (file)
@@ -19,7 +19,7 @@ class BandcampIE(InfoExtractor):
         'md5': 'c557841d5e50261777a6585648adf439',
         'info_dict': {
             "title": "youtube-dl  \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad",
-            "duration": 10,
+            "duration": 9.8485,
         },
         '_skip': 'There is a limit of 200 free downloads / month for the test song'
     }]
@@ -28,36 +28,32 @@ class BandcampIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         title = mobj.group('title')
         webpage = self._download_webpage(url, title)
-        # We get the link to the free download page
         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
-        if m_download is None:
+        if not m_download:
             m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage)
             if m_trackinfo:
                 json_code = m_trackinfo.group(1)
-                data = json.loads(json_code)
-                d = data[0]
+                data = json.loads(json_code)[0]
 
-                duration = int(round(d['duration']))
                 formats = []
-                for format_id, format_url in d['file'].items():
-                    ext, _, abr_str = format_id.partition('-')
-
+                for format_id, format_url in data['file'].items():
+                    ext, abr_str = format_id.split('-', 1)
                     formats.append({
                         'format_id': format_id,
                         'url': format_url,
-                        'ext': format_id.partition('-')[0],
+                        'ext': ext,
                         'vcodec': 'none',
-                        'acodec': format_id.partition('-')[0],
-                        'abr': int(format_id.partition('-')[2]),
+                        'acodec': ext,
+                        'abr': int(abr_str),
                     })
 
                 self._sort_formats(formats)
 
                 return {
-                    'id': compat_str(d['id']),
-                    'title': d['title'],
+                    'id': compat_str(data['id']),
+                    'title': data['title'],
                     'formats': formats,
-                    'duration': duration,
+                    'duration': float(data['duration']),
                 }
             else:
                 raise ExtractorError('No free songs found')
@@ -67,11 +63,9 @@ class BandcampIE(InfoExtractor):
             r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
             webpage, re.MULTILINE | re.DOTALL).group('id')
 
-        download_webpage = self._download_webpage(download_link, video_id,
-                                                  'Downloading free downloads page')
-        # We get the dictionary of the track from some javascrip code
-        info = re.search(r'items: (.*?),$',
-                         download_webpage, re.MULTILINE).group(1)
+        download_webpage = self._download_webpage(download_link, video_id, 'Downloading free downloads page')
+        # We get the dictionary of the track from some javascript code
+        info = re.search(r'items: (.*?),$', download_webpage, re.MULTILINE).group(1)
         info = json.loads(info)[0]
         # We pick mp3-320 for now, until format selection can be easily implemented.
         mp3_info = info['downloads']['mp3-320']
@@ -100,7 +94,7 @@ class BandcampIE(InfoExtractor):
 
 class BandcampAlbumIE(InfoExtractor):
     IE_NAME = 'Bandcamp:album'
-    _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<title>[^?#]+))?'
+    _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<title>[^?#]+))'
 
     _TEST = {
         'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
@@ -123,7 +117,7 @@ class BandcampAlbumIE(InfoExtractor):
         'params': {
             'playlistend': 2
         },
-        'skip': 'Bancamp imposes download limits. See test_playlists:test_bandcamp_album for the playlist test'
+        'skip': 'Bandcamp imposes download limits. See test_playlists:test_bandcamp_album for the playlist test'
     }
 
     def _real_extract(self, url):
index 2861332826b5f20cdf0493edfdba8a7a12f357a8..38a357d3b0406906144e25cbbc45fbe74d2f6c2c 100644 (file)
@@ -363,8 +363,13 @@ class GenericIE(InfoExtractor):
                     return self.url_result('http://' + url)
                 else:
                     if default_search == 'auto_warning':
-                        self._downloader.report_warning(
-                            'Falling back to youtube search for  %s . Set --default-search to "auto" to suppress this warning.' % url)
+                        if re.match(r'^(?:url|URL)$', url):
+                            raise ExtractorError(
+                                'Invalid URL:  %r . Call youtube-dl like this:  youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc"  ' % url,
+                                expected=True)
+                        else:
+                            self._downloader.report_warning(
+                                'Falling back to youtube search for  %s . Set --default-search to "auto" to suppress this warning.' % url)
                     return self.url_result('ytsearch:' + url)
             else:
                 assert ':' in default_search
@@ -560,7 +565,7 @@ class GenericIE(InfoExtractor):
 
         # Look for embedded NovaMov-based player
         mobj = re.search(
-            r'''(?x)<iframe[^>]+?src=(["\'])
+            r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])
                     (?P<url>http://(?:(?:embed|www)\.)?
                         (?:novamov\.com|
                            nowvideo\.(?:ch|sx|eu|at|ag|co)|
index 0650f956481c9011032a278fc1a9375b98e26539..53b34f5e646b233dd72a0657be5d285e1a534ddc 100644 (file)
@@ -4,7 +4,10 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..utils import ExtractorError
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+)
 
 
 class NDRIE(InfoExtractor):
@@ -45,13 +48,12 @@ class NDRIE(InfoExtractor):
 
         page = self._download_webpage(url, video_id, 'Downloading page')
 
-        title = self._og_search_title(page)
+        title = self._og_search_title(page).strip()
         description = self._og_search_description(page)
+        if description:
+            description = description.strip()
 
-        mobj = re.search(
-            r'<div class="duration"><span class="min">(?P<minutes>\d+)</span>:<span class="sec">(?P<seconds>\d+)</span></div>',
-            page)
-        duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None
+        duration = int_or_none(self._html_search_regex(r'duration: (\d+),\n', page, 'duration', fatal=False))
 
         formats = []
 
@@ -66,10 +68,12 @@ class NDRIE(InfoExtractor):
 
         video_url = re.search(r'''3: {src:'(?P<video>.+?)\.hi\.mp4', type:"video/mp4"},''', page)
         if video_url:
-            thumbnail = self._html_search_regex(r'(?m)title: "NDR PLAYER",\s*poster: "([^"]+)",',
-                page, 'thumbnail', fatal=False)
-            if thumbnail:
-                thumbnail = 'http://www.ndr.de' + thumbnail
+            thumbnails = re.findall(r'''\d+: {src: "([^"]+)"(?: \|\| '[^']+')?, quality: '([^']+)'}''', page)
+            if thumbnails:
+                QUALITIES = ['xs', 's', 'm', 'l', 'xl']
+                thumbnails.sort(key=lambda thumb: QUALITIES.index(thumb[1]) if thumb[1] in QUALITIES else -1)
+                thumbnail = 'http://www.ndr.de' + thumbnails[-1][0]
+
             for format_id in ['lo', 'hi', 'hq']:
                 formats.append({
                     'url': '%s.%s.mp4' % (video_url.group('video'), format_id),
index b1bcb7e54cf3f01989eb17c51160acce680eed2c..1c5e9401f36c72a73a701bdffc89529979a1eaaf 100644 (file)
@@ -4,9 +4,7 @@ import re
 
 from .brightcove import BrightcoveIE
 from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
-)
+from ..utils import ExtractorError
 
 
 class NownessIE(InfoExtractor):
@@ -14,9 +12,10 @@ class NownessIE(InfoExtractor):
 
     _TEST = {
         'url': 'http://www.nowness.com/day/2013/6/27/3131/candor--the-art-of-gesticulation',
-        'file': '2520295746001.mp4',
-        'md5': '0ece2f70a7bd252c7b00f3070182d418',
+        'md5': '068bc0202558c2e391924cb8cc470676',
         'info_dict': {
+            'id': '2520295746001',
+            'ext': 'mp4',
             'description': 'Candor: The Art of Gesticulation',
             'uploader': 'Nowness',
             'title': 'Candor: The Art of Gesticulation',
index 3193fd152cc7dda04ae2f4bfc20732f0cc2513e9..638ff8af5ae995a5ebe7a5d4171a7c02ab0feec4 100644 (file)
@@ -1,2 +1,2 @@
 
-__version__ = '2014.05.17'
+__version__ = '2014.05.19'