Merge pull request #8821 from remitamine/list-thumbnails-order

author remitamine <remitamine@gmail.com>

Sun, 13 Mar 2016 11:44:50 +0000 (12:44 +0100)

committer remitamine <remitamine@gmail.com>

Sun, 13 Mar 2016 11:44:50 +0000 (12:44 +0100)
author remitamine <remitamine@gmail.com>
Sun, 13 Mar 2016 11:44:50 +0000 (12:44 +0100)
committer remitamine <remitamine@gmail.com>
Sun, 13 Mar 2016 11:44:50 +0000 (12:44 +0100)
diff --git a/AUTHORS b/AUTHORS

index b6b47ac5759c4ead1f99ccbcf413a6d064b82f6d..aa48cd5a6015aa965a23b4203349e3bc0a6f690d 100644 (file)
--- a/AUTHORS
+++ b/AUTHORS
@@ -162,3 +162,4 @@ Robin Houtevelts
  Patrick Griffis
  Aidan Rowe
  mutantmonkey
+Ben Congdon
diff --git a/test/test_utils.py b/test/test_utils.py

index 2bcf8ecf03b588e3ca250cb298dff4bcee551f32..bc28ceb344f0388ed11028af3171d2007777c366 100644 (file)
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -41,6 +41,7 @@ from youtube_dl.utils import (
      orderedSet,
      parse_duration,
      parse_filesize,
+    parse_count,
      parse_iso8601,
      read_batch_urls,
      sanitize_filename,
@@ -653,6 +654,15 @@ class TestUtil(unittest.TestCase):
          self.assertEqual(parse_filesize('1.2Tb'), 1200000000000)
          self.assertEqual(parse_filesize('1,24 KB'), 1240)
  
+    def test_parse_count(self):
+        self.assertEqual(parse_count(None), None)
+        self.assertEqual(parse_count(''), None)
+        self.assertEqual(parse_count('0'), 0)
+        self.assertEqual(parse_count('1000'), 1000)
+        self.assertEqual(parse_count('1.000'), 1000)
+        self.assertEqual(parse_count('1.1k'), 1100)
+        self.assertEqual(parse_count('1.1kk'), 1100000)
+
      def test_version_tuple(self):
          self.assertEqual(version_tuple('1'), (1,))
          self.assertEqual(version_tuple('10.23.344'), (10, 23, 344))
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py

index 98de5ddff3f6903f8af818d5dacba4edec6b99ac..c5b80f4aadc423765b6bb3a4ef63289849642655 100644 (file)
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -54,6 +54,7 @@ from .arte import (
  from .atresplayer import AtresPlayerIE
  from .atttechchannel import ATTTechChannelIE
  from .audimedia import AudiMediaIE
+from .audioboom import AudioBoomIE
  from .audiomack import AudiomackIE, AudiomackAlbumIE
  from .azubu import AzubuIE, AzubuLiveIE
  from .baidu import BaiduVideoIE
@@ -188,6 +189,10 @@ from .dumpert import DumpertIE
  from .defense import DefenseGouvFrIE
  from .discovery import DiscoveryIE
  from .dropbox import DropboxIE
+from .dw import (
+    DWIE,
+    DWArticleIE,
+)
  from .eagleplatform import EaglePlatformIE
  from .ebaumsworld import EbaumsWorldIE
  from .echomsk import EchoMskIE
@@ -384,6 +389,7 @@ from .lynda import (
  from .m6 import M6IE
  from .macgamestore import MacGameStoreIE
  from .mailru import MailRuIE
+from .makerschannel import MakersChannelIE
  from .makertv import MakerTVIE
  from .malemotion import MalemotionIE
  from .matchtv import MatchTVIE
@@ -393,6 +399,7 @@ from .metacritic import MetacriticIE
  from .mgoon import MgoonIE
  from .minhateca import MinhatecaIE
  from .ministrygrid import MinistryGridIE
+from .minoto import MinotoIE
  from .miomio import MioMioIE
  from .mit import TechTVMITIE, MITIE, OCWMITIE
  from .mitele import MiTeleIE
@@ -829,7 +836,10 @@ from .vgtv import (
      VGTVIE,
  )
  from .vh1 import VH1IE
-from .vice import ViceIE
+from .vice import (
+    ViceIE,
+    ViceShowIE,
+)
  from .viddler import ViddlerIE
  from .videodetective import VideoDetectiveIE
  from .videofyme import VideofyMeIE
diff --git a/youtube_dl/extractor/audioboom.py b/youtube_dl/extractor/audioboom.py

new file mode 100644 (file)

index 0000000..2ec2d70
--- /dev/null
+++ b/youtube_dl/extractor/audioboom.py
@@ -0,0 +1,66 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import float_or_none
+
+
+class AudioBoomIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?audioboom\.com/boos/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'https://audioboom.com/boos/4279833-3-09-2016-czaban-hour-3?t=0',
+        'md5': '63a8d73a055c6ed0f1e51921a10a5a76',
+        'info_dict': {
+            'id': '4279833',
+            'ext': 'mp3',
+            'title': '3/09/2016 Czaban Hour 3',
+            'description': 'Guest:   Nate Davis - NFL free agency,   Guest:   Stan Gans',
+            'duration': 2245.72,
+            'uploader': 'Steve Czaban',
+            'uploader_url': 're:https?://(?:www\.)?audioboom\.com/channel/steveczabanyahoosportsradio',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        clip = None
+
+        clip_store = self._parse_json(
+            self._search_regex(
+                r'data-new-clip-store=(["\'])(?P<json>{.*?"clipId"\s*:\s*%s.*?})\1' % video_id,
+                webpage, 'clip store', default='{}', group='json'),
+            video_id, fatal=False)
+        if clip_store:
+            clips = clip_store.get('clips')
+            if clips and isinstance(clips, list) and isinstance(clips[0], dict):
+                clip = clips[0]
+
+        def from_clip(field):
+            if clip:
+                clip.get(field)
+
+        audio_url = from_clip('clipURLPriorToLoading') or self._og_search_property(
+            'audio', webpage, 'audio url')
+        title = from_clip('title') or self._og_search_title(webpage)
+        description = from_clip('description') or self._og_search_description(webpage)
+
+        duration = float_or_none(from_clip('duration') or self._html_search_meta(
+            'weibo:audio:duration', webpage))
+
+        uploader = from_clip('author') or self._og_search_property(
+            'audio:artist', webpage, 'uploader', fatal=False)
+        uploader_url = from_clip('author_url') or self._html_search_meta(
+            'audioboo:channel', webpage, 'uploader url')
+
+        return {
+            'id': video_id,
+            'url': audio_url,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'uploader': uploader,
+            'uploader_url': uploader_url,
+        }
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py

index 9d0dfb9611687b15075d0e6fc7d57dfa0244c60a..e62b3860e99b106d08ef79cf593e180fe8c9496c 100644 (file)
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@@ -10,7 +10,6 @@ from ..utils import (
      int_or_none,
      parse_duration,
      parse_iso8601,
-    remove_end,
      unescapeHTML,
  )
  from ..compat import (
@@ -561,7 +560,7 @@ class BBCIE(BBCCoUkIE):
          'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
          'info_dict': {
              'id': '3662a707-0af9-3149-963f-47bea720b460',
-            'title': 'BBC Blogs - Adam Curtis - BUGGER',
+            'title': 'BUGGER',
          },
          'playlist_count': 18,
      }, {
@@ -670,9 +669,17 @@ class BBCIE(BBCCoUkIE):
          'url': 'http://www.bbc.com/sport/0/football/34475836',
          'info_dict': {
              'id': '34475836',
-            'title': 'What Liverpool can expect from Klopp',
+            'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
          },
          'playlist_count': 3,
+    }, {
+        # school report article with single video
+        'url': 'http://www.bbc.co.uk/schoolreport/35744779',
+        'info_dict': {
+            'id': '35744779',
+            'title': 'School which breaks down barriers in Jerusalem',
+        },
+        'playlist_count': 1,
      }, {
          # single video with playlist URL from weather section
          'url': 'http://www.bbc.com/weather/features/33601775',
@@ -735,8 +742,17 @@ class BBCIE(BBCCoUkIE):
  
          json_ld_info = self._search_json_ld(webpage, playlist_id, default=None)
          timestamp = json_ld_info.get('timestamp')
+
          playlist_title = json_ld_info.get('title')
-        playlist_description = json_ld_info.get('description')
+        if not playlist_title:
+            playlist_title = self._og_search_title(
+                webpage, default=None) or self._html_search_regex(
+                r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
+            if playlist_title:
+                playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
+
+        playlist_description = json_ld_info.get(
+            'description') or self._og_search_description(webpage, default=None)
  
          if not timestamp:
              timestamp = parse_iso8601(self._search_regex(
@@ -797,8 +813,6 @@ class BBCIE(BBCCoUkIE):
                                  playlist.get('progressiveDownloadUrl'), playlist_id, timestamp))
  
          if entries:
-            playlist_title = playlist_title or remove_end(self._og_search_title(webpage), ' - BBC News')
-            playlist_description = playlist_description or self._og_search_description(webpage, default=None)
              return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
  
          # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
@@ -829,10 +843,6 @@ class BBCIE(BBCCoUkIE):
                  'subtitles': subtitles,
              }
  
-        playlist_title = self._html_search_regex(
-            r'<title>(.*?)(?:\s*-\s*BBC [^ ]+)?</title>', webpage, 'playlist title')
-        playlist_description = self._og_search_description(webpage, default=None)
-
          def extract_all(pattern):
              return list(filter(None, map(
                  lambda s: self._parse_json(s, playlist_id, fatal=False),
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index bfa9c82f6c42c9e126a7a30fde4391fcf49995c3..ecd7da767f72da9c07903d7013cd95ebf616078a 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -48,6 +48,7 @@ from ..utils import (
      determine_protocol,
      parse_duration,
      mimetype2ext,
+    update_url_query,
  )
  
  
@@ -345,7 +346,7 @@ class InfoExtractor(object):
      def IE_NAME(self):
          return compat_str(type(self).__name__[:-2])
  
-    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
+    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None):
          """ Returns the response handle """
          if note is None:
              self.report_download_webpage(video_id)
@@ -354,6 +355,12 @@ class InfoExtractor(object):
                  self.to_screen('%s' % (note,))
              else:
                  self.to_screen('%s: %s' % (video_id, note))
+        # data, headers and query params will be ignored for `Request` objects
+        if isinstance(url_or_request, compat_str):
+            if query:
+                url_or_request = update_url_query(url_or_request, query)
+            if data or headers:
+                url_or_request = sanitized_Request(url_or_request, data, headers or {})
          try:
              return self._downloader.urlopen(url_or_request)
          except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
@@ -369,13 +376,13 @@ class InfoExtractor(object):
                  self._downloader.report_warning(errmsg)
                  return False
  
-    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
+    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers=None, query=None):
          """ Returns a tuple (page content as string, URL handle) """
          # Strip hashes from the URL (#1038)
          if isinstance(url_or_request, (compat_str, str)):
              url_or_request = url_or_request.partition('#')[0]
  
-        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
+        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
          if urlh is False:
              assert not fatal
              return False
@@ -462,13 +469,13 @@ class InfoExtractor(object):
  
          return content
  
-    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
+    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers=None, query=None):
          """ Returns the data of the page as a string """
          success = False
          try_count = 0
          while success is False:
              try:
-                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
+                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
                  success = True
              except compat_http_client.IncompleteRead as e:
                  try_count += 1
@@ -483,10 +490,10 @@ class InfoExtractor(object):
  
      def _download_xml(self, url_or_request, video_id,
                        note='Downloading XML', errnote='Unable to download XML',
-                      transform_source=None, fatal=True, encoding=None):
+                      transform_source=None, fatal=True, encoding=None, data=None, headers=None, query=None):
          """Return the xml as an xml.etree.ElementTree.Element"""
          xml_string = self._download_webpage(
-            url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
+            url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
          if xml_string is False:
              return xml_string
          if transform_source:
@@ -497,10 +504,10 @@ class InfoExtractor(object):
                         note='Downloading JSON metadata',
                         errnote='Unable to download JSON metadata',
                         transform_source=None,
-                       fatal=True, encoding=None):
+                       fatal=True, encoding=None, data=None, headers=None, query=None):
          json_string = self._download_webpage(
              url_or_request, video_id, note, errnote, fatal=fatal,
-            encoding=encoding)
+            encoding=encoding, data=data, headers=headers, query=query)
          if (not fatal) and json_string is False:
              return None
          return self._parse_json(
@@ -966,6 +973,13 @@ class InfoExtractor(object):
          if manifest is False:
              return []
  
+        return self._parse_f4m_formats(
+            manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
+            transform_source=transform_source, fatal=fatal)
+
+    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
+                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
+                           fatal=True):
          formats = []
          manifest_version = '1.0'
          media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
@@ -991,7 +1005,8 @@ class InfoExtractor(object):
                  # bitrate in f4m downloader
                  if determine_ext(manifest_url) == 'f4m':
                      formats.extend(self._extract_f4m_formats(
-                        manifest_url, video_id, preference, f4m_id, fatal=fatal))
+                        manifest_url, video_id, preference=preference, f4m_id=f4m_id,
+                        transform_source=transform_source, fatal=fatal))
                      continue
              tbr = int_or_none(media_el.attrib.get('bitrate'))
              formats.append({
@@ -1140,8 +1155,8 @@ class InfoExtractor(object):
                  out.append('{%s}%s' % (namespace, c))
          return '/'.join(out)
  
-    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
-        smil = self._download_smil(smil_url, video_id, fatal=fatal)
+    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
+        smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
  
          if smil is False:
              assert not fatal
@@ -1158,10 +1173,10 @@ class InfoExtractor(object):
              return {}
          return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
  
-    def _download_smil(self, smil_url, video_id, fatal=True):
+    def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
          return self._download_xml(
              smil_url, video_id, 'Downloading SMIL file',
-            'Unable to download SMIL file', fatal=fatal)
+            'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
  
      def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
          namespace = self._parse_smil_namespace(smil)
@@ -1447,8 +1462,9 @@ class InfoExtractor(object):
                          continue
                      representation_attrib = adaptation_set.attrib.copy()
                      representation_attrib.update(representation.attrib)
-                    mime_type = representation_attrib.get('mimeType')
-                    content_type = mime_type.split('/')[0] if mime_type else representation_attrib.get('contentType')
+                    # According to page 41 of ISO/IEC 29001-1:2014, @mimeType is mandatory
+                    mime_type = representation_attrib['mimeType']
+                    content_type = mime_type.split('/')[0]
                      if content_type == 'text':
                          # TODO implement WebVTT downloading
                          pass
@@ -1471,6 +1487,7 @@ class InfoExtractor(object):
                          f = {
                              'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
                              'url': base_url,
+                            'ext': mimetype2ext(mime_type),
                              'width': int_or_none(representation_attrib.get('width')),
                              'height': int_or_none(representation_attrib.get('height')),
                              'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
diff --git a/youtube_dl/extractor/dw.py b/youtube_dl/extractor/dw.py

new file mode 100644 (file)

index 0000000..b6c9855
--- /dev/null
+++ b/youtube_dl/extractor/dw.py
@@ -0,0 +1,85 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+from ..compat import compat_urlparse
+
+
+class DWIE(InfoExtractor):
+    IE_NAME = 'dw'
+    _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+av-(?P<id>\d+)'
+    _TESTS = [{
+        # video
+        'url': 'http://www.dw.com/en/intelligent-light/av-19112290',
+        'md5': '7372046e1815c5a534b43f3c3c36e6e9',
+        'info_dict': {
+            'id': '19112290',
+            'ext': 'mp4',
+            'title': 'Intelligent light',
+            'description': 'md5:90e00d5881719f2a6a5827cb74985af1',
+            'upload_date': '20160311',
+        }
+    }, {
+        # audio
+        'url': 'http://www.dw.com/en/worldlink-my-business/av-19111941',
+        'md5': '2814c9a1321c3a51f8a7aeb067a360dd',
+        'info_dict': {
+            'id': '19111941',
+            'ext': 'mp3',
+            'title': 'WorldLink: My business',
+            'description': 'md5:bc9ca6e4e063361e21c920c53af12405',
+            'upload_date': '20160311',
+        }
+    }]
+
+    def _real_extract(self, url):
+        media_id = self._match_id(url)
+        webpage = self._download_webpage(url, media_id)
+        hidden_inputs = self._hidden_inputs(webpage)
+        title = hidden_inputs['media_title']
+
+        formats = []
+        if hidden_inputs.get('player_type') == 'video' and hidden_inputs.get('stream_file') == '1':
+            formats = self._extract_smil_formats(
+                'http://www.dw.com/smil/v-%s' % media_id, media_id,
+                transform_source=lambda s: s.replace(
+                    'rtmp://tv-od.dw.de/flash/',
+                    'http://tv-download.dw.de/dwtv_video/flv/'))
+        else:
+            formats = [{'url': hidden_inputs['file_name']}]
+
+        return {
+            'id': media_id,
+            'title': title,
+            'description': self._og_search_description(webpage),
+            'thumbnail': hidden_inputs.get('preview_image'),
+            'duration': int_or_none(hidden_inputs.get('file_duration')),
+            'upload_date': hidden_inputs.get('display_date'),
+            'formats': formats,
+        }
+
+
+class DWArticleIE(InfoExtractor):
+    IE_NAME = 'dw:article'
+    _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+a-(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://www.dw.com/en/no-hope-limited-options-for-refugees-in-idomeni/a-19111009',
+        'md5': '8ca657f9d068bbef74d6fc38b97fc869',
+        'info_dict': {
+            'id': '19105868',
+            'ext': 'mp4',
+            'title': 'The harsh life of refugees in Idomeni',
+            'description': 'md5:196015cc7e48ebf474db9399420043c7',
+            'upload_date': '20160310',
+        }
+    }
+
+    def _real_extract(self, url):
+        article_id = self._match_id(url)
+        webpage = self._download_webpage(url, article_id)
+        hidden_inputs = self._hidden_inputs(webpage)
+        media_id = hidden_inputs['media_id']
+        media_path = self._search_regex(r'href="([^"]+av-%s)"\s+class="overlayLink"' % media_id, webpage, 'media url')
+        media_url = compat_urlparse.urljoin(url, media_path)
+        return self.url_result(media_url, 'DW', media_id)
diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py

index 5e8589479d4ce5ae57240c4432494b8348b5523a..f5bbd39d2d0e90996c118e3fae325034fc2bbb6d 100644 (file)
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@@ -38,7 +38,8 @@ class FacebookIE(InfoExtractor):
                                  story\.php
                              )\?(?:.*?)(?:v|video_id|story_fbid)=|
                              [^/]+/videos/(?:[^/]+/)?|
-                            [^/]+/posts/
+                            [^/]+/posts/|
+                            groups/[^/]+/permalink/
                          )|
                      facebook:
                  )
@@ -123,6 +124,9 @@ class FacebookIE(InfoExtractor):
      }, {
          'url': 'facebook:544765982287235',
          'only_matching': True,
+    }, {
+        'url': 'https://www.facebook.com/groups/164828000315060/permalink/764967300301124/',
+        'only_matching': True,
      }]
  
      def _login(self):
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py

index ca745ae41ecf46dd4b0a6b5dbd80a3e836c5f794..8121f04a5e02cf672dc6ab0f152d21df8b98034e 100644 (file)
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -1242,28 +1242,34 @@ class GenericIE(InfoExtractor):
              full_response = self._request_webpage(request, video_id)
              head_response = full_response
  
+        info_dict = {
+            'id': video_id,
+            'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
+        }
+
          # Check for direct link to a video
          content_type = head_response.headers.get('Content-Type', '')
          m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>.+)$', content_type)
          if m:
              upload_date = unified_strdate(
                  head_response.headers.get('Last-Modified'))
-            formats = []
-            if m.group('format_id').endswith('mpegurl'):
+            format_id = m.group('format_id')
+            if format_id.endswith('mpegurl'):
                  formats = self._extract_m3u8_formats(url, video_id, 'mp4')
+            elif format_id == 'f4m':
+                formats = self._extract_f4m_formats(url, video_id)
              else:
                  formats = [{
                      'format_id': m.group('format_id'),
                      'url': url,
                      'vcodec': 'none' if m.group('type') == 'audio' else None
                  }]
-            return {
-                'id': video_id,
-                'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
+            info_dict.update({
                  'direct': True,
                  'formats': formats,
                  'upload_date': upload_date,
-            }
+            })
+            return info_dict
  
          if not self._downloader.params.get('test', False) and not is_intentional:
              force = self._downloader.params.get('force_generic_extractor', False)
@@ -1291,13 +1297,12 @@ class GenericIE(InfoExtractor):
                  'URL could be a direct video link, returning it as such.')
              upload_date = unified_strdate(
                  head_response.headers.get('Last-Modified'))
-            return {
-                'id': video_id,
-                'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
+            info_dict.update({
                  'direct': True,
                  'url': url,
                  'upload_date': upload_date,
-            }
+            })
+            return info_dict
  
          webpage = self._webpage_read_content(
              full_response, url, video_id, prefix=first_bytes)
@@ -1314,12 +1319,12 @@ class GenericIE(InfoExtractor):
              elif doc.tag == '{http://xspf.org/ns/0/}playlist':
                  return self.playlist_result(self._parse_xspf(doc, video_id), video_id)
              elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
-                return {
-                    'id': video_id,
-                    'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
-                    'formats': self._parse_mpd_formats(
-                        doc, video_id, mpd_base_url=url.rpartition('/')[0]),
-                }
+                info_dict['formats'] = self._parse_mpd_formats(
+                    doc, video_id, mpd_base_url=url.rpartition('/')[0])
+                return info_dict
+            elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag):
+                info_dict['formats'] = self._parse_f4m_formats(doc, url, video_id)
+                return info_dict
          except compat_xml_parse_error:
              pass
  
@@ -1985,6 +1990,8 @@ class GenericIE(InfoExtractor):
                  entry_info_dict['formats'] = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
              elif ext == 'mpd':
                  entry_info_dict['formats'] = self._extract_mpd_formats(video_url, video_id)
+            elif ext == 'f4m':
+                entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id)
              else:
                  entry_info_dict['url'] = video_url
  
diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py

index 37be34091895392b014a8fe2167d9c708ca60e71..766fc26d0f01145bdd2456a221940fa60ece6953 100644 (file)
--- a/youtube_dl/extractor/googledrive.py
+++ b/youtube_dl/extractor/googledrive.py
@@ -10,8 +10,8 @@ from ..utils import (
  
  
  class GoogleDriveIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28})'
-    _TEST = {
+    _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28,})'
+    _TESTS = [{
          'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
          'md5': '881f7700aec4f538571fa1e0eed4a7b6',
          'info_dict': {
@@ -20,7 +20,11 @@ class GoogleDriveIE(InfoExtractor):
              'title': 'Big Buck Bunny.mp4',
              'duration': 46,
          }
-    }
+    }, {
+        # video id is longer than 28 characters
+        'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit',
+        'only_matching': True,
+    }]
      _FORMATS_EXT = {
          '5': 'flv',
          '6': 'flv',
@@ -43,7 +47,7 @@ class GoogleDriveIE(InfoExtractor):
      @staticmethod
      def _extract_url(webpage):
          mobj = re.search(
-            r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})',
+            r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})',
              webpage)
          if mobj:
              return 'https://drive.google.com/file/d/%s' % mobj.group('id')
diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py

index ccbc39c665412980e6b6104e83ffaf2e8574517f..44d7c84a13f9bef9aa1d68dc5d38fe81b0af4a5f 100644 (file)
--- a/youtube_dl/extractor/kaltura.py
+++ b/youtube_dl/extractor/kaltura.py
@@ -8,6 +8,7 @@ from .common import InfoExtractor
  from ..compat import (
      compat_urllib_parse,
      compat_urlparse,
+    compat_parse_qs,
  )
  from ..utils import (
      clean_html,
@@ -20,21 +21,17 @@ from ..utils import (
  class KalturaIE(InfoExtractor):
      _VALID_URL = r'''(?x)
                  (?:
-                    kaltura:(?P<partner_id_s>\d+):(?P<id_s>[0-9a-z_]+)|
+                    kaltura:(?P<partner_id>\d+):(?P<id>[0-9a-z_]+)|
                      https?://
                          (:?(?:www|cdnapi(?:sec)?)\.)?kaltura\.com/
                          (?:
                              (?:
                                  # flash player
-                                index\.php/kwidget/
-                                (?:[^/]+/)*?wid/_(?P<partner_id>\d+)/
-                                (?:[^/]+/)*?entry_id/(?P<id>[0-9a-z_]+)|
+                                index\.php/kwidget|
                                  # html5 player
-                                html5/html5lib/
-                                (?:[^/]+/)*?entry_id/(?P<id_html5>[0-9a-z_]+)
-                                .*\?.*\bwid=_(?P<partner_id_html5>\d+)
+                                html5/html5lib/[^/]+/mwEmbedFrame\.php
                              )
-                        )
+                        )(?:/(?P<path>[^?]+))?(?:\?(?P<query>.*))?
                  )
                  '''
      _API_BASE = 'http://cdnapi.kaltura.com/api_v3/index.php?'
@@ -127,10 +124,41 @@ class KalturaIE(InfoExtractor):
          url, smuggled_data = unsmuggle_url(url, {})
  
          mobj = re.match(self._VALID_URL, url)
-        partner_id = mobj.group('partner_id_s') or mobj.group('partner_id') or mobj.group('partner_id_html5')
-        entry_id = mobj.group('id_s') or mobj.group('id') or mobj.group('id_html5')
-
-        info, flavor_assets = self._get_video_info(entry_id, partner_id)
+        partner_id, entry_id = mobj.group('partner_id', 'id')
+        ks = None
+        if partner_id and entry_id:
+            info, flavor_assets = self._get_video_info(entry_id, partner_id)
+        else:
+            path, query = mobj.group('path', 'query')
+            if not path and not query:
+                raise ExtractorError('Invalid URL', expected=True)
+            params = {}
+            if query:
+                params = compat_parse_qs(query)
+            if path:
+                splitted_path = path.split('/')
+                params.update(dict((zip(splitted_path[::2], [[v] for v in splitted_path[1::2]]))))
+            if 'wid' in params:
+                partner_id = params['wid'][0][1:]
+            elif 'p' in params:
+                partner_id = params['p'][0]
+            else:
+                raise ExtractorError('Invalid URL', expected=True)
+            if 'entry_id' in params:
+                entry_id = params['entry_id'][0]
+                info, flavor_assets = self._get_video_info(entry_id, partner_id)
+            elif 'uiconf_id' in params and 'flashvars[referenceId]' in params:
+                reference_id = params['flashvars[referenceId]'][0]
+                webpage = self._download_webpage(url, reference_id)
+                entry_data = self._parse_json(self._search_regex(
+                    r'window\.kalturaIframePackageData\s*=\s*({.*});',
+                    webpage, 'kalturaIframePackageData'),
+                    reference_id)['entryResult']
+                info, flavor_assets = entry_data['meta'], entry_data['contextData']['flavorAssets']
+                entry_id = info['id']
+            else:
+                raise ExtractorError('Invalid URL', expected=True)
+            ks = params.get('flashvars[ks]', [None])[0]
  
          source_url = smuggled_data.get('source_url')
          if source_url:
@@ -140,14 +168,19 @@ class KalturaIE(InfoExtractor):
          else:
              referrer = None
  
+        def sign_url(unsigned_url):
+            if ks:
+                unsigned_url += '/ks/%s' % ks
+            if referrer:
+                unsigned_url += '?referrer=%s' % referrer
+            return unsigned_url
+
          formats = []
          for f in flavor_assets:
              # Continue if asset is not ready
              if f['status'] != 2:
                  continue
-            video_url = '%s/flavorId/%s' % (info['dataUrl'], f['id'])
-            if referrer:
-                video_url += '?referrer=%s' % referrer
+            video_url = sign_url('%s/flavorId/%s' % (info['dataUrl'], f['id']))
              formats.append({
                  'format_id': '%(fileExt)s-%(bitrate)s' % f,
                  'ext': f.get('fileExt'),
@@ -160,9 +193,7 @@ class KalturaIE(InfoExtractor):
                  'width': int_or_none(f.get('width')),
                  'url': video_url,
              })
-        m3u8_url = info['dataUrl'].replace('format/url', 'format/applehttp')
-        if referrer:
-            m3u8_url += '?referrer=%s' % referrer
+        m3u8_url = sign_url(info['dataUrl'].replace('format/url', 'format/applehttp'))
          formats.extend(self._extract_m3u8_formats(
              m3u8_url, entry_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
  
diff --git a/youtube_dl/extractor/makerschannel.py b/youtube_dl/extractor/makerschannel.py

new file mode 100644 (file)

index 0000000..f5d00e6
--- /dev/null
+++ b/youtube_dl/extractor/makerschannel.py
@@ -0,0 +1,40 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class MakersChannelIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?makerschannel\.com/.*(?P<id_type>video|production)_id=(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://makerschannel.com/en/zoomin/community-highlights?video_id=849',
+        'md5': '624a512c6969236b5967bf9286345ad1',
+        'info_dict': {
+            'id': '849',
+            'ext': 'mp4',
+            'title': 'Landing a bus on a plane is an epic win',
+            'uploader': 'ZoomIn',
+            'description': 'md5:cd9cca2ea7b69b78be81d07020c97139',
+        }
+    }
+
+    def _real_extract(self, url):
+        id_type, url_id = re.match(self._VALID_URL, url).groups()
+        webpage = self._download_webpage(url, url_id)
+        video_data = self._html_search_regex(r'<div([^>]+data-%s-id="%s"[^>]+)>' % (id_type, url_id), webpage, 'video data')
+
+        def extract_data_val(attr, fatal=False):
+            return self._html_search_regex(r'data-%s\s*=\s*"([^"]+)"' % attr, video_data, attr, fatal=fatal)
+        minoto_id = self._search_regex(r'/id/([a-zA-Z0-9]+)', extract_data_val('video-src', True), 'minoto id')
+
+        return {
+            '_type': 'url_transparent',
+            'url': 'minoto:%s' % minoto_id,
+            'id': extract_data_val('video-id', True),
+            'title': extract_data_val('title', True),
+            'description': extract_data_val('description'),
+            'thumbnail': extract_data_val('image'),
+            'uploader': extract_data_val('channel'),
+        }
diff --git a/youtube_dl/extractor/minoto.py b/youtube_dl/extractor/minoto.py

new file mode 100644 (file)

index 0000000..959a105
--- /dev/null
+++ b/youtube_dl/extractor/minoto.py
@@ -0,0 +1,56 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class MinotoIE(InfoExtractor):
+    _VALID_URL = r'(?:minoto:|https?://(?:play|iframe|embed)\.minoto-video\.com/(?P<player_id>[0-9]+)/)(?P<id>[a-zA-Z0-9]+)'
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        player_id = mobj.group('player_id') or '1'
+        video_id = mobj.group('id')
+        video_data = self._download_json('http://play.minoto-video.com/%s/%s.js' % (player_id, video_id), video_id)
+        video_metadata = video_data['video-metadata']
+        formats = []
+        for fmt in video_data['video-files']:
+            fmt_url = fmt.get('url')
+            if not fmt_url:
+                continue
+            container = fmt.get('container')
+            if container == 'hls':
+                formats.extend(fmt_url, video_id, 'mp4', m3u8_id='hls', fatal=False)
+            else:
+                fmt_profile = fmt.get('profile') or {}
+                f = {
+                    'format_id': fmt_profile.get('name-short'),
+                    'format_note': fmt_profile.get('name'),
+                    'url': fmt_url,
+                    'container': container,
+                    'tbr': int_or_none(fmt.get('bitrate')),
+                    'filesize': int_or_none(fmt.get('filesize')),
+                    'width': int_or_none(fmt.get('width')),
+                    'height': int_or_none(fmt.get('height')),
+                }
+                codecs = fmt.get('codecs')
+                if codecs:
+                    codecs = codecs.split(',')
+                    if len(codecs) == 2:
+                        f.update({
+                            'vcodec': codecs[0],
+                            'acodec': codecs[1],
+                        })
+                formats.append(f)
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': video_metadata['title'],
+            'description': video_metadata.get('description'),
+            'thumbnail': video_metadata.get('video-poster', {}).get('url'),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py

index c2b7ed9abbd27a2b2c8e0d9d95c59e387630180a..101497118275b7f1b5bf0564048f1dc9fc4b878b 100644 (file)
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@@ -7,6 +7,7 @@ from ..compat import compat_urllib_parse_unquote
  from ..utils import (
      ExtractorError,
      HEADRequest,
+    parse_count,
      str_to_int,
  )
  
@@ -85,8 +86,8 @@ class MixcloudIE(InfoExtractor):
          uploader_id = self._search_regex(
              r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False)
          description = self._og_search_description(webpage)
-        like_count = str_to_int(self._search_regex(
-            r'\bbutton-favorite\b[^>]+m-ajax-toggle-count="([^"]+)"',
+        like_count = parse_count(self._search_regex(
+            r'\bbutton-favorite[^>]+>.*?<span[^>]+class=["\']toggle-number[^>]+>\s*([^<]+)',
              webpage, 'like count', fatal=False))
          view_count = str_to_int(self._search_regex(
              [r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"',
diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py

index a65fc8ed706efa02e4d008494ea8269c10ad8b3a..256396bb8c21174fdaca4524c0b21d1ef7f802a9 100644 (file)
--- a/youtube_dl/extractor/safari.py
+++ b/youtube_dl/extractor/safari.py
@@ -4,14 +4,13 @@ from __future__ import unicode_literals
  import re
  
  from .common import InfoExtractor
-from .brightcove import BrightcoveLegacyIE
  
  from ..utils import (
      ExtractorError,
      sanitized_Request,
-    smuggle_url,
      std_headers,
      urlencode_postdata,
+    update_url_query,
  )
  
  
@@ -20,21 +19,22 @@ class SafariBaseIE(InfoExtractor):
      _SUCCESSFUL_LOGIN_REGEX = r'<a href="/accounts/logout/"[^>]*>Sign Out</a>'
      _NETRC_MACHINE = 'safari'
  
-    _API_BASE = 'https://www.safaribooksonline.com/api/v1/book'
+    _API_BASE = 'https://www.safaribooksonline.com/api/v1'
      _API_FORMAT = 'json'
  
      LOGGED_IN = False
  
      def _real_initialize(self):
-        # We only need to log in once for courses or individual videos
-        if not self.LOGGED_IN:
-            self._login()
-            SafariBaseIE.LOGGED_IN = True
+        self._login()
  
      def _login(self):
+        # We only need to log in once for courses or individual videos
+        if self.LOGGED_IN:
+            return
+
          (username, password) = self._get_login_info()
          if username is None:
-            self.raise_login_required('safaribooksonline.com account is required')
+            return
  
          headers = std_headers.copy()
          if 'Referer' not in headers:
@@ -67,6 +67,8 @@ class SafariBaseIE(InfoExtractor):
                  'Login failed; make sure your credentials are correct and try again.',
                  expected=True)
  
+        SafariBaseIE.LOGGED_IN = True
+
          self.to_screen('Login successful')
  
  
@@ -86,13 +88,15 @@ class SafariIE(SafariBaseIE):
  
      _TESTS = [{
          'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html',
-        'md5': '5b0c4cc1b3c1ba15dda7344085aa5592',
+        'md5': 'dcc5a425e79f2564148652616af1f2a3',
          'info_dict': {
-            'id': '2842601850001',
+            'id': '0_qbqx90ic',
              'ext': 'mp4',
-            'title': 'Introduction',
+            'title': 'Introduction to Hadoop Fundamentals LiveLessons',
+            'timestamp': 1437758058,
+            'upload_date': '20150724',
+            'uploader_id': 'stork',
          },
-        'skip': 'Requires safaribooksonline account credentials',
      }, {
          'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html',
          'only_matching': True,
@@ -107,15 +111,30 @@ class SafariIE(SafariBaseIE):
          course_id = mobj.group('course_id')
          part = mobj.group('part')
  
-        webpage = self._download_webpage(
-            '%s/%s/chapter-content/%s.html' % (self._API_BASE, course_id, part),
-            part)
+        webpage = self._download_webpage(url, '%s/%s' % (course_id, part))
+        reference_id = self._search_regex(r'data-reference-id="([^"]+)"', webpage, 'kaltura reference id')
+        partner_id = self._search_regex(r'data-partner-id="([^"]+)"', webpage, 'kaltura widget id')
+        ui_id = self._search_regex(r'data-ui-id="([^"]+)"', webpage, 'kaltura uiconf id')
+
+        query = {
+            'wid': '_%s' % partner_id,
+            'uiconf_id': ui_id,
+            'flashvars[referenceId]': reference_id,
+        }
  
-        bc_url = BrightcoveLegacyIE._extract_brightcove_url(webpage)
-        if not bc_url:
-            raise ExtractorError('Could not extract Brightcove URL from %s' % url, expected=True)
+        if self.LOGGED_IN:
+            kaltura_session = self._download_json(
+                '%s/player/kaltura_session/?reference_id=%s' % (self._API_BASE, reference_id),
+                course_id, 'Downloading kaltura session JSON',
+                'Unable to download kaltura session JSON', fatal=False)
+            if kaltura_session:
+                session = kaltura_session.get('session')
+                if session:
+                    query['flashvars[ks]'] = session
  
-        return self.url_result(smuggle_url(bc_url, {'Referer': url}), 'BrightcoveLegacy')
+        return self.url_result(update_url_query(
+            'https://cdnapisec.kaltura.com/html5/html5lib/v2.37.1/mwEmbedFrame.php', query),
+            'Kaltura')
  
  
  class SafariCourseIE(SafariBaseIE):
@@ -141,7 +160,7 @@ class SafariCourseIE(SafariBaseIE):
          course_id = self._match_id(url)
  
          course_json = self._download_json(
-            '%s/%s/?override_format=%s' % (self._API_BASE, course_id, self._API_FORMAT),
+            '%s/book/%s/?override_format=%s' % (self._API_BASE, course_id, self._API_FORMAT),
              course_id, 'Downloading course JSON')
  
          if 'chapters' not in course_json:
diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py

index 3db6286e48c1402ff210d8f4cc666dac1a918a86..46c785ae183d72207ab12500618f3eb7b765373d 100644 (file)
--- a/youtube_dl/extractor/vice.py
+++ b/youtube_dl/extractor/vice.py
@@ -1,31 +1,37 @@
  from __future__ import unicode_literals
  
+import re
+
  from .common import InfoExtractor
  from .ooyala import OoyalaIE
  from ..utils import ExtractorError
  
  
  class ViceIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)+(?P<id>.+)'
-
-    _TESTS = [
-        {
-            'url': 'http://www.vice.com/Fringes/cowboy-capitalists-part-1',
-            'info_dict': {
-                'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp',
-                'ext': 'mp4',
-                'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov',
-                'duration': 725.983,
-            },
-            'params': {
-                # Requires ffmpeg (m3u8 manifest)
-                'skip_download': True,
-            },
-        }, {
-            'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab',
-            'only_matching': True,
-        }
-    ]
+    _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?videos?/(?P<id>[^/?#&]+)'
+
+    _TESTS = [{
+        'url': 'http://www.vice.com/video/cowboy-capitalists-part-1',
+        'info_dict': {
+            'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp',
+            'ext': 'mp4',
+            'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov',
+            'duration': 725.983,
+        },
+        'params': {
+            # Requires ffmpeg (m3u8 manifest)
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.vice.com/ru/video/big-night-out-ibiza-clive-martin-229',
+        'only_matching': True,
+    }, {
+        'url': 'https://munchies.vice.com/en/videos/watch-the-trailer-for-our-new-series-the-pizza-show',
+        'only_matching': True,
+    }]
  
      def _real_extract(self, url):
          video_id = self._match_id(url)
@@ -38,3 +44,35 @@ class ViceIE(InfoExtractor):
          except ExtractorError:
              raise ExtractorError('The page doesn\'t contain a video', expected=True)
          return self.url_result(ooyala_url, ie='Ooyala')
+
+
+class ViceShowIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?show/(?P<id>[^/?#&]+)'
+
+    _TEST = {
+        'url': 'https://munchies.vice.com/en/show/fuck-thats-delicious-2',
+        'info_dict': {
+            'id': 'fuck-thats-delicious-2',
+            'title': "Fuck, That's Delicious",
+            'description': 'Follow the culinary adventures of rapper Action Bronson during his ongoing world tour.',
+        },
+        'playlist_count': 17,
+    }
+
+    def _real_extract(self, url):
+        show_id = self._match_id(url)
+        webpage = self._download_webpage(url, show_id)
+
+        entries = [
+            self.url_result(video_url, ViceIE.ie_key())
+            for video_url, _ in re.findall(
+                r'<h2[^>]+class="article-title"[^>]+data-id="\d+"[^>]*>\s*<a[^>]+href="(%s.*?)"'
+                % ViceIE._VALID_URL, webpage)]
+
+        title = self._search_regex(
+            r'<title>(.+?)</title>', webpage, 'title', default=None)
+        if title:
+            title = re.sub(r'(.+)\s*\|\s*.+$', r'\1', title).strip()
+        description = self._html_search_meta('description', webpage, 'description')
+
+        return self.playlist_result(entries, show_id, title, description)
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py

index 22a39a0ab7af191095242e6b2a3f580e59e17c37..9fd0ec8d5856cbee27534c0f4a02cc90b05f8389 100644 (file)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1316,6 +1316,17 @@ def format_bytes(bytes):
      return '%.2f%s' % (converted, suffix)
  
  
+def lookup_unit_table(unit_table, s):
+    units_re = '|'.join(re.escape(u) for u in unit_table)
+    m = re.match(
+        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
+    if not m:
+        return None
+    num_str = m.group('num').replace(',', '.')
+    mult = unit_table[m.group('unit')]
+    return int(float(num_str) * mult)
+
+
  def parse_filesize(s):
      if s is None:
          return None
@@ -1359,15 +1370,28 @@ def parse_filesize(s):
          'Yb': 1000 ** 8,
      }
  
-    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
-    m = re.match(
-        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
-    if not m:
+    return lookup_unit_table(_UNIT_TABLE, s)
+
+
+def parse_count(s):
+    if s is None:
          return None
  
-    num_str = m.group('num').replace(',', '.')
-    mult = _UNIT_TABLE[m.group('unit')]
-    return int(float(num_str) * mult)
+    s = s.strip()
+
+    if re.match(r'^[\d,.]+$', s):
+        return str_to_int(s)
+
+    _UNIT_TABLE = {
+        'k': 1000,
+        'K': 1000,
+        'm': 1000 ** 2,
+        'M': 1000 ** 2,
+        'kk': 1000 ** 2,
+        'KK': 1000 ** 2,
+    }
+
+    return lookup_unit_table(_UNIT_TABLE, s)
  
  
  def month_by_name(name):
author	remitamine <remitamine@gmail.com>
	Sun, 13 Mar 2016 11:44:50 +0000 (12:44 +0100)
committer	remitamine <remitamine@gmail.com>
	Sun, 13 Mar 2016 11:44:50 +0000 (12:44 +0100)
AUTHORS		patch \| blob \| history
test/test_utils.py		patch \| blob \| history
youtube_dl/extractor/__init__.py		patch \| blob \| history
youtube_dl/extractor/audioboom.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/bbc.py		patch \| blob \| history
youtube_dl/extractor/common.py		patch \| blob \| history
youtube_dl/extractor/dw.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/facebook.py		patch \| blob \| history
youtube_dl/extractor/generic.py		patch \| blob \| history
youtube_dl/extractor/googledrive.py		patch \| blob \| history
youtube_dl/extractor/kaltura.py		patch \| blob \| history
youtube_dl/extractor/makerschannel.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/minoto.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/mixcloud.py		patch \| blob \| history
youtube_dl/extractor/safari.py		patch \| blob \| history
youtube_dl/extractor/vice.py		patch \| blob \| history
youtube_dl/utils.py		patch \| blob \| history