Merge pull request #7599 from lalinsky/fix-youtube
author      Sergey M <dstftw@gmail.com>
            Mon, 23 Nov 2015 14:52:23 +0000 (20:52 +0600)
committer   Sergey M <dstftw@gmail.com>
            Mon, 23 Nov 2015 14:52:23 +0000 (20:52 +0600)
[youtube] More explicit player config JSON extraction (fixes #7468)

26 files changed:
README.md
docs/supportedsites.md
test/test_utils.py
youtube_dl/YoutubeDL.py
youtube_dl/downloader/common.py
youtube_dl/downloader/rtmp.py
youtube_dl/extractor/__init__.py
youtube_dl/extractor/bloomberg.py
youtube_dl/extractor/common.py
youtube_dl/extractor/dplay.py [new file with mode: 0644]
youtube_dl/extractor/generic.py
youtube_dl/extractor/instagram.py
youtube_dl/extractor/kaltura.py
youtube_dl/extractor/pbs.py
youtube_dl/extractor/pluralsight.py
youtube_dl/extractor/pornhd.py
youtube_dl/extractor/rtve.py
youtube_dl/extractor/rutube.py
youtube_dl/extractor/soundcloud.py
youtube_dl/extractor/theplatform.py
youtube_dl/extractor/udn.py
youtube_dl/extractor/vimeo.py
youtube_dl/extractor/youtube.py
youtube_dl/options.py
youtube_dl/utils.py
youtube_dl/version.py

index 38db97c5980e8a3ff186e6e6d8d80fd801eb4e51..b286651cd712933fd4538b88499d9325b25d88e7 100644 (file)
--- a/README.md
+++ b/README.md
@@ -329,8 +329,8 @@ which means you can modify it, redistribute it or use it however you like.
 
 ## Subtitle Options:
     --write-sub                      Write subtitle file
-    --write-auto-sub                 Write automatic subtitle file (YouTube
-                                     only)
+    --write-auto-sub                 Write automatically generated subtitle file
+                                     (YouTube only)
     --all-subs                       Download all the available subtitles of the
                                      video
     --list-subs                      List all available subtitles for the video
index 2e52837474fae2cbb664f0b1466d1d6e2a6115e2..1df4086101f18f42daa5e436512136932bf680fe 100644 (file)
  - **Discovery**
  - **Dotsub**
  - **DouyuTV**: 斗鱼
+ - **DPlay**
  - **dramafever**
  - **dramafever:series**
  - **DRBonanza**
  - **soompi:show**
  - **soundcloud**
  - **soundcloud:playlist**
+ - **soundcloud:search**: Soundcloud search
  - **soundcloud:set**
  - **soundcloud:user**
  - **soundgasm**
  - **youtube:show**: YouTube.com (multi-season) shows
  - **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)
  - **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword)
+ - **youtube:user:playlists**: YouTube.com user playlists
  - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication)
  - **Zapiks**
  - **ZDF**
index ea1ff0547cbb1e9dd081fb6b90243c0269d7d9ef..501355c74ad9a745bf8788d0f2e2c603f11b39d8 100644 (file)
@@ -21,6 +21,7 @@ from youtube_dl.utils import (
     clean_html,
     DateRange,
     detect_exe_version,
+    determine_ext,
     encodeFilename,
     escape_rfc3986,
     escape_url,
@@ -238,6 +239,13 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(unified_strdate('25-09-2014'), '20140925')
         self.assertEqual(unified_strdate('UNKNOWN DATE FORMAT'), None)
 
+    def test_determine_ext(self):
+        self.assertEqual(determine_ext('http://example.com/foo/bar.mp4/?download'), 'mp4')
+        self.assertEqual(determine_ext('http://example.com/foo/bar/?download', None), None)
+        self.assertEqual(determine_ext('http://example.com/foo/bar.nonext/?download', None), None)
+        self.assertEqual(determine_ext('http://example.com/foo/bar/mp4?download', None), None)
+        self.assertEqual(determine_ext('http://example.com/foo/bar.m3u8//?download'), 'm3u8')
+
     def test_find_xpath_attr(self):
         testxml = '''<root>
             <node/>
index 1783ce01bf683ab0cfa349a1b040362f88cc1a47..fba99af8dce2143e7cc8feba3a61460f2b029839 100755 (executable)
@@ -156,7 +156,7 @@ class YoutubeDL(object):
     writethumbnail:    Write the thumbnail image to a file
     write_all_thumbnails:  Write all thumbnail formats to files
     writesubtitles:    Write the video subtitles to a file
-    writeautomaticsub: Write the automatic subtitles to a file
+    writeautomaticsub: Write the automatically generated subtitles to a file
     allsubtitles:      Downloads all the subtitles of the video
                        (requires writesubtitles or writeautomaticsub)
     listsubtitles:     Lists all available subtitles for the video
@@ -833,6 +833,7 @@ class YoutubeDL(object):
                                                       extra_info=extra)
                 playlist_results.append(entry_result)
             ie_result['entries'] = playlist_results
+            self.to_screen('[download] Finished downloading playlist: %s' % playlist)
             return ie_result
         elif result_type == 'compat_list':
             self.report_warning(
@@ -937,7 +938,7 @@ class YoutubeDL(object):
                     filter_parts.append(string)
 
         def _remove_unused_ops(tokens):
-            # Remove operators that we don't use and join them with the sourrounding strings
+            # Remove operators that we don't use and join them with the surrounding strings
             # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
             ALLOWED_OPS = ('/', '+', ',', '(', ')')
             last_string, last_start, last_end, last_line = None, None, None, None
index 29a4500d3a02920ec586d38338c659ff4c43124c..b8bf8daf8c3265f9baa53d3ac30ee78d1c149587 100644 (file)
@@ -42,7 +42,7 @@ class FileDownloader(object):
     min_filesize:       Skip files smaller than this size
     max_filesize:       Skip files larger than this size
     xattr_set_filesize: Set ytdl.filesize user xattribute with expected size.
-                        (experimenatal)
+                        (experimental)
     external_downloader_args:  A list of additional command-line arguments for the
                         external downloader.
 
index f1d219ba97494d7a036354331aadfcb049817417..14d56db47decb9b828c9d78ff8bb91eddb5c7d8a 100644 (file)
@@ -117,7 +117,7 @@ class RtmpFD(FileDownloader):
             return False
 
         # Download using rtmpdump. rtmpdump returns exit code 2 when
-        # the connection was interrumpted and resuming appears to be
+        # the connection was interrupted and resuming appears to be
         # possible. This is part of rtmpdump's normal usage, AFAIK.
         basic_args = [
             'rtmpdump', '--verbose', '-r', url,
index 59c82f65d316ac5dea7cfea23e39fc4f366ebd9d..947b836832b5551a6abfc7df34631a4180af394f 100644 (file)
@@ -132,6 +132,7 @@ from .dfb import DFBIE
 from .dhm import DHMIE
 from .dotsub import DotsubIE
 from .douyutv import DouyuTVIE
+from .dplay import DPlayIE
 from .dramafever import (
     DramaFeverIE,
     DramaFeverSeriesIE,
@@ -575,7 +576,8 @@ from .soundcloud import (
     SoundcloudIE,
     SoundcloudSetIE,
     SoundcloudUserIE,
-    SoundcloudPlaylistIE
+    SoundcloudPlaylistIE,
+    SoundcloudSearchIE
 )
 from .soundgasm import (
     SoundgasmIE,
@@ -832,6 +834,7 @@ from .youtube import (
     YoutubeTruncatedIDIE,
     YoutubeTruncatedURLIE,
     YoutubeUserIE,
+    YoutubeUserPlaylistsIE,
     YoutubeWatchLaterIE,
 )
 from .zapiks import ZapiksIE
index 0dca29b712c79a27fb621f094a6f64ab503ba3df..11ace91dd310b62b8071e2f5b4603c54c2f91d52 100644 (file)
@@ -6,9 +6,9 @@ from .common import InfoExtractor
 
 
 class BloombergIE(InfoExtractor):
-    _VALID_URL = r'https?://www\.bloomberg\.com/news/videos/[^/]+/(?P<id>[^/?#]+)'
+    _VALID_URL = r'https?://www\.bloomberg\.com/news/[^/]+/[^/]+/(?P<id>[^/?#]+)'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.bloomberg.com/news/videos/b/aaeae121-5949-481e-a1ce-4562db6f5df2',
         # The md5 checksum changes
         'info_dict': {
@@ -17,7 +17,10 @@ class BloombergIE(InfoExtractor):
             'title': 'Shah\'s Presentation on Foreign-Exchange Strategies',
             'description': 'md5:a8ba0302912d03d246979735c17d2761',
         },
-    }
+    }, {
+        'url': 'http://www.bloomberg.com/news/articles/2015-11-12/five-strange-things-that-have-been-happening-in-financial-markets',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         name = self._match_id(url)
index 5e263f8b5a2cf46fbb26e928f5df85c87c42dfde..71bdcad5ae34e47690ec70ac7873e6d9960b894b 100644 (file)
@@ -891,6 +891,11 @@ class InfoExtractor(object):
         if not media_nodes:
             manifest_version = '2.0'
             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
+        base_url = xpath_text(
+            manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
+            'base URL', default=None)
+        if base_url:
+            base_url = base_url.strip()
         for i, media_el in enumerate(media_nodes):
             if manifest_version == '2.0':
                 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
@@ -898,7 +903,7 @@ class InfoExtractor(object):
                     continue
                 manifest_url = (
                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
-                    else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url))
+                    else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                 # If media_url is itself a f4m manifest do the recursive extraction
                 # since bitrates in parent manifest (this one) and media_url manifest
                 # may differ leading to inability to resolve the format by requested
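
The change above prefers an explicit <baseURL> element from the f4m manifest over the manifest URL's directory when resolving relative media URLs. A minimal standalone sketch of that resolution, assuming the manifest is passed in as an XML string (the helper name resolve_media_url and the direct use of ElementTree are illustrative; youtube-dl goes through xpath_text and its compat layer):

import xml.etree.ElementTree as etree

F4M_NAMESPACES = ('{http://ns.adobe.com/f4m/1.0}', '{http://ns.adobe.com/f4m/2.0}')

def resolve_media_url(manifest_xml, manifest_url, media_url):
    # An absolute media URL wins outright
    if media_url.startswith(('http://', 'https://')):
        return media_url
    manifest = etree.fromstring(manifest_xml)
    base_url = None
    for ns in F4M_NAMESPACES:
        el = manifest.find(ns + 'baseURL')
        if el is not None and el.text:
            base_url = el.text.strip()
            break
    # Prefer the manifest's own <baseURL>, falling back to the manifest's directory
    return (base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url
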
diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py
new file mode 100644 (file)
index 0000000..6cda56a
--- /dev/null
@@ -0,0 +1,51 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import time
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class DPlayIE(InfoExtractor):
+    _VALID_URL = r'http://www\.dplay\.se/[^/]+/(?P<id>[^/?#]+)'
+
+    _TEST = {
+        'url': 'http://www.dplay.se/nugammalt-77-handelser-som-format-sverige/season-1-svensken-lar-sig-njuta-av-livet/',
+        'info_dict': {
+            'id': '3172',
+            'ext': 'mp4',
+            'display_id': 'season-1-svensken-lar-sig-njuta-av-livet',
+            'title': 'Svensken lär sig njuta av livet',
+            'duration': 2650,
+        },
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        video_id = self._search_regex(
+            r'data-video-id="(\d+)"', webpage, 'video id')
+
+        info = self._download_json(
+            'http://www.dplay.se/api/v2/ajax/videos?video_id=' + video_id,
+            video_id)['data'][0]
+
+        self._set_cookie(
+            'secure.dplay.se', 'dsc-geo',
+            '{"countryCode":"NL","expiry":%d}' % ((time.time() + 20 * 60) * 1000))
+        # TODO: consider adding support for 'stream_type=hds', it seems to
+        # require setting some cookies
+        manifest_url = self._download_json(
+            'https://secure.dplay.se/secure/api/v2/user/authorization/stream/%s?stream_type=hls' % video_id,
+            video_id, 'Getting manifest url for hls stream')['hls']
+        formats = self._extract_m3u8_formats(
+            manifest_url, video_id, ext='mp4', entry_protocol='m3u8_native')
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': info['title'],
+            'formats': formats,
+            'duration': int_or_none(info.get('video_metadata_length'), scale=1000),
+        }
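
For reference, the dsc-geo cookie the new extractor fabricates is a JSON object whose expiry is a Unix timestamp in milliseconds, 20 minutes ahead. A sketch of the same value, using json.dumps where the extractor formats the string by hand (the helper name make_dsc_geo is illustrative):

import json
import time

def make_dsc_geo(country_code='NL', ttl_seconds=20 * 60):
    # Matches the {"countryCode":...,"expiry":...} shape built above
    return json.dumps({
        'countryCode': country_code,
        'expiry': int((time.time() + ttl_seconds) * 1000),
    }, separators=(',', ':'))
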
index 51516a38a14646f29bdc160b60e0298ad1234739..2b934148dda74880fd3d0e6466dcd0342f9459d2 100644 (file)
@@ -823,6 +823,19 @@ class GenericIE(InfoExtractor):
                 'title': 'Os Guinness // Is It Fools Talk? // Unbelievable? Conference 2014',
             },
         },
+        # Kaltura embed protected with referrer
+        {
+            'url': 'http://www.disney.nl/disney-channel/filmpjes/achter-de-schermen#/videoId/violetta-achter-de-schermen-ruggero',
+            'info_dict': {
+                'id': '1_g4fbemnq',
+                'ext': 'mp4',
+                'title': 'Violetta - Achter De Schermen - Ruggero',
+                'description': 'Achter de schermen met Ruggero',
+                'timestamp': 1435133761,
+                'upload_date': '20150624',
+                'uploader_id': 'echojecka',
+            },
+        },
         # Eagle.Platform embed (generic URL)
         {
             'url': 'http://lenta.ru/news/2015/03/06/navalny/',
@@ -1045,6 +1058,20 @@ class GenericIE(InfoExtractor):
                 'description': 'Tabletop: Dread, Last Thoughts',
                 'duration': 51690,
             },
+        },
+        # JWPlayer with M3U8
+        {
+            'url': 'http://ren.tv/novosti/2015-09-25/sluchaynyy-prohozhiy-poymal-avtougonshchika-v-murmanske-video',
+            'info_dict': {
+                'id': 'playlist',
+                'ext': 'mp4',
+                'title': 'Случайный прохожий поймал автоугонщика в Мурманске. ВИДЕО | РЕН ТВ',
+                'uploader': 'ren.tv',
+            },
+            'params': {
+                # m3u8 downloads
+                'skip_download': True,
+            }
         }
     ]
 
@@ -1694,7 +1721,9 @@ class GenericIE(InfoExtractor):
         mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_?[Ii]d'\s*:\s*'(?P<id>[^']+)',", webpage) or
                 re.search(r'(?s)(?P<q1>["\'])(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?(?P=q1).*?entry_?[Ii]d\s*:\s*(?P<q2>["\'])(?P<id>.+?)(?P=q2)', webpage))
         if mobj is not None:
-            return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura')
+            return self.url_result(smuggle_url(
+                'kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(),
+                {'source_url': url}), 'Kaltura')
 
         # Look for Eagle.Platform embeds
         mobj = re.search(
@@ -1739,7 +1768,7 @@ class GenericIE(InfoExtractor):
 
         # Look for UDN embeds
         mobj = re.search(
-            r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._VALID_URL, webpage)
+            r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._PROTOCOL_RELATIVE_VALID_URL, webpage)
         if mobj is not None:
             return self.url_result(
                 compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed')
@@ -1859,6 +1888,7 @@ class GenericIE(InfoExtractor):
 
         entries = []
         for video_url in found:
+            video_url = video_url.replace('\\/', '/')
             video_url = compat_urlparse.urljoin(url, video_url)
             video_id = compat_urllib_parse_unquote(os.path.basename(video_url))
 
@@ -1870,25 +1900,24 @@ class GenericIE(InfoExtractor):
             # here's a fun little line of code for you:
             video_id = os.path.splitext(video_id)[0]
 
+            entry_info_dict = {
+                'id': video_id,
+                'uploader': video_uploader,
+                'title': video_title,
+                'age_limit': age_limit,
+            }
+
             ext = determine_ext(video_url)
             if ext == 'smil':
-                entries.append({
-                    'id': video_id,
-                    'formats': self._extract_smil_formats(video_url, video_id),
-                    'uploader': video_uploader,
-                    'title': video_title,
-                    'age_limit': age_limit,
-                })
+                entry_info_dict['formats'] = self._extract_smil_formats(video_url, video_id)
             elif ext == 'xspf':
                 return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id)
+            elif ext == 'm3u8':
+                entry_info_dict['formats'] = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
             else:
-                entries.append({
-                    'id': video_id,
-                    'url': video_url,
-                    'uploader': video_uploader,
-                    'title': video_title,
-                    'age_limit': age_limit,
-                })
+                entry_info_dict['url'] = video_url
+
+            entries.append(entry_info_dict)
 
         if len(entries) == 1:
             return entries[0]
index fce179000786cfeacad39d9d65786aa9fea915b1..c158f206410467e8c66a8bc2526d0436cc1a4e3c 100644 (file)
@@ -10,7 +10,7 @@ from ..utils import (
 
 
 class InstagramIE(InfoExtractor):
-    _VALID_URL = r'https://instagram\.com/p/(?P<id>[^/?#&]+)'
+    _VALID_URL = r'https?://(?:www\.)?instagram\.com/p/(?P<id>[^/?#&]+)'
     _TESTS = [{
         'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc',
         'md5': '0d2da106a9d2631273e192b372806516',
index 0dcd6cd0503e262031cee4fe8810947f685807b5..583b1a5adbc2692d78b5c01dc0063e37822e910c 100644 (file)
@@ -2,12 +2,18 @@
 from __future__ import unicode_literals
 
 import re
+import base64
 
 from .common import InfoExtractor
-from ..compat import compat_urllib_parse
+from ..compat import (
+    compat_urllib_parse,
+    compat_urlparse,
+)
 from ..utils import (
+    clean_html,
     ExtractorError,
     int_or_none,
+    unsmuggle_url,
 )
 
 
@@ -121,31 +127,47 @@ class KalturaIE(InfoExtractor):
             video_id, actions, note='Downloading video info JSON')
 
     def _real_extract(self, url):
+        url, smuggled_data = unsmuggle_url(url, {})
+
         mobj = re.match(self._VALID_URL, url)
         partner_id = mobj.group('partner_id_s') or mobj.group('partner_id') or mobj.group('partner_id_html5')
         entry_id = mobj.group('id_s') or mobj.group('id') or mobj.group('id_html5')
 
         info, source_data = self._get_video_info(entry_id, partner_id)
 
-        formats = [{
-            'format_id': '%(fileExt)s-%(bitrate)s' % f,
-            'ext': f['fileExt'],
-            'tbr': f['bitrate'],
-            'fps': f.get('frameRate'),
-            'filesize_approx': int_or_none(f.get('size'), invscale=1024),
-            'container': f.get('containerFormat'),
-            'vcodec': f.get('videoCodecId'),
-            'height': f.get('height'),
-            'width': f.get('width'),
-            'url': '%s/flavorId/%s' % (info['dataUrl'], f['id']),
-        } for f in source_data['flavorAssets']]
+        source_url = smuggled_data.get('source_url')
+        if source_url:
+            referrer = base64.b64encode(
+                '://'.join(compat_urlparse.urlparse(source_url)[:2])
+                .encode('utf-8')).decode('utf-8')
+        else:
+            referrer = None
+
+        formats = []
+        for f in source_data['flavorAssets']:
+            video_url = '%s/flavorId/%s' % (info['dataUrl'], f['id'])
+            if referrer:
+                video_url += '?referrer=%s' % referrer
+            formats.append({
+                'format_id': '%(fileExt)s-%(bitrate)s' % f,
+                'ext': f.get('fileExt'),
+                'tbr': int_or_none(f['bitrate']),
+                'fps': int_or_none(f.get('frameRate')),
+                'filesize_approx': int_or_none(f.get('size'), invscale=1024),
+                'container': f.get('containerFormat'),
+                'vcodec': f.get('videoCodecId'),
+                'height': int_or_none(f.get('height')),
+                'width': int_or_none(f.get('width')),
+                'url': video_url,
+            })
+        self._check_formats(formats, entry_id)
         self._sort_formats(formats)
 
         return {
             'id': entry_id,
             'title': info['name'],
             'formats': formats,
-            'description': info.get('description'),
+            'description': clean_html(info.get('description')),
             'thumbnail': info.get('thumbnailUrl'),
             'duration': info.get('duration'),
             'timestamp': info.get('createdAt'),
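
The referrer protection handled above boils down to base64-encoding the embedding page's scheme and host and appending the result to each flavor URL. A standalone sketch, using Python 3's urllib.parse instead of the compat layer (add_referrer and the example URLs are illustrative only):

import base64
from urllib.parse import urlparse

def add_referrer(flavor_url, source_url):
    parsed = urlparse(source_url)
    # equivalent to '://'.join(compat_urlparse.urlparse(source_url)[:2]) above
    origin = '%s://%s' % (parsed.scheme, parsed.netloc)
    referrer = base64.b64encode(origin.encode('utf-8')).decode('utf-8')
    return '%s?referrer=%s' % (flavor_url, referrer)

# add_referrer('http://cdn.example.com/flavorId/1_abc', 'http://www.disney.nl/page')
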
index 8fb9b1849cfd96e5ce21ef2ffcffba200f8ba482..b787e2a73c66a0ff1bf2d17b9f20c13bf166aae7 100644 (file)
@@ -22,7 +22,7 @@ class PBSIE(InfoExtractor):
            # Article with embedded player (or direct video)
            (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) |
            # Player
-           video\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/
+           (?:video|player)\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/
         )
     '''
 
@@ -170,6 +170,10 @@ class PBSIE(InfoExtractor):
             'params': {
                 'skip_download': True,  # requires ffmpeg
             },
+        },
+        {
+            'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true',
+            'only_matching': True,
         }
     ]
     _ERRORS = {
@@ -259,7 +263,7 @@ class PBSIE(InfoExtractor):
             return self.playlist_result(entries, display_id)
 
         info = self._download_json(
-            'http://video.pbs.org/videoInfo/%s?format=json&type=partner' % video_id,
+            'http://player.pbs.org/videoInfo/%s?format=json&type=partner' % video_id,
             display_id)
 
         formats = []
index fd32836ccaf61b171f9dcafbbfce24c7764b18c1..792316db849bcb4c165a5d25b153a526e091788e 100644 (file)
@@ -1,7 +1,8 @@
 from __future__ import unicode_literals
 
-import re
 import json
+import random
+import collections
 
 from .common import InfoExtractor
 from ..compat import (
@@ -17,13 +18,18 @@ from ..utils import (
 )
 
 
-class PluralsightIE(InfoExtractor):
+class PluralsightBaseIE(InfoExtractor):
+    _API_BASE = 'http://app.pluralsight.com'
+
+
+class PluralsightIE(PluralsightBaseIE):
     IE_NAME = 'pluralsight'
-    _VALID_URL = r'https?://(?:www\.)?pluralsight\.com/training/player\?author=(?P<author>[^&]+)&name=(?P<name>[^&]+)(?:&mode=live)?&clip=(?P<clip>\d+)&course=(?P<course>[^&]+)'
-    _LOGIN_URL = 'https://www.pluralsight.com/id/'
+    _VALID_URL = r'https?://(?:(?:www|app)\.)?pluralsight\.com/training/player\?'
+    _LOGIN_URL = 'https://app.pluralsight.com/id/'
+
     _NETRC_MACHINE = 'pluralsight'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.pluralsight.com/training/player?author=mike-mckeown&name=hosting-sql-server-windows-azure-iaas-m7-mgmt&mode=live&clip=3&course=hosting-sql-server-windows-azure-iaas',
         'md5': '4d458cf5cf4c593788672419a8dd4cf8',
         'info_dict': {
@@ -33,7 +39,14 @@ class PluralsightIE(InfoExtractor):
             'duration': 338,
         },
         'skip': 'Requires pluralsight account credentials',
-    }
+    }, {
+        'url': 'https://app.pluralsight.com/training/player?course=angularjs-get-started&author=scott-allen&name=angularjs-get-started-m1-introduction&clip=0&mode=live',
+        'only_matching': True,
+    }, {
+        # available without pluralsight account
+        'url': 'http://app.pluralsight.com/training/player?author=scott-allen&name=angularjs-get-started-m1-introduction&mode=live&clip=0&course=angularjs-get-started',
+        'only_matching': True,
+    }]
 
     def _real_initialize(self):
         self._login()
@@ -41,7 +54,7 @@ class PluralsightIE(InfoExtractor):
     def _login(self):
         (username, password) = self._get_login_info()
         if username is None:
-            self.raise_login_required('Pluralsight account is required')
+            return
 
         login_page = self._download_webpage(
             self._LOGIN_URL, None, 'Downloading login page')
@@ -73,30 +86,47 @@ class PluralsightIE(InfoExtractor):
         if error:
             raise ExtractorError('Unable to login: %s' % error, expected=True)
 
+        if all(p not in response for p in ('__INITIAL_STATE__', '"currentUser"')):
+            raise ExtractorError('Unable to log in')
+
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        author = mobj.group('author')
-        name = mobj.group('name')
-        clip_id = mobj.group('clip')
-        course = mobj.group('course')
+        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+
+        author = qs.get('author', [None])[0]
+        name = qs.get('name', [None])[0]
+        clip_id = qs.get('clip', [None])[0]
+        course = qs.get('course', [None])[0]
+
+        if any(not f for f in (author, name, clip_id, course,)):
+            raise ExtractorError('Invalid URL', expected=True)
 
         display_id = '%s-%s' % (name, clip_id)
 
         webpage = self._download_webpage(url, display_id)
 
-        collection = self._parse_json(
-            self._search_regex(
-                r'moduleCollection\s*:\s*new\s+ModuleCollection\((\[.+?\])\s*,\s*\$rootScope\)',
-                webpage, 'modules'),
-            display_id)
+        modules = self._search_regex(
+            r'moduleCollection\s*:\s*new\s+ModuleCollection\((\[.+?\])\s*,\s*\$rootScope\)',
+            webpage, 'modules', default=None)
+
+        if modules:
+            collection = self._parse_json(modules, display_id)
+        else:
+            # The webpage may be served in a different layout (see
+            # https://github.com/rg3/youtube-dl/issues/7607)
+            collection = self._parse_json(
+                self._search_regex(
+                    r'var\s+initialState\s*=\s*({.+?});\n', webpage, 'initial state'),
+                display_id)['course']['modules']
 
         module, clip = None, None
 
         for module_ in collection:
-            if module_.get('moduleName') == name:
+            if name in (module_.get('moduleName'), module_.get('name')):
                 module = module_
                 for clip_ in module_.get('clips', []):
                     clip_index = clip_.get('clipIndex')
+                    if clip_index is None:
+                        clip_index = clip_.get('index')
                     if clip_index is None:
                         continue
                     if compat_str(clip_index) == clip_id:
@@ -112,13 +142,33 @@ class PluralsightIE(InfoExtractor):
             'high': {'width': 1024, 'height': 768},
         }
 
+        AllowedQuality = collections.namedtuple('AllowedQuality', ['ext', 'qualities'])
+
         ALLOWED_QUALITIES = (
-            ('webm', ('high',)),
-            ('mp4', ('low', 'medium', 'high',)),
+            AllowedQuality('webm', ('high',)),
+            AllowedQuality('mp4', ('low', 'medium', 'high',)),
         )
 
+        # In order to minimize the number of calls to the ViewClip API and reduce
+        # the probability of being throttled or banned by Pluralsight, we request
+        # only a single format unless a formats listing is explicitly requested.
+        if self._downloader.params.get('listformats', False):
+            allowed_qualities = ALLOWED_QUALITIES
+        else:
+            def guess_allowed_qualities():
+                req_format = self._downloader.params.get('format') or 'best'
+                req_format_split = req_format.split('-')
+                if len(req_format_split) > 1:
+                    req_ext, req_quality = req_format_split
+                    for allowed_quality in ALLOWED_QUALITIES:
+                        if req_ext == allowed_quality.ext and req_quality in allowed_quality.qualities:
+                            return (AllowedQuality(req_ext, (req_quality, )), )
+                req_ext = 'webm' if self._downloader.params.get('prefer_free_formats') else 'mp4'
+                return (AllowedQuality(req_ext, ('high', )), )
+            allowed_qualities = guess_allowed_qualities()
+
         formats = []
-        for ext, qualities in ALLOWED_QUALITIES:
+        for ext, qualities in allowed_qualities:
             for quality in qualities:
                 f = QUALITIES[quality].copy()
                 clip_post = {
@@ -132,12 +182,23 @@ class PluralsightIE(InfoExtractor):
                     'q': '%dx%d' % (f['width'], f['height']),
                 }
                 request = compat_urllib_request.Request(
-                    'http://www.pluralsight.com/training/Player/ViewClip',
+                    '%s/training/Player/ViewClip' % self._API_BASE,
                     json.dumps(clip_post).encode('utf-8'))
                 request.add_header('Content-Type', 'application/json;charset=utf-8')
                 format_id = '%s-%s' % (ext, quality)
                 clip_url = self._download_webpage(
                     request, display_id, 'Downloading %s URL' % format_id, fatal=False)
+
+                # Pluralsight tracks multiple sequential calls to the ViewClip API
+                # and starts to return 429 HTTP errors after some time (see
+                # https://github.com/rg3/youtube-dl/pull/6989). It may even lead to
+                # an account ban (see https://github.com/rg3/youtube-dl/issues/6842).
+                # To somewhat reduce the probability of these consequences we sleep
+                # a random amount of time before each call to ViewClip.
+                self._sleep(
+                    random.randint(2, 5), display_id,
+                    '%(video_id)s: Waiting for %(timeout)s seconds to avoid throttling')
+
                 if not clip_url:
                     continue
                 f.update({
@@ -163,10 +224,10 @@ class PluralsightIE(InfoExtractor):
         }
 
 
-class PluralsightCourseIE(InfoExtractor):
+class PluralsightCourseIE(PluralsightBaseIE):
     IE_NAME = 'pluralsight:course'
-    _VALID_URL = r'https?://(?:www\.)?pluralsight\.com/courses/(?P<id>[^/]+)'
-    _TEST = {
+    _VALID_URL = r'https?://(?:(?:www|app)\.)?pluralsight\.com/(?:library/)?courses/(?P<id>[^/]+)'
+    _TESTS = [{
         # Free course from Pluralsight Starter Subscription for Microsoft TechNet
         # https://offers.pluralsight.com/technet?loc=zTS3z&prod=zOTprodz&tech=zOttechz&prog=zOTprogz&type=zSOz&media=zOTmediaz&country=zUSz
         'url': 'http://www.pluralsight.com/courses/hosting-sql-server-windows-azure-iaas',
@@ -176,7 +237,14 @@ class PluralsightCourseIE(InfoExtractor):
             'description': 'md5:61b37e60f21c4b2f91dc621a977d0986',
         },
         'playlist_count': 31,
-    }
+    }, {
+        # available without pluralsight account
+        'url': 'https://www.pluralsight.com/courses/angularjs-get-started',
+        'only_matching': True,
+    }, {
+        'url': 'https://app.pluralsight.com/library/courses/understanding-microsoft-azure-amazon-aws/table-of-contents',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         course_id = self._match_id(url)
@@ -184,14 +252,14 @@ class PluralsightCourseIE(InfoExtractor):
         # TODO: PSM cookie
 
         course = self._download_json(
-            'http://www.pluralsight.com/data/course/%s' % course_id,
+            '%s/data/course/%s' % (self._API_BASE, course_id),
             course_id, 'Downloading course JSON')
 
         title = course['title']
         description = course.get('description') or course.get('shortDescription')
 
         course_data = self._download_json(
-            'http://www.pluralsight.com/data/course/content/%s' % course_id,
+            '%s/data/course/content/%s' % (self._API_BASE, course_id),
             course_id, 'Downloading course data JSON')
 
         entries = []
@@ -201,7 +269,7 @@ class PluralsightCourseIE(InfoExtractor):
                 if not player_parameters:
                     continue
                 entries.append(self.url_result(
-                    'http://www.pluralsight.com/training/player?%s' % player_parameters,
+                    '%s/training/player?%s' % (self._API_BASE, player_parameters),
                     'Pluralsight'))
 
         return self.playlist_result(entries, course_id, title, description)
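
Both mitigations added to PluralsightIE above are small enough to restate in isolation: pick a single ext/quality pair unless a formats listing was requested, and add random back-off before each ViewClip call. A condensed sketch, assuming a plain requested-format string; pick_qualities is an illustrative name, and the prefer_free_formats fallback from the real code is omitted:

import collections
import random
import time

AllowedQuality = collections.namedtuple('AllowedQuality', ['ext', 'qualities'])

ALLOWED_QUALITIES = (
    AllowedQuality('webm', ('high',)),
    AllowedQuality('mp4', ('low', 'medium', 'high')),
)

def pick_qualities(requested_format='best', list_formats=False):
    if list_formats:
        return ALLOWED_QUALITIES
    parts = (requested_format or 'best').split('-')
    if len(parts) == 2:
        ext, quality = parts
        for aq in ALLOWED_QUALITIES:
            if ext == aq.ext and quality in aq.qualities:
                return (AllowedQuality(ext, (quality,)),)
    return (AllowedQuality('mp4', ('high',)),)

for ext, qualities in pick_qualities('mp4-low'):
    for quality in qualities:
        time.sleep(random.randint(2, 5))  # random back-off before each ViewClip request
        # ... issue the ViewClip API request here ...
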
index dbb2c3bd95fdd88df1edb6ea7a1a416262076620..57c78ba52a994a9c2aff224470b86b913702241f 100644 (file)
@@ -36,7 +36,8 @@ class PornHdIE(InfoExtractor):
         webpage = self._download_webpage(url, display_id or video_id)
 
         title = self._html_search_regex(
-            r'<title>(.+) porn HD.+?</title>', webpage, 'title')
+            [r'<span[^>]+class=["\']video-name["\'][^>]*>([^<]+)',
+             r'<title>(.+?) - .*?[Pp]ornHD.*?</title>'], webpage, 'title')
         description = self._html_search_regex(
             r'<div class="description">([^<]+)</div>', webpage, 'description', fatal=False)
         view_count = int_or_none(self._html_search_regex(
index 5b97d33caec2a08c79e648483ec67a1c288b6e4f..0fe6356dbf9b35f96b7de28ecd5df29b17dc7ae3 100644 (file)
@@ -6,7 +6,7 @@ import re
 import time
 
 from .common import InfoExtractor
-from ..compat import compat_urllib_request, compat_urlparse
+from ..compat import compat_urllib_request
 from ..utils import (
     ExtractorError,
     float_or_none,
@@ -107,15 +107,9 @@ class RTVEALaCartaIE(InfoExtractor):
         png = self._download_webpage(png_request, video_id, 'Downloading url information')
         video_url = _decrypt_url(png)
         if not video_url.endswith('.f4m'):
-            auth_url = video_url.replace(
+            video_url = video_url.replace(
                 'resources/', 'auth/resources/'
             ).replace('.net.rtve', '.multimedia.cdn.rtve')
-            video_path = self._download_webpage(
-                auth_url, video_id, 'Getting video url')
-            # Use mvod1.akcdn instead of flash.akamaihd.multimedia.cdn to get
-            # the right Content-Length header and the mp4 format
-            video_url = compat_urlparse.urljoin(
-                'http://mvod1.akcdn.rtve.es/', video_path)
 
         subtitles = None
         if info.get('sbtFile') is not None:
index d94dc7399f33d91543dcf85a37cab7863304ea15..6b09550b01d19b76484adb3301ec2a56156b5b1f 100644 (file)
@@ -9,7 +9,7 @@ from ..compat import (
     compat_str,
 )
 from ..utils import (
-    ExtractorError,
+    determine_ext,
     unified_strdate,
 )
 
@@ -51,10 +51,25 @@ class RutubeIE(InfoExtractor):
             'http://rutube.ru/api/play/options/%s/?format=json' % video_id,
             video_id, 'Downloading options JSON')
 
-        m3u8_url = options['video_balancer'].get('m3u8')
-        if m3u8_url is None:
-            raise ExtractorError('Couldn\'t find m3u8 manifest url')
-        formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4')
+        formats = []
+        for format_id, format_url in options['video_balancer'].items():
+            ext = determine_ext(format_url)
+            if ext == 'm3u8':
+                m3u8_formats = self._extract_m3u8_formats(
+                    format_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)
+                if m3u8_formats:
+                    formats.extend(m3u8_formats)
+            elif ext == 'f4m':
+                f4m_formats = self._extract_f4m_formats(
+                    format_url, video_id, f4m_id=format_id, fatal=False)
+                if f4m_formats:
+                    formats.extend(f4m_formats)
+            else:
+                formats.append({
+                    'url': format_url,
+                    'format_id': format_id,
+                })
+        self._sort_formats(formats)
 
         return {
             'id': video['id'],
@@ -74,9 +89,9 @@ class RutubeIE(InfoExtractor):
 class RutubeEmbedIE(InfoExtractor):
     IE_NAME = 'rutube:embed'
     IE_DESC = 'Rutube embedded videos'
-    _VALID_URL = 'https?://rutube\.ru/video/embed/(?P<id>[0-9]+)'
+    _VALID_URL = 'https?://rutube\.ru/(?:video|play)/embed/(?P<id>[0-9]+)'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://rutube.ru/video/embed/6722881?vk_puid37=&vk_puid38=',
         'info_dict': {
             'id': 'a10e53b86e8f349080f718582ce4c661',
@@ -90,7 +105,10 @@ class RutubeEmbedIE(InfoExtractor):
         'params': {
             'skip_download': 'Requires ffmpeg',
         },
-    }
+    }, {
+        'url': 'http://rutube.ru/play/embed/8083783',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         embed_id = self._match_id(url)
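
The dispatch introduced in RutubeIE above generalizes to any balancer-style dict of format URLs: route each URL by its extension and fall back to a plain format entry. A sketch, with extract_m3u8/extract_f4m as stand-ins for youtube-dl's _extract_m3u8_formats/_extract_f4m_formats:

from youtube_dl.utils import determine_ext

def collect_formats(video_balancer, extract_m3u8, extract_f4m):
    formats = []
    for format_id, format_url in video_balancer.items():
        ext = determine_ext(format_url)
        if ext == 'm3u8':
            formats.extend(extract_m3u8(format_url) or [])
        elif ext == 'f4m':
            formats.extend(extract_f4m(format_url) or [])
        else:
            # direct URL, keep the balancer key as the format id
            formats.append({'url': format_url, 'format_id': format_id})
    return formats
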
index 2b60d354a15b19f2ae72a24ec793399f9bab0744..02e64e09436a5299c5d4f87f1a3ba871c63af230 100644 (file)
@@ -4,13 +4,17 @@ from __future__ import unicode_literals
 import re
 import itertools
 
-from .common import InfoExtractor
+from .common import (
+    InfoExtractor,
+    SearchInfoExtractor
+)
 from ..compat import (
     compat_str,
     compat_urlparse,
     compat_urllib_parse,
 )
 from ..utils import (
+    encode_dict,
     ExtractorError,
     int_or_none,
     unified_strdate,
@@ -469,3 +473,60 @@ class SoundcloudPlaylistIE(SoundcloudIE):
             'description': data.get('description'),
             'entries': entries,
         }
+
+
+class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE):
+    IE_NAME = 'soundcloud:search'
+    IE_DESC = 'Soundcloud search'
+    _MAX_RESULTS = float('inf')
+    _TESTS = [{
+        'url': 'scsearch15:post-avant jazzcore',
+        'info_dict': {
+            'title': 'post-avant jazzcore',
+        },
+        'playlist_count': 15,
+    }]
+
+    _SEARCH_KEY = 'scsearch'
+    _MAX_RESULTS_PER_PAGE = 200
+    _DEFAULT_RESULTS_PER_PAGE = 50
+    _API_V2_BASE = 'https://api-v2.soundcloud.com'
+
+    def _get_collection(self, endpoint, collection_id, **query):
+        limit = min(
+            query.get('limit', self._DEFAULT_RESULTS_PER_PAGE),
+            self._MAX_RESULTS_PER_PAGE)
+        query['limit'] = limit
+        query['client_id'] = self._CLIENT_ID
+        query['linked_partitioning'] = '1'
+        query['offset'] = 0
+        data = compat_urllib_parse.urlencode(encode_dict(query))
+        next_url = '{0}{1}?{2}'.format(self._API_V2_BASE, endpoint, data)
+
+        collected_results = 0
+
+        for i in itertools.count(1):
+            response = self._download_json(
+                next_url, collection_id, 'Downloading page {0}'.format(i),
+                'Unable to download API page')
+
+            collection = response.get('collection', [])
+            if not collection:
+                break
+
+            collection = list(filter(bool, collection))
+            collected_results += len(collection)
+
+            for item in collection:
+                yield self.url_result(item['uri'], SoundcloudIE.ie_key())
+
+            if not collection or collected_results >= limit:
+                break
+
+            next_url = response.get('next_href')
+            if not next_url:
+                break
+
+    def _get_n_results(self, query, n):
+        tracks = self._get_collection('/search/tracks', query, limit=n, q=query)
+        return self.playlist_result(tracks, playlist_title=query)
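
The new soundcloud:search pagination follows next_href links until the requested limit is reached or the collection runs dry. A trimmed-down sketch of the same loop, assuming requests in place of _download_json and a CLIENT_ID you would have to supply yourself:

import itertools
import requests  # stands in for youtube-dl's _download_json

API_V2_BASE = 'https://api-v2.soundcloud.com'
CLIENT_ID = 'YOUR_CLIENT_ID'  # placeholder

def search_tracks(query, limit=50):
    params = {
        'q': query,
        'limit': min(limit, 200),  # _MAX_RESULTS_PER_PAGE
        'client_id': CLIENT_ID,
        'linked_partitioning': '1',
        'offset': 0,
    }
    next_url = API_V2_BASE + '/search/tracks'
    collected = 0
    for page_num in itertools.count(1):
        if page_num == 1:
            response = requests.get(next_url, params=params).json()
        else:
            response = requests.get(next_url).json()
        # drop falsy entries, as the extractor does with filter(bool, ...)
        collection = [item for item in response.get('collection', []) if item]
        if not collection:
            break
        for item in collection:
            yield item['uri']
        collected += len(collection)
        if collected >= limit:
            break
        next_url = response.get('next_href')
        if not next_url:
            break
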
index 43315e75db76311d09983b98f2f52cedd0b5bb20..1555aa77cac30c18de3f0c2db9e13ea00cc569f6 100644 (file)
@@ -187,8 +187,12 @@ class ThePlatformIE(ThePlatformBaseIE):
             # There seems to be no pattern for the script filename of interest,
             # so try them one by one
             for script in reversed(scripts):
-                feed_script = self._download_webpage(script, video_id, 'Downloading feed script')
-                feed_id = self._search_regex(r'defaultFeedId\s*:\s*"([^"]+)"', feed_script, 'default feed id', default=None)
+                feed_script = self._download_webpage(
+                    self._proto_relative_url(script, 'http:'),
+                    video_id, 'Downloading feed script')
+                feed_id = self._search_regex(
+                    r'defaultFeedId\s*:\s*"([^"]+)"', feed_script,
+                    'default feed id', default=None)
                 if feed_id is not None:
                     break
             if feed_id is None:
index 2151f83382d6b3185722b54de2d0eab2a988c6ae..ee35b7227372c0ddc128dfc694577578f9fc6009 100644 (file)
@@ -12,7 +12,8 @@ from ..compat import compat_urlparse
 
 class UDNEmbedIE(InfoExtractor):
     IE_DESC = '聯合影音'
-    _VALID_URL = r'https?://video\.udn\.com/(?:embed|play)/news/(?P<id>\d+)'
+    _PROTOCOL_RELATIVE_VALID_URL = r'//video\.udn\.com/(?:embed|play)/news/(?P<id>\d+)'
+    _VALID_URL = r'https?:' + _PROTOCOL_RELATIVE_VALID_URL
     _TESTS = [{
         'url': 'http://video.udn.com/embed/news/300040',
         'md5': 'de06b4c90b042c128395a88f0384817e',
index b72341a2bef0d97de284ed987265aeedc0b62011..057c72f39c13884b02871e5abf444bd6e73691ec 100644 (file)
@@ -189,6 +189,10 @@ class VimeoIE(VimeoBaseInfoExtractor):
             'note': 'Video not completely processed, "failed" seed status',
             'only_matching': True,
         },
+        {
+            'url': 'https://vimeo.com/groups/travelhd/videos/22439234',
+            'only_matching': True,
+        },
     ]
 
     @staticmethod
@@ -486,8 +490,7 @@ class VimeoChannelIE(VimeoBaseInfoExtractor):
             password_request, list_id,
             'Verifying the password', 'Wrong password')
 
-    def _extract_videos(self, list_id, base_url):
-        video_ids = []
+    def _title_and_entries(self, list_id, base_url):
         for pagenum in itertools.count(1):
             page_url = self._page_url(base_url, pagenum)
             webpage = self._download_webpage(
@@ -496,18 +499,18 @@ class VimeoChannelIE(VimeoBaseInfoExtractor):
 
             if pagenum == 1:
                 webpage = self._login_list_password(page_url, list_id, webpage)
+                yield self._extract_list_title(webpage)
+
+            for video_id in re.findall(r'id="clip_(\d+?)"', webpage):
+                yield self.url_result('https://vimeo.com/%s' % video_id, 'Vimeo')
 
-            video_ids.extend(re.findall(r'id="clip_(\d+?)"', webpage))
             if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:
                 break
 
-        entries = [self.url_result('https://vimeo.com/%s' % video_id, 'Vimeo')
-                   for video_id in video_ids]
-        return {'_type': 'playlist',
-                'id': list_id,
-                'title': self._extract_list_title(webpage),
-                'entries': entries,
-                }
+    def _extract_videos(self, list_id, base_url):
+        title_and_entries = self._title_and_entries(list_id, base_url)
+        list_title = next(title_and_entries)
+        return self.playlist_result(title_and_entries, list_id, list_title)
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
@@ -568,7 +571,7 @@ class VimeoAlbumIE(VimeoChannelIE):
 
 class VimeoGroupsIE(VimeoAlbumIE):
     IE_NAME = 'vimeo:group'
-    _VALID_URL = r'https://vimeo\.com/groups/(?P<name>[^/]+)'
+    _VALID_URL = r'https://vimeo\.com/groups/(?P<name>[^/]+)(?:/(?!videos?/\d+)|$)'
     _TESTS = [{
         'url': 'https://vimeo.com/groups/rolexawards',
         'info_dict': {
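
The Vimeo change above swaps a list-building loop for a generator that yields the list title first and then streams entries lazily, so the title is available before every page is fetched. The pattern in isolation (extract_title is a stand-in for _extract_list_title, and the sample page is fabricated):

import re

def extract_title(page):
    # stand-in for VimeoChannelIE._extract_list_title
    mobj = re.search(r'<title>(.*?)</title>', page)
    return mobj.group(1) if mobj else None

def title_and_entries(pages):
    # yield the title first, then every video URL, page by page
    for pagenum, page in enumerate(pages, 1):
        if pagenum == 1:
            yield extract_title(page)
        for video_id in re.findall(r'id="clip_(\d+?)"', page):
            yield 'https://vimeo.com/%s' % video_id

pages = ['<title>Demo list</title><div id="clip_123"></div>']
gen = title_and_entries(pages)
list_title = next(gen)  # 'Demo list', before any further page is touched
entries = list(gen)     # ['https://vimeo.com/123']
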
index 247769067c4789bee293aa240de4215359c3a261..1580c54fe779d9d300481fabfcfa4d2b5174ce06 100644 (file)
@@ -178,15 +178,13 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
             return
 
 
-class YoutubePlaylistBaseInfoExtractor(InfoExtractor):
-    # Extract the video ids from the playlist pages
+class YoutubeEntryListBaseInfoExtractor(InfoExtractor):
+    # Extract entries from page with "Load more" button
     def _entries(self, page, playlist_id):
         more_widget_html = content_html = page
         for page_num in itertools.count(1):
-            for video_id, video_title in self.extract_videos_from_page(content_html):
-                yield self.url_result(
-                    video_id, 'Youtube', video_id=video_id,
-                    video_title=video_title)
+            for entry in self._process_page(content_html):
+                yield entry
 
             mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
             if not mobj:
@@ -203,6 +201,12 @@ class YoutubePlaylistBaseInfoExtractor(InfoExtractor):
                 break
             more_widget_html = more['load_more_widget_html']
 
+
+class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
+    def _process_page(self, content):
+        for video_id, video_title in self.extract_videos_from_page(content):
+            yield self.url_result(video_id, 'Youtube', video_id, video_title)
+
     def extract_videos_from_page(self, page):
         ids_in_page = []
         titles_in_page = []
@@ -224,6 +228,19 @@ class YoutubePlaylistBaseInfoExtractor(InfoExtractor):
         return zip(ids_in_page, titles_in_page)
 
 
+class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
+    def _process_page(self, content):
+        for playlist_id in re.findall(r'href="/?playlist\?list=(.+?)"', content):
+            yield self.url_result(
+                'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist')
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+        webpage = self._download_webpage(url, playlist_id)
+        title = self._og_search_title(webpage, fatal=False)
+        return self.playlist_result(self._entries(webpage, playlist_id), playlist_id, title)
+
+
 class YoutubeIE(YoutubeBaseInfoExtractor):
     IE_DESC = 'YouTube.com'
     _VALID_URL = r"""(?x)^
@@ -1637,7 +1654,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor, YoutubePlaylistBaseInfoExtract
                 self.report_warning('Youtube gives an alert message: ' + match)
 
         playlist_title = self._html_search_regex(
-            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
+            r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
             page, 'title')
 
         return self.playlist_result(self._entries(page, playlist_id), playlist_id, playlist_title)
@@ -1764,6 +1781,29 @@ class YoutubeUserIE(YoutubeChannelIE):
             return super(YoutubeUserIE, cls).suitable(url)
 
 
+class YoutubeUserPlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
+    IE_DESC = 'YouTube.com user playlists'
+    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/user/(?P<id>[^/]+)/playlists'
+    IE_NAME = 'youtube:user:playlists'
+
+    _TESTS = [{
+        'url': 'http://www.youtube.com/user/ThirstForScience/playlists',
+        'playlist_mincount': 4,
+        'info_dict': {
+            'id': 'ThirstForScience',
+            'title': 'Thirst for Science',
+        },
+    }, {
+        # with "Load more" button
+        'url': 'http://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
+        'playlist_mincount': 70,
+        'info_dict': {
+            'id': 'igorkle1',
+            'title': 'Игорь Клейнер',
+        },
+    }]
+
+
 class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
     IE_DESC = 'YouTube.com searches'
     # there doesn't appear to be a real limit, for example if you search for
@@ -1859,7 +1899,7 @@ class YoutubeSearchURLIE(InfoExtractor):
         }
 
 
-class YoutubeShowIE(InfoExtractor):
+class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
     IE_DESC = 'YouTube.com (multi-season) shows'
     _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
     IE_NAME = 'youtube:show'
@@ -1873,26 +1913,9 @@ class YoutubeShowIE(InfoExtractor):
     }]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        playlist_id = mobj.group('id')
-        webpage = self._download_webpage(
-            'https://www.youtube.com/show/%s/playlists' % playlist_id, playlist_id, 'Downloading show webpage')
-        # There's one playlist for each season of the show
-        m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
-        self.to_screen('%s: Found %s seasons' % (playlist_id, len(m_seasons)))
-        entries = [
-            self.url_result(
-                'https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
-            for season in m_seasons
-        ]
-        title = self._og_search_title(webpage, fatal=False)
-
-        return {
-            '_type': 'playlist',
-            'id': playlist_id,
-            'title': title,
-            'entries': entries,
-        }
+        playlist_id = self._match_id(url)
+        return super(YoutubeShowIE, self)._real_extract(
+            'https://www.youtube.com/show/%s/playlists' % playlist_id)
 
 
 class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
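
The youtube.py refactoring above is a template method: pagination over the "Load more" button lives once in a base class, and subclasses only decide what each page yields (videos vs. playlists). A condensed standalone version, with the _download_json stub standing in for InfoExtractor's helper and error handling trimmed:

import itertools
import re

class EntryListBase(object):
    # Paginate over YouTube's "Load more" button; subclasses define _process_page

    def _download_json(self, url, playlist_id):
        raise NotImplementedError  # provided by InfoExtractor in youtube-dl

    def _process_page(self, content):
        raise NotImplementedError

    def _entries(self, page, playlist_id):
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            for entry in self._process_page(content_html):
                yield entry
            mobj = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break
            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

class PlaylistsPage(EntryListBase):
    def _process_page(self, content):
        for playlist_id in re.findall(r'href="/?playlist\?list=(.+?)"', content):
            yield 'https://www.youtube.com/playlist?list=%s' % playlist_id
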
index 3dd6d290b830615326c872aaa044156e350fb295..079fe7e8a35af5eedf6eb4f3c450deeada96d5bc 100644 (file)
@@ -363,7 +363,7 @@ def parseOpts(overrideArguments=None):
     subtitles.add_option(
         '--write-auto-sub', '--write-automatic-sub',
         action='store_true', dest='writeautomaticsub', default=False,
-        help='Write automatic subtitle file (YouTube only)')
+        help='Write automatically generated subtitle file (YouTube only)')
     subtitles.add_option(
         '--all-subs',
         action='store_true', dest='allsubtitles', default=False,
index d00b14b86d8ea8d43aef6bd7cf5a575cda5f94af..c0325f054ddfccd2c601acaee3812be5e00a4e6e 100644 (file)
@@ -925,6 +925,21 @@ def determine_ext(url, default_ext='unknown_video'):
     guess = url.partition('?')[0].rpartition('.')[2]
     if re.match(r'^[A-Za-z0-9]+$', guess):
         return guess
+    elif guess.rstrip('/') in (
+            'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
+            'flv', 'f4v', 'f4a', 'f4b',
+            'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
+            'mkv', 'mka', 'mk3d',
+            'avi', 'divx',
+            'mov',
+            'asf', 'wmv', 'wma',
+            '3gp', '3g2',
+            'mp3',
+            'flac',
+            'ape',
+            'wav',
+            'f4f', 'f4m', 'm3u8', 'smil'):
+        return guess.rstrip('/')
     else:
         return default_ext
 
@@ -1668,7 +1683,9 @@ def urlencode_postdata(*args, **kargs):
 
 
 def encode_dict(d, encoding='utf-8'):
-    return dict((k.encode(encoding), v.encode(encoding)) for k, v in d.items())
+    def encode(v):
+        return v.encode(encoding) if isinstance(v, compat_basestring) else v
+    return dict((encode(k), encode(v)) for k, v in d.items())
 
 
 US_RATINGS = {
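
The determine_ext change in this hunk means a known media extension followed by trailing slashes is now recognized, while arbitrary path segments still are not; the calls below mirror the new tests in test_utils.py. The encode_dict change likewise leaves non-string values (such as the integer offset used by soundcloud:search) untouched instead of crashing on .encode():

from youtube_dl.utils import determine_ext, encode_dict

assert determine_ext('http://example.com/foo/bar.mp4/?download') == 'mp4'
assert determine_ext('http://example.com/foo/bar.m3u8//?download') == 'm3u8'
# a bare path segment that merely names a container is not an extension
assert determine_ext('http://example.com/foo/bar/mp4?download', None) is None

# keys and string values are encoded, the int passes through unchanged
encode_dict({'q': 'post-avant jazzcore', 'offset': 0})
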
index 6f601cbb1e9a2104448b1f53ed1004856c69068c..2baf1ac4250e2e9e96b884d17a6d6fa767fdc824 100644 (file)
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '2015.11.15'
+__version__ = '2015.11.21'