Merge pull request #7681 from remitamine/skynewarabia
author remitamine <remitamine@gmail.com>
Thu, 3 Dec 2015 17:41:38 +0000 (18:41 +0100)
committer remitamine <remitamine@gmail.com>
Thu, 3 Dec 2015 17:41:38 +0000 (18:41 +0100)
[skynewsarabia] Add new extractor

18 files changed:
AUTHORS
Makefile
youtube_dl/downloader/hls.py
youtube_dl/extractor/bbc.py
youtube_dl/extractor/beeg.py
youtube_dl/extractor/bloomberg.py
youtube_dl/extractor/common.py
youtube_dl/extractor/cspan.py
youtube_dl/extractor/facebook.py
youtube_dl/extractor/gametrailers.py
youtube_dl/extractor/nrk.py
youtube_dl/extractor/pornhub.py
youtube_dl/extractor/spiegel.py
youtube_dl/extractor/udemy.py
youtube_dl/extractor/vodlocker.py
youtube_dl/extractor/youtube.py
youtube_dl/options.py
youtube_dl/utils.py

diff --git a/AUTHORS b/AUTHORS
index f465d20edcb4fc5bdb23a69b488333b83a608785..cdb56de3b9c3c65d3bcd8fa2326161f9d3fd2e3e 100644 (file)
--- a/AUTHORS
+++ b/AUTHORS
@@ -146,3 +146,4 @@ Lukáš Lalinský
 Qijiang Fan
 Rémy Léone
 Marco Ferragina
+reiv
index fdb1abb60cacfe49295a7438e3d0f4f51c248359..f826c16857846635e2c84ae8954d44f9aac3e48b 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -61,34 +61,34 @@ youtube-dl: youtube_dl/*.py youtube_dl/*/*.py
        chmod a+x youtube-dl
 
 README.md: youtube_dl/*.py youtube_dl/*/*.py
-       COLUMNS=80 python youtube_dl/__main__.py --help | python devscripts/make_readme.py
+       COLUMNS=80 $(PYTHON) youtube_dl/__main__.py --help | $(PYTHON) devscripts/make_readme.py
 
 CONTRIBUTING.md: README.md
-       python devscripts/make_contributing.py README.md CONTRIBUTING.md
+       $(PYTHON) devscripts/make_contributing.py README.md CONTRIBUTING.md
 
 supportedsites:
-       python devscripts/make_supportedsites.py docs/supportedsites.md
+       $(PYTHON) devscripts/make_supportedsites.py docs/supportedsites.md
 
 README.txt: README.md
        pandoc -f markdown -t plain README.md -o README.txt
 
 youtube-dl.1: README.md
-       python devscripts/prepare_manpage.py >youtube-dl.1.temp.md
+       $(PYTHON) devscripts/prepare_manpage.py >youtube-dl.1.temp.md
        pandoc -s -f markdown -t man youtube-dl.1.temp.md -o youtube-dl.1
        rm -f youtube-dl.1.temp.md
 
 youtube-dl.bash-completion: youtube_dl/*.py youtube_dl/*/*.py devscripts/bash-completion.in
-       python devscripts/bash-completion.py
+       $(PYTHON) devscripts/bash-completion.py
 
 bash-completion: youtube-dl.bash-completion
 
 youtube-dl.zsh: youtube_dl/*.py youtube_dl/*/*.py devscripts/zsh-completion.in
-       python devscripts/zsh-completion.py
+       $(PYTHON) devscripts/zsh-completion.py
 
 zsh-completion: youtube-dl.zsh
 
 youtube-dl.fish: youtube_dl/*.py youtube_dl/*/*.py devscripts/fish-completion.in
-       python devscripts/fish-completion.py
+       $(PYTHON) devscripts/fish-completion.py
 
 fish-completion: youtube-dl.fish
 
index 92765a3f9c6acdbc91136fbe00d24197af71fa29..b5a3e11676e72d070ec1a48c6483d0cc630c70a7 100644 (file)
@@ -13,6 +13,7 @@ from ..utils import (
     encodeArgument,
     encodeFilename,
     sanitize_open,
+    handle_youtubedl_headers,
 )
 
 
@@ -33,9 +34,10 @@ class HlsFD(FileDownloader):
         if info_dict['http_headers'] and re.match(r'^https?://', url):
             # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv:
             # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header.
+            headers = handle_youtubedl_headers(info_dict['http_headers'])
             args += [
                 '-headers',
-                ''.join('%s: %s\r\n' % (key, val) for key, val in info_dict['http_headers'].items() if key.lower() != 'accept-encoding')]
+                ''.join('%s: %s\r\n' % (key, val) for key, val in headers.items())]
 
         args += ['-i', url, '-f', 'mp4', '-c', 'copy', '-bsf:a', 'aac_adtstoasc']
 
index 33b296eafc0776d50edbc964170b1026b223bb71..7fb80aa38fc39825fd7338b40fc994ccc0c8a185 100644 (file)
@@ -22,7 +22,8 @@ from ..compat import (
 class BBCCoUkIE(InfoExtractor):
     IE_NAME = 'bbc.co.uk'
     IE_DESC = 'BBC iPlayer'
-    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:programmes/(?!articles/)|iplayer(?:/[^/]+)?/(?:episode/|playlist/))|music/clips[/#])(?P<id>[\da-z]{8})'
+    _ID_REGEX = r'[pb][\da-z]{7}'
+    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:programmes/(?!articles/)|iplayer(?:/[^/]+)?/(?:episode/|playlist/))|music/clips[/#])(?P<id>%s)' % _ID_REGEX
 
     _MEDIASELECTOR_URLS = [
         # Provides HQ HLS streams with even better quality that pc mediaset but fails
@@ -465,7 +466,7 @@ class BBCCoUkIE(InfoExtractor):
 
         if not programme_id:
             programme_id = self._search_regex(
-                r'"vpid"\s*:\s*"([\da-z]{8})"', webpage, 'vpid', fatal=False, default=None)
+                r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)
 
         if programme_id:
             formats, subtitles = self._download_media_selector(programme_id)
@@ -780,8 +781,9 @@ class BBCIE(BBCCoUkIE):
 
         # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
         programme_id = self._search_regex(
-            [r'data-video-player-vpid="([\da-z]{8})"',
-             r'<param[^>]+name="externalIdentifier"[^>]+value="([\da-z]{8})"'],
+            [r'data-video-player-vpid="(%s)"' % self._ID_REGEX,
+             r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
+             r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
             webpage, 'vpid', default=None)
 
         if programme_id:
@@ -816,7 +818,7 @@ class BBCIE(BBCCoUkIE):
 
         # Multiple video article (e.g.
         # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
-        EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+[\da-z]{8}(?:\b[^"]+)?'
+        EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
         entries = []
         for match in extract_all(r'new\s+SMP\(({.+?})\)'):
             embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
index 61bc2f7445a6fe115746b8f33b92421c2ab7e32c..1ee4a8b057f88560d40a266f4402f335714f0658 100644 (file)
@@ -29,7 +29,7 @@ class BeegIE(InfoExtractor):
         video_id = self._match_id(url)
 
         video = self._download_json(
-            'http://beeg.com/api/v1/video/%s' % video_id, video_id)
+            'http://beeg.com/api/v3/video/%s' % video_id, video_id)
 
         formats = []
         for format_id, video_url in video.items():
index 11ace91dd310b62b8071e2f5b4603c54c2f91d52..ebeef8f2ab19e9dfcf6cdbc9c30616958344876d 100644 (file)
@@ -6,7 +6,7 @@ from .common import InfoExtractor
 
 
 class BloombergIE(InfoExtractor):
-    _VALID_URL = r'https?://www\.bloomberg\.com/news/[^/]+/[^/]+/(?P<id>[^/?#]+)'
+    _VALID_URL = r'https?://(?:www\.)?bloomberg\.com/(?:[^/]+/)*(?P<id>[^/?#]+)'
 
     _TESTS = [{
         'url': 'http://www.bloomberg.com/news/videos/b/aaeae121-5949-481e-a1ce-4562db6f5df2',
@@ -20,22 +20,36 @@ class BloombergIE(InfoExtractor):
     }, {
         'url': 'http://www.bloomberg.com/news/articles/2015-11-12/five-strange-things-that-have-been-happening-in-financial-markets',
         'only_matching': True,
+    }, {
+        'url': 'http://www.bloomberg.com/politics/videos/2015-11-25/karl-rove-on-jeb-bush-s-struggles-stopping-trump',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
         name = self._match_id(url)
         webpage = self._download_webpage(url, name)
-        video_id = self._search_regex(r'"bmmrId":"(.+?)"', webpage, 'id')
+        video_id = self._search_regex(
+            r'["\']bmmrId["\']\s*:\s*(["\'])(?P<url>.+?)\1',
+            webpage, 'id', group='url')
         title = re.sub(': Video$', '', self._og_search_title(webpage))
 
         embed_info = self._download_json(
             'http://www.bloomberg.com/api/embed?id=%s' % video_id, video_id)
         formats = []
         for stream in embed_info['streams']:
-            if stream["muxing_format"] == "TS":
-                formats.extend(self._extract_m3u8_formats(stream['url'], video_id))
+            stream_url = stream.get('url')
+            if not stream_url:
+                continue
+            if stream['muxing_format'] == 'TS':
+                m3u8_formats = self._extract_m3u8_formats(
+                    stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False)
+                if m3u8_formats:
+                    formats.extend(m3u8_formats)
             else:
-                formats.extend(self._extract_f4m_formats(stream['url'], video_id))
+                f4m_formats = self._extract_f4m_formats(
+                    stream_url, video_id, f4m_id='hds', fatal=False)
+                if f4m_formats:
+                    formats.extend(f4m_formats)
         self._sort_formats(formats)
 
         return {
index eb9bfa3d15a2c5084fbf67f05a401474ad2f881d..6ab2d68d6f3137ff7a9b4b201102959a7e732e9a 100644 (file)
@@ -167,7 +167,7 @@ class InfoExtractor(object):
                     "ext" will be calculated from URL if missing
     automatic_captions: Like 'subtitles', used by the YoutubeIE for
                     automatically generated captions
-    duration:       Length of the video in seconds, as an integer.
+    duration:       Length of the video in seconds, as an integer or float.
     view_count:     How many users have watched the video on the platform.
     like_count:     Number of positive ratings of the video
     dislike_count:  Number of negative ratings of the video
index fbefd37d09a98bb19c82b4c09b7b08c99d147d35..7b685d157ddc07de54d59ba52cfc7db0940f6eb2 100644 (file)
@@ -9,6 +9,7 @@ from ..utils import (
     find_xpath_attr,
     smuggle_url,
     determine_ext,
+    ExtractorError,
 )
 from .senateisvp import SenateISVPIE
 
@@ -18,33 +19,32 @@ class CSpanIE(InfoExtractor):
     IE_DESC = 'C-SPAN'
     _TESTS = [{
         'url': 'http://www.c-span.org/video/?313572-1/HolderonV',
-        'md5': '8e44ce11f0f725527daccc453f553eb0',
+        'md5': '94b29a4f131ff03d23471dd6f60b6a1d',
         'info_dict': {
             'id': '315139',
             'ext': 'mp4',
             'title': 'Attorney General Eric Holder on Voting Rights Act Decision',
-            'description': 'Attorney General Eric Holder spoke to reporters following the Supreme Court decision in Shelby County v. Holder in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced until Congress established new guidelines for review.',
+            'description': 'Attorney General Eric Holder speaks to reporters following the Supreme Court decision in [Shelby County v. Holder], in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced.',
         },
         'skip': 'Regularly fails on travis, for unknown reasons',
     }, {
         'url': 'http://www.c-span.org/video/?c4486943/cspan-international-health-care-models',
-        # For whatever reason, the served video alternates between
-        # two different ones
+        'md5': '8e5fbfabe6ad0f89f3012a7943c1287b',
         'info_dict': {
-            'id': '340723',
+            'id': 'c4486943',
             'ext': 'mp4',
-            'title': 'International Health Care Models',
+            'title': 'CSPAN - International Health Care Models',
             'description': 'md5:7a985a2d595dba00af3d9c9f0783c967',
         }
     }, {
         'url': 'http://www.c-span.org/video/?318608-1/gm-ignition-switch-recall',
-        'md5': '446562a736c6bf97118e389433ed88d4',
+        'md5': '2ae5051559169baadba13fc35345ae74',
         'info_dict': {
             'id': '342759',
             'ext': 'mp4',
             'title': 'General Motors Ignition Switch Recall',
             'duration': 14848,
-            'description': 'md5:70c7c3b8fa63fa60d42772440596034c'
+            'description': 'md5:118081aedd24bf1d3b68b3803344e7f3'
         },
     }, {
         # Video from senate.gov
@@ -57,67 +57,80 @@ class CSpanIE(InfoExtractor):
     }]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        page_id = mobj.group('id')
-        webpage = self._download_webpage(url, page_id)
-        video_id = self._search_regex(r'progid=\'?([0-9]+)\'?>', webpage, 'video id')
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        matches = re.search(r'data-(prog|clip)id=\'([0-9]+)\'', webpage)
+        if matches:
+            video_type, video_id = matches.groups()
+            if video_type == 'prog':
+                video_type = 'program'
+        else:
+            senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
+            if senate_isvp_url:
+                title = self._og_search_title(webpage)
+                surl = smuggle_url(senate_isvp_url, {'force_title': title})
+                return self.url_result(surl, 'SenateISVP', video_id, title)
+            # Neither a program/clip id nor a Senate ISVP iframe was found;
+            # fail explicitly instead of hitting an unbound video_type below.
+            raise ExtractorError('Unable to find video id and type')
 
-        description = self._html_search_regex(
-            [
-                # The full description
-                r'<div class=\'expandable\'>(.*?)<a href=\'#\'',
-                # If the description is small enough the other div is not
-                # present, otherwise this is a stripped version
-                r'<p class=\'initial\'>(.*?)</p>'
-            ],
-            webpage, 'description', flags=re.DOTALL, default=None)
+        def get_text_attr(d, attr):
+            return d.get(attr, {}).get('#text')
 
-        info_url = 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=program&id=' + video_id
-        data = self._download_json(info_url, video_id)
+        data = self._download_json(
+            'http://www.c-span.org/assets/player/ajax-player.php?os=android&html5=%s&id=%s' % (video_type, video_id),
+            video_id)['video']
+        if data['@status'] != 'Success':
+            raise ExtractorError('%s said: %s' % (self.IE_NAME, get_text_attr(data, 'error')), expected=True)
 
         doc = self._download_xml(
-            'http://www.c-span.org/common/services/flashXml.php?programid=' + video_id,
+            'http://www.c-span.org/common/services/flashXml.php?%sid=%s' % (video_type, video_id),
             video_id)
 
+        description = self._html_search_meta('description', webpage)
+
         title = find_xpath_attr(doc, './/string', 'name', 'title').text
         thumbnail = find_xpath_attr(doc, './/string', 'name', 'poster').text
 
-        senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
-        if senate_isvp_url:
-            surl = smuggle_url(senate_isvp_url, {'force_title': title})
-            return self.url_result(surl, 'SenateISVP', video_id, title)
-
-        files = data['video']['files']
-        try:
-            capfile = data['video']['capfile']['#text']
-        except KeyError:
-            capfile = None
+        files = data['files']
+        capfile = get_text_attr(data, 'capfile')
 
-        entries = [{
-            'id': '%s_%d' % (video_id, partnum + 1),
-            'title': (
-                title if len(files) == 1 else
-                '%s part %d' % (title, partnum + 1)),
-            'url': unescapeHTML(f['path']['#text']),
-            'description': description,
-            'thumbnail': thumbnail,
-            'duration': int_or_none(f.get('length', {}).get('#text')),
-            'subtitles': {
-                'en': [{
-                    'url': capfile,
-                    'ext': determine_ext(capfile, 'dfxp')
-                }],
-            } if capfile else None,
-        } for partnum, f in enumerate(files)]
+        entries = []
+        for partnum, f in enumerate(files):
+            formats = []
+            for quality in f['qualities']:
+                formats.append({
+                    'format_id': '%s-%sp' % (get_text_attr(quality, 'bitrate'), get_text_attr(quality, 'height')),
+                    'url': unescapeHTML(get_text_attr(quality, 'file')),
+                    'height': int_or_none(get_text_attr(quality, 'height')),
+                    'tbr': int_or_none(get_text_attr(quality, 'bitrate')),
+                })
+            self._sort_formats(formats)
+            entries.append({
+                'id': '%s_%d' % (video_id, partnum + 1),
+                'title': (
+                    title if len(files) == 1 else
+                    '%s part %d' % (title, partnum + 1)),
+                'formats': formats,
+                'description': description,
+                'thumbnail': thumbnail,
+                'duration': int_or_none(get_text_attr(f, 'length')),
+                'subtitles': {
+                    'en': [{
+                        'url': capfile,
+                        'ext': determine_ext(capfile, 'dfxp')
+                    }],
+                } if capfile else None,
+            })
 
         if len(entries) == 1:
             entry = dict(entries[0])
-            entry['id'] = video_id
+            entry['id'] = 'c' + video_id if video_type == 'clip' else video_id
             return entry
         else:
             return {
                 '_type': 'playlist',
                 'entries': entries,
                 'title': title,
-                'id': video_id,
+                'id': 'c' + video_id if video_type == 'clip' else video_id,
             }
index fd854411b554185708d4edeeb338897e0a0da890..321eec59ef672eb6f88bf07f78466ba83456afbc 100644 (file)
@@ -164,7 +164,7 @@ class FacebookIE(InfoExtractor):
         if not video_title:
             video_title = self._html_search_regex(
                 r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>',
-                webpage, 'alternative title', fatal=False)
+                webpage, 'alternative title', default=None)
             video_title = limit_length(video_title, 80)
         if not video_title:
             video_title = 'Facebook video #%s' % video_id
index a6ab795aef1bab4a56b2655515983aed35886a77..c3f031d9cd4341184cc3b70eea77c1f360a1a3c6 100644 (file)
@@ -1,19 +1,62 @@
 from __future__ import unicode_literals
 
-from .mtv import MTVServicesInfoExtractor
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_age_limit,
+    url_basename,
+)
 
 
-class GametrailersIE(MTVServicesInfoExtractor):
-    _VALID_URL = r'http://www\.gametrailers\.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
+class GametrailersIE(InfoExtractor):
+    _VALID_URL = r'http://www\.gametrailers\.com/videos/view/[^/]+/(?P<id>.+)'
+
     _TEST = {
-        'url': 'http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer',
-        'md5': '4c8e67681a0ea7ec241e8c09b3ea8cf7',
+        'url': 'http://www.gametrailers.com/videos/view/gametrailers-com/116437-Just-Cause-3-Review',
+        'md5': 'f28c4efa0bdfaf9b760f6507955b6a6a',
         'info_dict': {
-            'id': '70e9a5d7-cf25-4a10-9104-6f3e7342ae0d',
+            'id': '2983958',
             'ext': 'mp4',
-            'title': 'E3 2013: Debut Trailer',
-            'description': 'Faith is back!  Check out the World Premiere trailer for Mirror\'s Edge 2 straight from the EA Press Conference at E3 2013!',
+            'display_id': '116437-Just-Cause-3-Review',
+            'title': 'Just Cause 3 - Review',
+            'description': 'It\'s a lot of fun to shoot at things and then watch them explode in Just Cause 3, but should there be more to the experience than that?',
         },
     }
 
-    _FEED_URL = 'http://www.gametrailers.com/feeds/mrss'
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        title = self._html_search_regex(
+            r'<title>(.+?)\|', webpage, 'title').strip()
+        embed_url = self._proto_relative_url(
+            self._search_regex(
+                r'src=\'(//embed.gametrailers.com/embed/[^\']+)\'', webpage,
+                'embed url'),
+            scheme='http:')
+        video_id = url_basename(embed_url)
+        embed_page = self._download_webpage(embed_url, video_id)
+        embed_vars_json = self._search_regex(
+            r'(?s)var embedVars = (\{.*?\})\s*</script>', embed_page,
+            'embed vars')
+        info = self._parse_json(embed_vars_json, video_id)
+
+        formats = []
+        for media in info['media']:
+            if media['mediaPurpose'] == 'play':
+                formats.append({
+                    'url': media['uri'],
+                    'height': media['height'],
+                    'width': media['width'],
+                })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': info.get('thumbUri'),
+            'description': self._og_search_description(webpage),
+            'duration': int_or_none(info.get('videoLengthInSeconds')),
+            'age_limit': parse_age_limit(info.get('audienceRating')),
+        }
index 8ac38a174b4c64bbc8a3b40ec36c10d6fab595fa..6ff13050dc7e5d6ee583678a2e07ea2383277891 100644 (file)
@@ -6,6 +6,7 @@ import re
 from .common import InfoExtractor
 from ..compat import compat_urlparse
 from ..utils import (
+    determine_ext,
     ExtractorError,
     float_or_none,
     parse_duration,
@@ -48,12 +49,22 @@ class NRKIE(InfoExtractor):
             'http://v8.psapi.nrk.no/mediaelement/%s' % video_id,
             video_id, 'Downloading media JSON')
 
-        if data['usageRights']['isGeoBlocked']:
-            raise ExtractorError(
-                'NRK har ikke rettigheter til å vise dette programmet utenfor Norge',
-                expected=True)
+        media_url = data.get('mediaUrl')
 
-        video_url = data['mediaUrl'] + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81'
+        if not media_url:
+            if data['usageRights']['isGeoBlocked']:
+                raise ExtractorError(
+                    'NRK har ikke rettigheter til å vise dette programmet utenfor Norge',
+                    expected=True)
+
+        if determine_ext(media_url) == 'f4m':
+            formats = self._extract_f4m_formats(
+                media_url + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81', video_id, f4m_id='hds')
+        else:
+            formats = [{
+                'url': media_url,
+                'ext': 'flv',
+            }]
 
         duration = parse_duration(data.get('duration'))
 
@@ -67,12 +78,11 @@ class NRKIE(InfoExtractor):
 
         return {
             'id': video_id,
-            'url': video_url,
-            'ext': 'flv',
             'title': data['title'],
             'description': data['description'],
             'duration': duration,
             'thumbnail': thumbnail,
+            'formats': formats,
         }
 
 
index 965940a4b07fc5d17fa61cf1208a029ce4794216..08275687dde33e4668c167c1db4831d36427cd41 100644 (file)
@@ -147,7 +147,8 @@ class PornHubPlaylistIE(InfoExtractor):
 
         entries = [
             self.url_result('http://www.pornhub.com/%s' % video_url, 'PornHub')
-            for video_url in set(re.findall('href="/?(view_video\.php\?viewkey=\d+[^"]*)"', webpage))
+            for video_url in set(re.findall(
+                r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"', webpage))
         ]
 
         playlist = self._parse_json(
index 5bd3c00875234c5efcf68772178b672261cc2a9f..39a7aaf9d630203dc1796b3b5621aad3c433f575 100644 (file)
@@ -58,7 +58,8 @@ class SpiegelIE(InfoExtractor):
         description = self._html_search_meta('description', webpage, 'description')
 
         base_url = self._search_regex(
-            r'var\s+server\s*=\s*"([^"]+)\"', webpage, 'server URL')
+            [r'server\s*:\s*(["\'])(?P<url>.+?)\1', r'var\s+server\s*=\s*"(?P<url>[^"]+)\"'],
+            webpage, 'server URL', group='url')
 
         xml_url = base_url + video_id + '.xml'
         idoc = self._download_xml(xml_url, video_id)
index 825172806ab62b2edc28c205a299809023297281..59832b1ece75d480afdfa16c3c398a701f532a06 100644 (file)
@@ -1,14 +1,15 @@
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
 from ..compat import (
+    compat_HTTPError,
     compat_urllib_parse,
     compat_urllib_request,
 )
 from ..utils import (
     ExtractorError,
+    float_or_none,
+    int_or_none,
     sanitized_Request,
 )
 
@@ -18,6 +19,8 @@ class UdemyIE(InfoExtractor):
     _VALID_URL = r'https?://www\.udemy\.com/(?:[^#]+#/lecture/|lecture/view/?\?lectureId=)(?P<id>\d+)'
     _LOGIN_URL = 'https://www.udemy.com/join/login-popup/?displayType=ajax&showSkipButton=1'
     _ORIGIN_URL = 'https://www.udemy.com'
+    _SUCCESSFULLY_ENROLLED = '>You have enrolled in this course!<'
+    _ALREADY_ENROLLED = '>You are already taking this course.<'
     _NETRC_MACHINE = 'udemy'
 
     _TESTS = [{
@@ -33,6 +36,29 @@ class UdemyIE(InfoExtractor):
         'skip': 'Requires udemy account credentials',
     }]
 
+    def _enroll_course(self, webpage, course_id):
+        enroll_url = self._search_regex(
+            r'href=(["\'])(?P<url>https?://(?:www\.)?udemy\.com/course/subscribe/.+?)\1',
+            webpage, 'enroll url', group='url',
+            default='https://www.udemy.com/course/subscribe/?courseId=%s' % course_id)
+        webpage = self._download_webpage(enroll_url, course_id, 'Enrolling in the course')
+        if self._SUCCESSFULLY_ENROLLED in webpage:
+            self.to_screen('%s: Successfully enrolled in' % course_id)
+        elif self._ALREADY_ENROLLED in webpage:
+            self.to_screen('%s: Already enrolled in' % course_id)
+
+    def _download_lecture(self, course_id, lecture_id):
+        return self._download_json(
+            'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?%s' % (
+                course_id, lecture_id, compat_urllib_parse.urlencode({
+                    'video_only': '',
+                    'auto_play': '',
+                    'fields[lecture]': 'title,description,asset',
+                    'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data',
+                    'instructorPreviewMode': 'False',
+                })),
+            lecture_id, 'Downloading lecture JSON')
+
     def _handle_error(self, response):
         if not isinstance(response, dict):
             return
@@ -54,6 +80,7 @@ class UdemyIE(InfoExtractor):
                 headers['X-Udemy-Client-Id'] = cookie.value
             elif cookie.name == 'access_token':
                 headers['X-Udemy-Bearer-Token'] = cookie.value
+                headers['X-Udemy-Authorization'] = 'Bearer %s' % cookie.value
 
         if isinstance(url_or_request, compat_urllib_request.Request):
             for header, value in headers.items():
@@ -71,7 +98,7 @@ class UdemyIE(InfoExtractor):
     def _login(self):
         (username, password) = self._get_login_info()
         if username is None:
-            self.raise_login_required('Udemy account is required')
+            return
 
         login_popup = self._download_webpage(
             self._LOGIN_URL, None, 'Downloading login popup')
@@ -109,44 +136,76 @@ class UdemyIE(InfoExtractor):
     def _real_extract(self, url):
         lecture_id = self._match_id(url)
 
-        lecture = self._download_json(
-            'https://www.udemy.com/api-1.1/lectures/%s' % lecture_id,
-            lecture_id, 'Downloading lecture JSON')
+        webpage = self._download_webpage(url, lecture_id)
+
+        course_id = self._search_regex(
+            r'data-course-id=["\'](\d+)', webpage, 'course id')
+
+        try:
+            lecture = self._download_lecture(course_id, lecture_id)
+        except ExtractorError as e:
+            # A 403 could mean we are not enrolled in the course: enroll, then retry
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+                self._enroll_course(webpage, course_id)
+                lecture = self._download_lecture(course_id, lecture_id)
+            else:
+                raise
+
+        title = lecture['title']
+        description = lecture.get('description')
 
-        asset_type = lecture.get('assetType') or lecture.get('asset_type')
+        asset = lecture['asset']
+
+        asset_type = asset.get('assetType') or asset.get('asset_type')
         if asset_type != 'Video':
             raise ExtractorError(
                 'Lecture %s is not a video' % lecture_id, expected=True)
 
-        asset = lecture['asset']
-
         stream_url = asset.get('streamUrl') or asset.get('stream_url')
-        mobj = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', stream_url)
-        if mobj:
-            return self.url_result(mobj.group(1), 'Youtube')
+        if stream_url:
+            youtube_url = self._search_regex(
+                r'(https?://www\.youtube\.com/watch\?v=.*)', stream_url, 'youtube URL', default=None)
+            if youtube_url:
+                return self.url_result(youtube_url, 'Youtube')
 
         video_id = asset['id']
         thumbnail = asset.get('thumbnailUrl') or asset.get('thumbnail_url')
-        duration = asset['data']['duration']
-
-        download_url = asset.get('downloadUrl') or asset.get('download_url')
-
-        video = download_url.get('Video') or download_url.get('video')
-        video_480p = download_url.get('Video480p') or download_url.get('video_480p')
-
-        formats = [
-            {
-                'url': video_480p[0],
-                'format_id': '360p',
-            },
-            {
-                'url': video[0],
-                'format_id': '720p',
-            },
-        ]
-
-        title = lecture['title']
-        description = lecture['description']
+        duration = float_or_none(asset.get('data', {}).get('duration'))
+        outputs = asset.get('data', {}).get('outputs', {})
+
+        formats = []
+        for format_ in asset.get('download_urls', {}).get('Video', []):
+            video_url = format_.get('file')
+            if not video_url:
+                continue
+            format_id = format_.get('label')
+            f = {
+                'url': format_['file'],
+                'height': int_or_none(format_id),
+            }
+            if format_id:
+                # Some videos contain additional metadata (e.g.
+                # https://www.udemy.com/ios9-swift/learn/#/lecture/3383208)
+                output = outputs.get(format_id)
+                if isinstance(output, dict):
+                    f.update({
+                        'format_id': '%sp' % (output.get('label') or format_id),
+                        'width': int_or_none(output.get('width')),
+                        'height': int_or_none(output.get('height')),
+                        'vbr': int_or_none(output.get('video_bitrate_in_kbps')),
+                        'vcodec': output.get('video_codec'),
+                        'fps': int_or_none(output.get('frame_rate')),
+                        'abr': int_or_none(output.get('audio_bitrate_in_kbps')),
+                        'acodec': output.get('audio_codec'),
+                        'asr': int_or_none(output.get('audio_sample_rate')),
+                        'tbr': int_or_none(output.get('total_bitrate_in_kbps')),
+                        'filesize': int_or_none(output.get('file_size_in_bytes')),
+                    })
+                else:
+                    f['format_id'] = '%sp' % format_id
+            formats.append(f)
+
+        self._sort_formats(formats)
 
         return {
             'id': video_id,
@@ -160,9 +219,7 @@ class UdemyIE(InfoExtractor):
 
 class UdemyCourseIE(UdemyIE):
     IE_NAME = 'udemy:course'
-    _VALID_URL = r'https?://www\.udemy\.com/(?P<coursepath>[\da-z-]+)'
-    _SUCCESSFULLY_ENROLLED = '>You have enrolled in this course!<'
-    _ALREADY_ENROLLED = '>You are already taking this course.<'
+    _VALID_URL = r'https?://www\.udemy\.com/(?P<id>[\da-z-]+)'
     _TESTS = []
 
     @classmethod
@@ -170,24 +227,18 @@ class UdemyCourseIE(UdemyIE):
         return False if UdemyIE.suitable(url) else super(UdemyCourseIE, cls).suitable(url)
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        course_path = mobj.group('coursepath')
+        course_path = self._match_id(url)
+
+        webpage = self._download_webpage(url, course_path)
 
         response = self._download_json(
             'https://www.udemy.com/api-1.1/courses/%s' % course_path,
             course_path, 'Downloading course JSON')
 
-        course_id = int(response['id'])
-        course_title = response['title']
+        course_id = response['id']
+        course_title = response.get('title')
 
-        webpage = self._download_webpage(
-            'https://www.udemy.com/course/subscribe/?courseId=%s' % course_id,
-            course_id, 'Enrolling in the course')
-
-        if self._SUCCESSFULLY_ENROLLED in webpage:
-            self.to_screen('%s: Successfully enrolled in' % course_id)
-        elif self._ALREADY_ENROLLED in webpage:
-            self.to_screen('%s: Already enrolled in' % course_id)
+        self._enroll_course(webpage, course_id)
 
         response = self._download_json(
             'https://www.udemy.com/api-1.1/courses/%s/curriculum' % course_id,
index be0a2780f5299571ee134ac47e17eda6476673ac..357594a11debd4e4946e6fd29b0f2b4d4fb241b9 100644 (file)
@@ -3,11 +3,14 @@ from __future__ import unicode_literals
 
 from .common import InfoExtractor
 from ..compat import compat_urllib_parse
-from ..utils import sanitized_Request
+from ..utils import (
+    ExtractorError,
+    sanitized_Request,
+)
 
 
 class VodlockerIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?vodlocker\.com/(?P<id>[0-9a-zA-Z]+)(?:\..*?)?'
+    _VALID_URL = r'https?://(?:www\.)?vodlocker\.com/(?:embed-)?(?P<id>[0-9a-zA-Z]+)(?:\..*?)?'
 
     _TESTS = [{
         'url': 'http://vodlocker.com/e8wvyzz4sl42',
@@ -24,6 +27,12 @@ class VodlockerIE(InfoExtractor):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
+        if any(p in webpage for p in (
+                '>THIS FILE WAS DELETED<',
+                '>File Not Found<',
+                'The file you were looking for could not be found, sorry for any inconvenience.<')):
+            raise ExtractorError('Video %s does not exist' % video_id, expected=True)
+
         fields = self._hidden_inputs(webpage)
 
         if fields['op'] == 'download1':
index cfe9eed551088dfac9d2690cac0de03c375c214e..9b39505ba71cf09880e6d8fcec1910b8806204c0 100644 (file)
@@ -258,7 +258,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                              |(?:                                             # or the v= param in all its forms
                                  (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
-                                 (?:.*?&)??                                   # any other preceding param (like /?s=tuff&v=xxxx)
+                                 (?:.*?[&;])??                                # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
                                  v=
                              )
                          ))
@@ -346,6 +346,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
         '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
         '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+        # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
         '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
         '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
         '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
@@ -714,6 +715,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
             'only_matching': True,
         },
+        {
+            # Video with yt:stretch=17:0
+            'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
+            'info_dict': {
+                'id': 'Q39EVAstoRM',
+                'ext': 'mp4',
+                'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
+                'description': 'md5:ee18a25c350637c8faff806845bddee9',
+                'upload_date': '20151107',
+                'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
+                'uploader': 'CH GAMER DROID',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
+            'only_matching': True,
+        }
     ]
 
     def __init__(self, *args, **kwargs):
@@ -1459,6 +1480,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             manifest_url = video_info['hlsvp'][0]
             url_map = self._extract_from_m3u8(manifest_url, video_id)
             formats = _map_to_format_list(url_map)
+            # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
+            for a_format in formats:
+                a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
         else:
             raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
 
@@ -1496,10 +1520,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
             video_webpage)
         if stretched_m:
-            ratio = float(stretched_m.group('w')) / float(stretched_m.group('h'))
-            for f in formats:
-                if f.get('vcodec') != 'none':
-                    f['stretched_ratio'] = ratio
+            w = float(stretched_m.group('w'))
+            h = float(stretched_m.group('h'))
+            # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
+            # We will only process correct ratios.
+            if w > 0 and h > 0:
+                ratio = w / h
+                for f in formats:
+                    if f.get('vcodec') != 'none':
+                        f['stretched_ratio'] = ratio
 
         self._sort_formats(formats)
 
@@ -1538,7 +1567,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor, YoutubePlaylistBaseInfoExtract
                         youtube\.com/
                         (?:
                            (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
-                           \? (?:.*?&)*? (?:p|a|list)=
+                           \? (?:.*?[&;])*? (?:p|a|list)=
                         |  p/
                         )
                         (
index 359e8d30004e458bcbf3353f78b5cd57fedce785..c46e136bffd6f7a54b4d3a9e0883f42e63a11591 100644 (file)
@@ -338,7 +338,7 @@ def parseOpts(overrideArguments=None):
     video_format.add_option(
         '-F', '--list-formats',
         action='store_true', dest='listformats',
-        help='List all available formats of specified videos')
+        help='List all available formats of requested videos')
     video_format.add_option(
         '--youtube-include-dash-manifest',
         action='store_true', dest='youtube_include_dash_manifest', default=True,
index d7b737e21639679fe665d22e6ca4089f5c791618..d0606b4bcd3d4706912f753441608dff721d7699 100644 (file)
@@ -663,6 +663,16 @@ def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
     return hc
 
 
+def handle_youtubedl_headers(headers):
+    # Without the internal marker header the mapping is returned unchanged.
+    if 'Youtubedl-no-compression' not in headers:
+        return headers
+
+    # Copy without the marker itself and without Accept-Encoding (any letter case).
+    return dict((name, value) for name, value in headers.items()
+                if name != 'Youtubedl-no-compression' and name.lower() != 'accept-encoding')
+
+
 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
     """Handler for HTTP requests and responses.
 
@@ -670,7 +680,7 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
     the standard headers to every HTTP request and handles gzipped and
     deflated responses from web servers. If compression is to be avoided in
     a particular request, the original request in the program code only has
-    to include the HTTP header "Youtubedl-No-Compression", which will be
+    to include the HTTP header "Youtubedl-no-compression", which will be
     removed before making the real request.
 
     Part of this code was copied from:
@@ -731,10 +741,8 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
             # The dict keys are capitalized because of this bug by urllib
             if h.capitalize() not in req.headers:
                 req.add_header(h, v)
-        if 'Youtubedl-no-compression' in req.headers:
-            if 'Accept-encoding' in req.headers:
-                del req.headers['Accept-encoding']
-            del req.headers['Youtubedl-no-compression']
+
+        req.headers = handle_youtubedl_headers(req.headers)
 
         if sys.version_info < (2, 7) and '#' in req.get_full_url():
             # Python 2.6 is brain-dead when it comes to fragments