Merge branch 'playtvak' of https://github.com/oskar456/youtube-dl into oskar456-playtvak
author Sergey M․ <dstftw@gmail.com>
Sat, 15 Aug 2015 19:06:38 +0000 (01:06 +0600)
committer Sergey M․ <dstftw@gmail.com>
Sat, 15 Aug 2015 19:06:38 +0000 (01:06 +0600)
AUTHORS
README.md
youtube_dl/extractor/__init__.py
youtube_dl/extractor/common.py
youtube_dl/extractor/esri.py [new file with mode: 0644]
youtube_dl/extractor/lynda.py
youtube_dl/extractor/moniker.py
youtube_dl/extractor/shahid.py [new file with mode: 0644]
youtube_dl/extractor/youtube.py

diff --git a/AUTHORS b/AUTHORS
index ded9e87d27e96312f632b20a9886a42080fe3ec1..d1693224ec850681691377a599f8769bef1448a7 100644 (file)
--- a/AUTHORS
+++ b/AUTHORS
@@ -139,3 +139,4 @@ slangangular
 Behrouz Abbasi
 ngld
 nyuszika7h
+Shaun Walbridge
index 15baf75ceadb174e49ed1a871bd2d577f7c32722..542a7c26a1ce29cc7d5318d78e56dc59ceb206c1 100644 (file)
--- a/README.md
+++ b/README.md
@@ -272,6 +272,7 @@ The `-o` option allows users to indicate a template for the output file names. T
  - `autonumber`: The sequence will be replaced by a five-digit number that will be increased with each download, starting at zero.
  - `playlist`: The name or the id of the playlist that contains the video.
  - `playlist_index`: The index of the video in the playlist, a five-digit number.
+ - `format_id`: The sequence will be replaced by the format code specified by `--format`.
 
 The current default template is `%(title)s-%(id)s.%(ext)s`.
 
@@ -544,7 +545,7 @@ If you want to add support for a new site, you can follow this quick list (assum
     ```
 5. Add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py).
 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will be then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc.
-7. Have a look at [`youtube_dl/common/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L38). Add tests and code for as many as you want.
+7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L62-L200). Add tests and code for as many as you want.
 8. If you can, check the code with [flake8](https://pypi.python.org/pypi/flake8).
 9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this:
 
@@ -572,7 +573,7 @@ with youtube_dl.YoutubeDL(ydl_opts) as ydl:
     ydl.download(['http://www.youtube.com/watch?v=BaW_jenozKc'])
 ```
 
-Most likely, you'll want to use various options. For a list of what can be done, have a look at [youtube_dl/YoutubeDL.py](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/YoutubeDL.py#L69). For a start, if you want to intercept youtube-dl's output, set a `logger` object.
+Most likely, you'll want to use various options. For a list of what can be done, have a look at [youtube_dl/YoutubeDL.py](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/YoutubeDL.py#L117-L265). For a start, if you want to intercept youtube-dl's output, set a `logger` object.
 
 Here's a more complete example of a program that outputs only errors (and a short message after the download is finished), and downloads/converts the video to an mp3 file:
 
index 5307240f856d6f47e0b3ef454e3fa6d4af879b23..fa9acc92374b14bf8d3d5671f56f821d0129a65f 100644 (file)
@@ -158,6 +158,7 @@ from .eporner import EpornerIE
 from .eroprofile import EroProfileIE
 from .escapist import EscapistIE
 from .espn import ESPNIE
+from .esri import EsriVideoIE
 from .everyonesmixtape import EveryonesMixtapeIE
 from .exfm import ExfmIE
 from .expotv import ExpoTVIE
@@ -523,6 +524,7 @@ from .senateisvp import SenateISVPIE
 from .servingsys import ServingSysIE
 from .sexu import SexuIE
 from .sexykarma import SexyKarmaIE
+from .shahid import ShahidIE
 from .shared import SharedIE
 from .sharesix import ShareSixIE
 from .sina import SinaIE
index 5982055be888a87ea08f7d388d92a0ba4cc446ab..65835d257197361a7ea3e5159b37de6f03ec62ad 100644 (file)
@@ -15,6 +15,7 @@ import xml.etree.ElementTree
 from ..compat import (
     compat_cookiejar,
     compat_cookies,
+    compat_getpass,
     compat_HTTPError,
     compat_http_client,
     compat_urllib_error,
@@ -610,7 +611,7 @@ class InfoExtractor(object):
 
         return (username, password)
 
-    def _get_tfa_info(self):
+    def _get_tfa_info(self, note='two-factor verification code'):
         """
         Get the two-factor authentication info
         TODO - asking the user will be required for sms/phone verify
@@ -624,7 +625,7 @@ class InfoExtractor(object):
         if downloader_params.get('twofactor', None) is not None:
             return downloader_params['twofactor']
 
-        return None
+        return compat_getpass('Type %s and press [Return]: ' % note)
 
     # Helper functions for extracting OpenGraph info
     @staticmethod
@@ -640,7 +641,7 @@ class InfoExtractor(object):
     @staticmethod
     def _meta_regex(prop):
         return r'''(?isx)<meta
-                    (?=[^>]+(?:itemprop|name|property|id)=(["\']?)%s\1)
+                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 
     def _og_search_property(self, prop, html, name=None, **kargs):
@@ -724,16 +725,18 @@ class InfoExtractor(object):
 
     @staticmethod
     def _hidden_inputs(html):
-        return dict([
-            (input.group('name'), input.group('value')) for input in re.finditer(
-                r'''(?x)
-                    <input\s+
-                        type=(?P<q_hidden>["\'])hidden(?P=q_hidden)\s+
-                        name=(?P<q_name>["\'])(?P<name>.+?)(?P=q_name)\s+
-                        (?:id=(?P<q_id>["\']).+?(?P=q_id)\s+)?
-                        value=(?P<q_value>["\'])(?P<value>.*?)(?P=q_value)
-                ''', html)
-        ])
+        hidden_inputs = {}
+        for input in re.findall(r'<input([^>]+)>', html):
+            if not re.search(r'type=(["\'])hidden\1', input):
+                continue
+            name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
+            if not name:
+                continue
+            value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
+            if not value:
+                continue
+            hidden_inputs[name.group('value')] = value.group('value')
+        return hidden_inputs
 
     def _form_hidden_inputs(self, form_id, html):
         form = self._search_regex(
diff --git a/youtube_dl/extractor/esri.py b/youtube_dl/extractor/esri.py
new file mode 100644 (file)
index 0000000..bf5d201
--- /dev/null
@@ -0,0 +1,74 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+    int_or_none,
+    parse_filesize,
+    unified_strdate,
+)
+
+
+class EsriVideoIE(InfoExtractor):
+    _VALID_URL = r'https?://video\.esri\.com/watch/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'https://video.esri.com/watch/1124/arcgis-online-_dash_-developing-applications',
+        'md5': 'd4aaf1408b221f1b38227a9bbaeb95bc',
+        'info_dict': {
+            'id': '1124',
+            'ext': 'mp4',
+            'title': 'ArcGIS Online - Developing Applications',
+            'description': 'Jeremy Bartley demonstrates how to develop applications with ArcGIS Online.',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 185,
+            'upload_date': '20120419',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        formats = []
+        for width, height, content in re.findall(
+                r'(?s)<li><strong>(\d+)x(\d+):</strong>(.+?)</li>', webpage):
+            for video_url, ext, filesize in re.findall(
+                    r'<a[^>]+href="([^"]+)">([^<]+)&nbsp;\(([^<]+)\)</a>', content):
+                formats.append({
+                    'url': compat_urlparse.urljoin(url, video_url),
+                    'ext': ext.lower(),
+                    'format_id': '%s-%s' % (ext.lower(), height),
+                    'width': int(width),
+                    'height': int(height),
+                    'filesize_approx': parse_filesize(filesize),
+                })
+        self._sort_formats(formats)
+
+        title = self._html_search_meta('title', webpage, 'title')
+        description = self._html_search_meta(
+            'description', webpage, 'description', fatal=False)
+
+        thumbnail = self._html_search_meta('thumbnail', webpage, 'thumbnail', fatal=False)
+        if thumbnail:
+            thumbnail = re.sub(r'_[st]\.jpg$', '_x.jpg', thumbnail)
+
+        duration = int_or_none(self._search_regex(
+            [r'var\s+videoSeconds\s*=\s*(\d+)', r"'duration'\s*:\s*(\d+)"],
+            webpage, 'duration', fatal=False))
+
+        upload_date = unified_strdate(self._html_search_meta(
+            'last-modified', webpage, 'upload date', fatal=None))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'upload_date': upload_date,
+            'formats': formats
+        }
index deead220af21bd5e37deab93446dc9e9cac59ba8..5b9157ed43efb0169baac327ce37ec7d46eb69f5 100644 (file)
@@ -11,6 +11,7 @@ from ..compat import (
 )
 from ..utils import (
     ExtractorError,
+    clean_html,
     int_or_none,
 )
 
@@ -70,6 +71,15 @@ class LyndaBaseIE(InfoExtractor):
                     'Confirming log in and log out from another device')
 
         if all(not re.search(p, login_page) for p in ('isLoggedIn\s*:\s*true', r'logout\.aspx', r'>Log out<')):
+            if 'login error' in login_page:
+                mobj = re.search(
+                    r'(?s)<h1[^>]+class="topmost">(?P<title>[^<]+)</h1>\s*<div>(?P<description>.+?)</div>',
+                    login_page)
+                if mobj:
+                    raise ExtractorError(
+                        'lynda returned error: %s - %s'
+                        % (mobj.group('title'), clean_html(mobj.group('description'))),
+                        expected=True)
             raise ExtractorError('Unable to log in')
 
 
index 88dcd4f737544356091220d53078bc1c2e222d76..69e4bcd1a28f7be589fd26ca448740bd16794cfc 100644 (file)
@@ -9,7 +9,10 @@ from ..compat import (
     compat_urllib_parse,
     compat_urllib_request,
 )
-from ..utils import ExtractorError
+from ..utils import (
+    ExtractorError,
+    remove_start,
+)
 
 
 class MonikerIE(InfoExtractor):
@@ -24,6 +27,14 @@ class MonikerIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'youtube-dl test video',
         },
+    }, {
+        'url': 'http://allmyvideos.net/embed-jih3nce3x6wn',
+        'md5': '710883dee1bfc370ecf9fa6a89307c88',
+        'info_dict': {
+            'id': 'jih3nce3x6wn',
+            'ext': 'mp4',
+            'title': 'youtube-dl test video',
+        },
     }, {
         'url': 'http://vidspot.net/l2ngsmhs8ci5',
         'md5': '710883dee1bfc370ecf9fa6a89307c88',
@@ -38,7 +49,10 @@ class MonikerIE(InfoExtractor):
     }]
 
     def _real_extract(self, url):
-        video_id = self._match_id(url)
+        orig_video_id = self._match_id(url)
+        video_id = remove_start(orig_video_id, 'embed-')
+        url = url.replace(orig_video_id, video_id)
+        assert re.match(self._VALID_URL, url) is not None
         orig_webpage = self._download_webpage(url, video_id)
 
         if '>File Not Found<' in orig_webpage:
diff --git a/youtube_dl/extractor/shahid.py b/youtube_dl/extractor/shahid.py
new file mode 100644 (file)
index 0000000..6e9903d
--- /dev/null
@@ -0,0 +1,107 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    parse_iso8601,
+)
+
+
+class ShahidIE(InfoExtractor):
+    _VALID_URL = r'https?://shahid\.mbc\.net/ar/episode/(?P<id>\d+)/?'
+    _TESTS = [{
+        'url': 'https://shahid.mbc.net/ar/episode/90574/%D8%A7%D9%84%D9%85%D9%84%D9%83-%D8%B9%D8%A8%D8%AF%D8%A7%D9%84%D9%84%D9%87-%D8%A7%D9%84%D8%A5%D9%86%D8%B3%D8%A7%D9%86-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-3.html',
+        'info_dict': {
+            'id': '90574',
+            'ext': 'm3u8',
+            'title': 'الملك عبدالله الإنسان الموسم 1 كليب 3',
+            'description': 'الفيلم الوثائقي - الملك عبد الله الإنسان',
+            'duration': 2972,
+            'timestamp': 1422057420,
+            'upload_date': '20150123',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        }
+    }, {
+        # shahid plus subscriber only
+        'url': 'https://shahid.mbc.net/ar/episode/90511/%D9%85%D8%B1%D8%A7%D9%8A%D8%A7-2011-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1.html',
+        'only_matching': True
+    }]
+
+    def _handle_error(self, response):
+        if not isinstance(response, dict):
+            return
+        error = response.get('error')
+        if error:
+            raise ExtractorError(
+                '%s returned error: %s' % (self.IE_NAME, '\n'.join(error.values())),
+                expected=True)
+
+    def _download_json(self, url, video_id, note='Downloading JSON metadata'):
+        response = super(ShahidIE, self)._download_json(url, video_id, note)['data']
+        self._handle_error(response)
+        return response
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        api_vars = {
+            'id': video_id,
+            'type': 'player',
+            'url': 'http://api.shahid.net/api/v1_1',
+            'playerType': 'episode',
+        }
+
+        flashvars = self._search_regex(
+            r'var\s+flashvars\s*=\s*({[^}]+})', webpage, 'flashvars', default=None)
+        if flashvars:
+            for key in api_vars.keys():
+                value = self._search_regex(
+                    r'\b%s\s*:\s*(?P<q>["\'])(?P<value>.+?)(?P=q)' % key,
+                    flashvars, 'type', default=None, group='value')
+                if value:
+                    api_vars[key] = value
+
+        player = self._download_json(
+            'https://shahid.mbc.net/arContent/getPlayerContent-param-.id-%s.type-%s.html'
+            % (video_id, api_vars['type']), video_id, 'Downloading player JSON')
+
+        formats = self._extract_m3u8_formats(player['url'], video_id, 'mp4')
+
+        video = self._download_json(
+            '%s/%s/%s?%s' % (
+                api_vars['url'], api_vars['playerType'], api_vars['id'],
+                compat_urllib_parse.urlencode({
+                    'apiKey': 'sh@hid0nlin3',
+                    'hash': 'b2wMCTHpSmyxGqQjJFOycRmLSex+BpTK/ooxy6vHaqs=',
+                }).encode('utf-8')),
+            video_id, 'Downloading video JSON')
+
+        video = video[api_vars['playerType']]
+
+        title = video['title']
+        description = video.get('description')
+        thumbnail = video.get('thumbnailUrl')
+        duration = int_or_none(video.get('duration'))
+        timestamp = parse_iso8601(video.get('referenceDate'))
+        categories = [
+            category['name']
+            for category in video.get('genres', []) if 'name' in category]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'timestamp': timestamp,
+            'categories': categories,
+            'formats': formats,
+        }
index facd837ad3fb0a239d2736d234ba6d429a09dcdc..887c46d95dcef1febe45166e9dbd2a4c49083e73 100644 (file)
@@ -33,6 +33,7 @@ from ..utils import (
     int_or_none,
     orderedSet,
     parse_duration,
+    remove_start,
     smuggle_url,
     str_to_int,
     unescapeHTML,
@@ -46,7 +47,7 @@ from ..utils import (
 class YoutubeBaseInfoExtractor(InfoExtractor):
     """Provide base functions for Youtube extractors"""
     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
-    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
+    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
     _NETRC_MACHINE = 'youtube'
     # If True it will raise an error if no login info is provided
     _LOGIN_REQUIRED = False
@@ -128,40 +129,24 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
         # Two-Factor
         # TODO add SMS and phone call support - these require making a request and then prompting the user
 
-        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
-            tfa_code = self._get_tfa_info()
+        if re.search(r'(?i)<form[^>]* id="challenge"', login_results) is not None:
+            tfa_code = self._get_tfa_info('2-step verification code')
 
-            if tfa_code is None:
-                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
-                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
+            if not tfa_code:
+                self._downloader.report_warning(
+                    'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
+                    '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                 return False
 
-            # Unlike the first login form, secTok and timeStmp are both required for the TFA form
-
-            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
-            if match is None:
-                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
-            secTok = match.group(1)
-            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
-            if match is None:
-                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
-            timeStmp = match.group(1)
-
-            tfa_form_strs = {
-                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
-                'smsToken': '',
-                'smsUserPin': tfa_code,
-                'smsVerifyPin': 'Verify',
-
-                'PersistentCookie': 'yes',
-                'checkConnection': '',
-                'checkedDomains': 'youtube',
-                'pstMsg': '1',
-                'secTok': secTok,
-                'timeStmp': timeStmp,
-                'service': 'youtube',
-                'hl': 'en_US',
-            }
+            tfa_code = remove_start(tfa_code, 'G-')
+
+            tfa_form_strs = self._form_hidden_inputs('challenge', login_results)
+
+            tfa_form_strs.update({
+                'Pin': tfa_code,
+                'TrustDevice': 'on',
+            })
+
             tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
             tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')
 
@@ -173,8 +158,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
             if tfa_results is False:
                 return False
 
-            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
-                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
+            if re.search(r'(?i)<form[^>]* id="challenge"', tfa_results) is not None:
+                self._downloader.report_warning('Two-factor code expired or invalid. Please try again, or use a one-use backup code instead.')
                 return False
             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                 self._downloader.report_warning('unable to log in - did the page structure change?')