Merge pull request #5961 from dstftw/force-generic-extractor
authorSergey M. <dstftw@gmail.com>
Wed, 24 Jun 2015 14:10:45 +0000 (19:10 +0500)
committerSergey M. <dstftw@gmail.com>
Wed, 24 Jun 2015 14:10:45 +0000 (19:10 +0500)
Add --force-generic-extractor

41 files changed:
AUTHORS
README.md
docs/supportedsites.md
youtube_dl/YoutubeDL.py
youtube_dl/extractor/__init__.py
youtube_dl/extractor/adobetv.py
youtube_dl/extractor/bbccouk.py
youtube_dl/extractor/brightcove.py
youtube_dl/extractor/dailymotion.py
youtube_dl/extractor/dramafever.py [new file with mode: 0644]
youtube_dl/extractor/drbonanza.py
youtube_dl/extractor/faz.py
youtube_dl/extractor/francetv.py
youtube_dl/extractor/generic.py
youtube_dl/extractor/imdb.py
youtube_dl/extractor/lifenews.py
youtube_dl/extractor/liveleak.py
youtube_dl/extractor/niconico.py
youtube_dl/extractor/noco.py
youtube_dl/extractor/pinkbike.py [new file with mode: 0644]
youtube_dl/extractor/pornhub.py
youtube_dl/extractor/prosiebensat1.py
youtube_dl/extractor/safari.py
youtube_dl/extractor/sohu.py
youtube_dl/extractor/spankwire.py
youtube_dl/extractor/tumblr.py
youtube_dl/extractor/tvc.py
youtube_dl/extractor/tvplay.py
youtube_dl/extractor/vbox7.py
youtube_dl/extractor/viki.py
youtube_dl/extractor/vimeo.py
youtube_dl/extractor/vk.py
youtube_dl/extractor/xhamster.py
youtube_dl/extractor/xvideos.py
youtube_dl/extractor/youku.py
youtube_dl/extractor/youtube.py
youtube_dl/options.py
youtube_dl/postprocessor/embedthumbnail.py
youtube_dl/postprocessor/ffmpeg.py
youtube_dl/utils.py
youtube_dl/version.py

diff --git a/AUTHORS b/AUTHORS
index bf2a25cb8a804f9d0d500816c126480eb5083b58..889d599a2d113b83dc505a664207d7258398fda6 100644 (file)
--- a/AUTHORS
+++ b/AUTHORS
@@ -127,3 +127,4 @@ Julian Richen
 Ping O.
 Mister Hat
 Peter Ding
+jackyzy823
index f3d83c89fafbdff0dc6b22b6d207a28428b59f96..5f3a08f5a9839e2438498cc32d284b341c4fb7f8 100644 (file)
--- a/README.md
+++ b/README.md
@@ -52,7 +52,7 @@ which means you can modify it, redistribute it or use it however you like.
     -i, --ignore-errors              Continue on download errors, for example to skip unavailable videos in a playlist
     --abort-on-error                 Abort downloading of further videos (in the playlist or the command line) if an error occurs
     --dump-user-agent                Display the current browser identification
-    --list-extractors                List all supported extractors and the URLs they would handle
+    --list-extractors                List all supported extractors
     --extractor-descriptions         Output descriptions of all supported extractors
     --default-search PREFIX          Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dl "large apple".
                                      Use the value "auto" to let youtube-dl guess ("auto_warning" to emit a warning when guessing). "error" just throws an error. The
@@ -223,7 +223,7 @@ which means you can modify it, redistribute it or use it however you like.
                                      parameters replace existing values. Additional templates: %(album)s, %(artist)s. Example: --metadata-from-title "%(artist)s -
                                      %(title)s" matches a title like "Coldplay - Paradise"
     --xattrs                         Write metadata to the video file's xattrs (using dublin core and xdg standards)
-    --fixup POLICY                   Automatically correct known faults of the file. One of never (do nothing), warn (only emit a warning), detect_or_warn(the default;
+    --fixup POLICY                   Automatically correct known faults of the file. One of never (do nothing), warn (only emit a warning), detect_or_warn (the default;
                                      fix file if we can, warn otherwise)
     --prefer-avconv                  Prefer avconv over ffmpeg for running the postprocessors (default)
     --prefer-ffmpeg                  Prefer ffmpeg over avconv for running the postprocessors
index d147b53fe60a5bba6f9d33d1fba684014513f83b..220e52b988ace1314ec866f59a8ef34562f1b583 100644 (file)
  - **divxstage**: DivxStage
  - **Dotsub**
  - **DouyuTV**
+ - **dramafever**
+ - **dramafever:series**
  - **DRBonanza**
  - **Dropbox**
  - **DrTuber**
  - **fernsehkritik.tv**
  - **fernsehkritik.tv:postecke**
  - **Firstpost**
+ - **FiveTV**
  - **Flickr**
  - **Folketinget**: Folketinget (ft.dk; Danish parliament)
  - **FootyRoom**
  - **instagram:user**: Instagram user profile
  - **InternetVideoArchive**
  - **IPrima**
+ - **iqiyi**
  - **ivi**: ivi.ru
  - **ivi:compilation**: ivi.ru compilations
  - **Izlesene**
  - **rutube:movie**: Rutube movies
  - **rutube:person**: Rutube person videos
  - **RUTV**: RUTV.RU
+ - **Ruutu**
  - **safari**: safaribooksonline.com online video
  - **safari:course**: safaribooksonline.com online courses
  - **Sandia**: Sandia National Laboratories
  - **TV2**
  - **TV2Article**
  - **TV4**: tv4.se and tv4play.se
+ - **TVC**
+ - **TVCArticle**
  - **tvigle**: Интернет-телевидение Tvigle.ru
  - **tvp.pl**
  - **tvp.pl:Series**
index a7d3a1c017fb6230639b522c1b36c356cfcaf93e..ef0f71bad45d6057dc99c1ce968629a0e357e57b 100755 (executable)
@@ -119,7 +119,7 @@ class YoutubeDL(object):
 
     username:          Username for authentication purposes.
     password:          Password for authentication purposes.
-    videopassword:     Password for acces a video.
+    videopassword:     Password for accessing a video.
     usenetrc:          Use netrc for authentication instead.
     verbose:           Print additional info to stdout.
     quiet:             Do not print messages to stdout.
@@ -1037,12 +1037,6 @@ class YoutubeDL(object):
             info_dict['id'], info_dict.get('subtitles'),
             info_dict.get('automatic_captions'))
 
-        # This extractors handle format selection themselves
-        if info_dict['extractor'] in ['Youku']:
-            if download:
-                self.process_info(info_dict)
-            return info_dict
-
         # We now pick which formats have to be downloaded
         if info_dict.get('formats') is None:
             # There's only one format available
index 3bc62e9d5b222c20ecbc6239f6f3f05199bc4eba..dc1a302e69c13f00d67230b2920ce50106ea2629 100644 (file)
@@ -4,7 +4,10 @@ from .abc import ABCIE
 from .abc7news import Abc7NewsIE
 from .academicearth import AcademicEarthCourseIE
 from .addanime import AddAnimeIE
-from .adobetv import AdobeTVIE
+from .adobetv import (
+    AdobeTVIE,
+    AdobeTVVideoIE,
+)
 from .adultswim import AdultSwimIE
 from .aftenposten import AftenpostenIE
 from .aftonbladet import AftonbladetIE
@@ -103,6 +106,7 @@ from .dailymotion import (
     DailymotionIE,
     DailymotionPlaylistIE,
     DailymotionUserIE,
+    DailymotionCloudIE,
 )
 from .daum import DaumIE
 from .dbtv import DBTVIE
@@ -112,6 +116,10 @@ from .dfb import DFBIE
 from .dhm import DHMIE
 from .dotsub import DotsubIE
 from .douyutv import DouyuTVIE
+from .dramafever import (
+    DramaFeverIE,
+    DramaFeverSeriesIE,
+)
 from .dreisat import DreiSatIE
 from .drbonanza import DRBonanzaIE
 from .drtuber import DrTuberIE
@@ -397,6 +405,7 @@ from .pbs import PBSIE
 from .philharmoniedeparis import PhilharmonieDeParisIE
 from .phoenix import PhoenixIE
 from .photobucket import PhotobucketIE
+from .pinkbike import PinkbikeIE
 from .planetaplay import PlanetaPlayIE
 from .pladform import PladformIE
 from .played import PlayedIE
@@ -692,7 +701,10 @@ from .wrzuta import WrzutaIE
 from .wsj import WSJIE
 from .xbef import XBefIE
 from .xboxclips import XboxClipsIE
-from .xhamster import XHamsterIE
+from .xhamster import (
+    XHamsterIE,
+    XHamsterEmbedIE,
+)
 from .xminus import XMinusIE
 from .xnxx import XNXXIE
 from .xstream import XstreamIE
index 97d12856092975a094ec18a5fd7ecafef39c255a..5e43adc51f98c2f22e728c49150b84ae64f704e3 100644 (file)
@@ -5,6 +5,8 @@ from ..utils import (
     parse_duration,
     unified_strdate,
     str_to_int,
+    float_or_none,
+    ISO639Utils,
 )
 
 
@@ -69,3 +71,61 @@ class AdobeTVIE(InfoExtractor):
             'view_count': view_count,
             'formats': formats,
         }
+
+
+class AdobeTVVideoIE(InfoExtractor):
+    _VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P<id>\d+)'
+
+    _TEST = {
+        # From https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners
+        'url': 'https://video.tv.adobe.com/v/2456/',
+        'md5': '43662b577c018ad707a63766462b1e87',
+        'info_dict': {
+            'id': '2456',
+            'ext': 'mp4',
+            'title': 'New experience with Acrobat DC',
+            'description': 'New experience with Acrobat DC',
+            'duration': 248.667,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        player_params = self._parse_json(self._search_regex(
+            r'var\s+bridge\s*=\s*([^;]+);', webpage, 'player parameters'),
+            video_id)
+
+        formats = [{
+            'url': source['src'],
+            'width': source.get('width'),
+            'height': source.get('height'),
+            'tbr': source.get('bitrate'),
+        } for source in player_params['sources']]
+
+        # For both metadata and downloaded files the duration varies among
+        # formats. I just pick the max one
+        duration = max(filter(None, [
+            float_or_none(source.get('duration'), scale=1000)
+            for source in player_params['sources']]))
+
+        subtitles = {}
+        for translation in player_params.get('translations', []):
+            lang_id = translation.get('language_w3c') or ISO639Utils.long2short(translation['language_medium'])
+            if lang_id not in subtitles:
+                subtitles[lang_id] = []
+            subtitles[lang_id].append({
+                'url': translation['vttPath'],
+                'ext': 'vtt',
+            })
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': player_params['title'],
+            'description': self._og_search_description(webpage),
+            'duration': duration,
+            'subtitles': subtitles,
+        }
index 249bc6bbde85dc568796f094f421f989df664a1c..5825d286774fa003d343f41c85e07e35340cb428 100644 (file)
@@ -129,6 +129,20 @@ class BBCCoUkIE(InfoExtractor):
                 'skip_download': True,
             },
             'skip': 'geolocation',
+        }, {
+            'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
+            'info_dict': {
+                'id': 'b05zmgw1',
+                'ext': 'flv',
+                'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
+                'title': 'Royal Academy Summer Exhibition',
+                'duration': 3540,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+            'skip': 'geolocation',
         }, {
             'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
             'only_matching': True,
@@ -237,26 +251,11 @@ class BBCCoUkIE(InfoExtractor):
         for connection in self._extract_connections(media):
             captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
             lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
-            ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/2006/10/ttaf1}'))
-            srt = ''
-
-            def _extract_text(p):
-                if p.text is not None:
-                    stripped_text = p.text.strip()
-                    if stripped_text:
-                        return stripped_text
-                return ' '.join(span.text.strip() for span in p.findall('{http://www.w3.org/2006/10/ttaf1}span'))
-            for pos, p in enumerate(ps):
-                srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'), _extract_text(p))
             subtitles[lang] = [
                 {
                     'url': connection.get('href'),
                     'ext': 'ttml',
                 },
-                {
-                    'data': srt,
-                    'ext': 'srt',
-                },
             ]
         return subtitles
 
@@ -267,7 +266,7 @@ class BBCCoUkIE(InfoExtractor):
                 programme_id, 'Downloading media selection XML')
         except ExtractorError as ee:
             if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
-                media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().encode('utf-8'))
+                media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().decode('utf-8'))
             else:
                 raise
 
@@ -362,7 +361,7 @@ class BBCCoUkIE(InfoExtractor):
             formats, subtitles = self._download_media_selector(programme_id)
             title = self._og_search_title(webpage)
             description = self._search_regex(
-                r'<p class="medium-description">([^<]+)</p>',
+                r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
                 webpage, 'description', fatal=False)
         else:
             programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
index c1d4320e1ce73b6d9c5a9313eee61a7e05f9daf9..4721c22930f15cb51d0daaac294eeeca3a329092 100644 (file)
@@ -13,6 +13,7 @@ from ..compat import (
     compat_urllib_parse_urlparse,
     compat_urllib_request,
     compat_urlparse,
+    compat_xml_parse_error,
 )
 from ..utils import (
     determine_ext,
@@ -119,7 +120,7 @@ class BrightcoveIE(InfoExtractor):
 
         try:
             object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8'))
-        except xml.etree.ElementTree.ParseError:
+        except compat_xml_parse_error:
             return
 
         fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars')
@@ -156,6 +157,28 @@ class BrightcoveIE(InfoExtractor):
         linkBase = find_param('linkBaseURL')
         if linkBase is not None:
             params['linkBaseURL'] = linkBase
+        return cls._make_brightcove_url(params)
+
+    @classmethod
+    def _build_brighcove_url_from_js(cls, object_js):
+        # The layout of JS is as follows:
+        # customBC.createVideo = function (width, height, playerID, playerKey, videoPlayer, VideoRandomID) {
+        #   // build Brightcove <object /> XML
+        # }
+        m = re.search(
+            r'''(?x)customBC.\createVideo\(
+                .*?                                                  # skipping width and height
+                ["\'](?P<playerID>\d+)["\']\s*,\s*                   # playerID
+                ["\'](?P<playerKey>AQ[^"\']{48})[^"\']*["\']\s*,\s*  # playerKey begins with AQ and is 50 characters
+                                                                     # in length, however it's appended to itself
+                                                                     # in places, so truncate
+                ["\'](?P<videoID>\d+)["\']                           # @videoPlayer
+            ''', object_js)
+        if m:
+            return cls._make_brightcove_url(m.groupdict())
+
+    @classmethod
+    def _make_brightcove_url(cls, params):
         data = compat_urllib_parse.urlencode(params)
         return cls._FEDERATED_URL_TEMPLATE % data
 
@@ -188,7 +211,12 @@ class BrightcoveIE(InfoExtractor):
                 [^>]*?>\s*<param\s+name="movie"\s+value="https?://[^/]*brightcove\.com/
             ).+?>\s*</object>''',
             webpage)
-        return list(filter(None, [cls._build_brighcove_url(m) for m in matches]))
+        if matches:
+            return list(filter(None, [cls._build_brighcove_url(m) for m in matches]))
+
+        return list(filter(None, [
+            cls._build_brighcove_url_from_js(custom_bc)
+            for custom_bc in re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)]))
 
     def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
index 70aa4333c773c39c4e3be451f8b632045ac85b51..96f0ed9ad19756b3d380d8023af6c61bee1c18a9 100644 (file)
@@ -251,3 +251,45 @@ class DailymotionUserIE(DailymotionPlaylistIE):
             'title': full_user,
             'entries': self._extract_entries(user),
         }
+
+
+class DailymotionCloudIE(DailymotionBaseInfoExtractor):
+    _VALID_URL = r'http://api\.dmcloud\.net/embed/[^/]+/(?P<id>[^/?]+)'
+
+    _TEST = {
+        # From http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html
+        # Tested at FranceTvInfo_2
+        'url': 'http://api.dmcloud.net/embed/4e7343f894a6f677b10006b4/556e03339473995ee145930c?auth=1464865870-0-jyhsm84b-ead4c701fb750cf9367bf4447167a3db&autoplay=1',
+        'only_matching': True,
+    }
+
+    @classmethod
+    def _extract_dmcloud_url(self, webpage):
+        mobj = re.search(r'<iframe[^>]+src=[\'"](http://api\.dmcloud\.net/embed/[^/]+/[^\'"]+)[\'"]', webpage)
+        if mobj:
+            return mobj.group(1)
+
+        mobj = re.search(r'<input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=[\'"](http://api\.dmcloud\.net/embed/[^/]+/[^\'"]+)[\'"]', webpage)
+        if mobj:
+            return mobj.group(1)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        request = self._build_request(url)
+        webpage = self._download_webpage(request, video_id)
+
+        title = self._html_search_regex(r'<title>([^>]+)</title>', webpage, 'title')
+
+        video_info = self._parse_json(self._search_regex(
+            r'var\s+info\s*=\s*([^;]+);', webpage, 'video info'), video_id)
+
+        # TODO: parse ios_url, which is in fact a manifest
+        video_url = video_info['mp4_url']
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'thumbnail': video_info.get('thumbnail_url'),
+        }
diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py
new file mode 100644 (file)
index 0000000..ca41a3a
--- /dev/null
@@ -0,0 +1,197 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_HTTPError,
+    compat_urllib_parse,
+    compat_urllib_request,
+    compat_urlparse,
+)
+from ..utils import (
+    ExtractorError,
+    clean_html,
+    determine_ext,
+    int_or_none,
+    parse_iso8601,
+)
+
+
+class DramaFeverBaseIE(InfoExtractor):
+    _LOGIN_URL = 'https://www.dramafever.com/accounts/login/'
+    _NETRC_MACHINE = 'dramafever'
+
+    def _real_initialize(self):
+        self._login()
+
+    def _login(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            return
+
+        login_form = {
+            'username': username,
+            'password': password,
+        }
+
+        request = compat_urllib_request.Request(
+            self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
+        response = self._download_webpage(
+            request, None, 'Logging in as %s' % username)
+
+        if all(logout_pattern not in response
+               for logout_pattern in ['href="/accounts/logout/"', '>Log out<']):
+            error = self._html_search_regex(
+                r'(?s)class="hidden-xs prompt"[^>]*>(.+?)<',
+                response, 'error message', default=None)
+            if error:
+                raise ExtractorError('Unable to login: %s' % error, expected=True)
+            raise ExtractorError('Unable to log in')
+
+
+class DramaFeverIE(DramaFeverBaseIE):
+    IE_NAME = 'dramafever'
+    _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P<id>[0-9]+/[0-9]+)(?:/|$)'
+    _TEST = {
+        'url': 'http://www.dramafever.com/drama/4512/1/Cooking_with_Shin/',
+        'info_dict': {
+            'id': '4512.1',
+            'ext': 'flv',
+            'title': 'Cooking with Shin 4512.1',
+            'description': 'md5:a8eec7942e1664a6896fcd5e1287bfd0',
+            'thumbnail': 're:^https?://.*\.jpg',
+            'timestamp': 1404336058,
+            'upload_date': '20140702',
+            'duration': 343,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url).replace('/', '.')
+
+        try:
+            feed = self._download_json(
+                'http://www.dramafever.com/amp/episode/feed.json?guid=%s' % video_id,
+                video_id, 'Downloading episode JSON')['channel']['item']
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError):
+                raise ExtractorError(
+                    'Currently unavailable in your country.', expected=True)
+            raise
+
+        media_group = feed.get('media-group', {})
+
+        formats = []
+        for media_content in media_group['media-content']:
+            src = media_content.get('@attributes', {}).get('url')
+            if not src:
+                continue
+            ext = determine_ext(src)
+            if ext == 'f4m':
+                formats.extend(self._extract_f4m_formats(
+                    src, video_id, f4m_id='hds'))
+            elif ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    src, video_id, 'mp4', m3u8_id='hls'))
+            else:
+                formats.append({
+                    'url': src,
+                })
+        self._sort_formats(formats)
+
+        title = media_group.get('media-title')
+        description = media_group.get('media-description')
+        duration = int_or_none(media_group['media-content'][0].get('@attributes', {}).get('duration'))
+        thumbnail = self._proto_relative_url(
+            media_group.get('media-thumbnail', {}).get('@attributes', {}).get('url'))
+        timestamp = parse_iso8601(feed.get('pubDate'), ' ')
+
+        subtitles = {}
+        for media_subtitle in media_group.get('media-subTitle', []):
+            lang = media_subtitle.get('@attributes', {}).get('lang')
+            href = media_subtitle.get('@attributes', {}).get('href')
+            if not lang or not href:
+                continue
+            subtitles[lang] = [{
+                'ext': 'ttml',
+                'url': href,
+            }]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'duration': duration,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+
+class DramaFeverSeriesIE(DramaFeverBaseIE):
+    IE_NAME = 'dramafever:series'
+    _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P<id>[0-9]+)(?:/(?:(?!\d+(?:/|$)).+)?)?$'
+    _TESTS = [{
+        'url': 'http://www.dramafever.com/drama/4512/Cooking_with_Shin/',
+        'info_dict': {
+            'id': '4512',
+            'title': 'Cooking with Shin',
+            'description': 'md5:84a3f26e3cdc3fb7f500211b3593b5c1',
+        },
+        'playlist_count': 4,
+    }, {
+        'url': 'http://www.dramafever.com/drama/124/IRIS/',
+        'info_dict': {
+            'id': '124',
+            'title': 'IRIS',
+            'description': 'md5:b3a30e587cf20c59bd1c01ec0ee1b862',
+        },
+        'playlist_count': 20,
+    }]
+
+    _CONSUMER_SECRET = 'DA59dtVXYLxajktV'
+    _PAGE_SIZE = 60  # max is 60 (see http://api.drama9.com/#get--api-4-episode-series-)
+
+    def _get_consumer_secret(self, video_id):
+        mainjs = self._download_webpage(
+            'http://www.dramafever.com/static/51afe95/df2014/scripts/main.js',
+            video_id, 'Downloading main.js', fatal=False)
+        if not mainjs:
+            return self._CONSUMER_SECRET
+        return self._search_regex(
+            r"var\s+cs\s*=\s*'([^']+)'", mainjs,
+            'consumer secret', default=self._CONSUMER_SECRET)
+
+    def _real_extract(self, url):
+        series_id = self._match_id(url)
+
+        consumer_secret = self._get_consumer_secret(series_id)
+
+        series = self._download_json(
+            'http://www.dramafever.com/api/4/series/query/?cs=%s&series_id=%s'
+            % (consumer_secret, series_id),
+            series_id, 'Downloading series JSON')['series'][series_id]
+
+        title = clean_html(series['name'])
+        description = clean_html(series.get('description') or series.get('description_short'))
+
+        entries = []
+        for page_num in itertools.count(1):
+            episodes = self._download_json(
+                'http://www.dramafever.com/api/4/episode/series/?cs=%s&series_id=%s&page_size=%d&page_number=%d'
+                % (consumer_secret, series_id, self._PAGE_SIZE, page_num),
+                series_id, 'Downloading episodes JSON page #%d' % page_num)
+            for episode in episodes.get('value', []):
+                episode_url = episode.get('episode_url')
+                if not episode_url:
+                    continue
+                entries.append(self.url_result(
+                    compat_urlparse.urljoin(url, episode_url),
+                    'DramaFever', episode.get('guid')))
+            if page_num == episodes['num_pages']:
+                break
+
+        return self.playlist_result(entries, series_id, title, description)
index 7626219baf33522958960bca0d8babe67b8a332e..8b98b013adeee32c67c769acfb88d76edff9a1f7 100644 (file)
@@ -15,7 +15,6 @@ class DRBonanzaIE(InfoExtractor):
 
     _TESTS = [{
         'url': 'http://www.dr.dk/bonanza/serie/portraetter/Talkshowet.htm?assetId=65517',
-        'md5': 'fe330252ddea607635cf2eb2c99a0af3',
         'info_dict': {
             'id': '65517',
             'ext': 'mp4',
@@ -26,6 +25,9 @@ class DRBonanzaIE(InfoExtractor):
             'upload_date': '20110120',
             'duration': 3664,
         },
+        'params': {
+            'skip_download': True,  # requires rtmp
+        },
     }, {
         'url': 'http://www.dr.dk/bonanza/radio/serie/sport/fodbold.htm?assetId=59410',
         'md5': '6dfe039417e76795fb783c52da3de11d',
@@ -93,6 +95,11 @@ class DRBonanzaIE(InfoExtractor):
                         'format_id': file['Type'].replace('Video', ''),
                         'preference': preferencemap.get(file['Type'], -10),
                     })
+                    if format['url'].startswith('rtmp'):
+                        rtmp_url = format['url']
+                        format['rtmp_live'] = True  # --resume does not work
+                        if '/bonanza/' in rtmp_url:
+                            format['play_path'] = rtmp_url.split('/bonanza/')[1]
                     formats.append(format)
                 elif file['Type'] == "Thumb":
                     thumbnail = file['Location']
@@ -111,9 +118,6 @@ class DRBonanzaIE(InfoExtractor):
         description = '%s\n%s\n%s\n' % (
             info['Description'], info['Actors'], info['Colophon'])
 
-        for f in formats:
-            f['url'] = f['url'].replace('rtmp://vod-bonanza.gss.dr.dk/bonanza/', 'http://vodfiles.dr.dk/')
-            f['url'] = f['url'].replace('mp4:bonanza', 'bonanza')
         self._sort_formats(formats)
 
         display_id = re.sub(r'[^\w\d-]', '', re.sub(r' ', '-', title.lower())) + '-' + asset_id
index 3c39ca451a38e69a822968911e758847657380e9..cebdd0193a82eaccc673dffe9d001f766e9e31d1 100644 (file)
@@ -6,9 +6,9 @@ from .common import InfoExtractor
 
 class FazIE(InfoExtractor):
     IE_NAME = 'faz.net'
-    _VALID_URL = r'https?://www\.faz\.net/multimedia/videos/.*?-(?P<id>\d+)\.html'
+    _VALID_URL = r'https?://(?:www\.)?faz\.net/(?:[^/]+/)*.*?-(?P<id>\d+)\.html'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.faz.net/multimedia/videos/stockholm-chemie-nobelpreis-fuer-drei-amerikanische-forscher-12610585.html',
         'info_dict': {
             'id': '12610585',
@@ -16,7 +16,22 @@ class FazIE(InfoExtractor):
             'title': 'Stockholm: Chemie-Nobelpreis für drei amerikanische Forscher',
             'description': 'md5:1453fbf9a0d041d985a47306192ea253',
         },
-    }
+    }, {
+        'url': 'http://www.faz.net/aktuell/politik/berlin-gabriel-besteht-zerreissprobe-ueber-datenspeicherung-13659345.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.faz.net/berlin-gabriel-besteht-zerreissprobe-ueber-datenspeicherung-13659345.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.faz.net/-13659345.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.faz.net/aktuell/politik/-13659345.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.faz.net/foobarblafasel-13659345.html',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
index edf555b2987520618b70bf8bd423c5fc1f60e5a9..b2c984bf272783d8fa71ebe0679066b3d98d25f5 100644 (file)
@@ -18,6 +18,7 @@ from ..utils import (
     parse_duration,
     determine_ext,
 )
+from .dailymotion import DailymotionCloudIE
 
 
 class FranceTVBaseInfoExtractor(InfoExtractor):
@@ -60,7 +61,7 @@ class FranceTVBaseInfoExtractor(InfoExtractor):
                     continue
                 video_url_parsed = compat_urllib_parse_urlparse(video_url)
                 f4m_url = self._download_webpage(
-                    'http://hdfauth.francetv.fr/esi/urltokengen2.html?url=%s' % video_url_parsed.path,
+                    'http://hdfauth.francetv.fr/esi/TA?url=%s' % video_url_parsed.path,
                     video_id, 'Downloading f4m manifest token', fatal=False)
                 if f4m_url:
                     formats.extend(self._extract_f4m_formats(f4m_url, video_id, 1, format_id))
@@ -131,12 +132,26 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):
             'skip_download': 'HLS (reqires ffmpeg)'
         },
         'skip': 'Ce direct est terminé et sera disponible en rattrapage dans quelques minutes.',
+    }, {
+        'url': 'http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html',
+        'md5': 'f485bda6e185e7d15dbc69b72bae993e',
+        'info_dict': {
+            'id': '556e03339473995ee145930c',
+            'ext': 'mp4',
+            'title': 'Les entreprises familiales : le secret de la réussite',
+            'thumbnail': 're:^https?://.*\.jpe?g$',
+        }
     }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         page_title = mobj.group('title')
         webpage = self._download_webpage(url, page_title)
+
+        dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage)
+        if dmcloud_url:
+            return self.url_result(dmcloud_url, 'DailymotionCloud')
+
         video_id, catalogue = self._search_regex(
             r'id-video=([^@]+@[^"]+)', webpage, 'video id').split('@')
         return self._extract_video(video_id, catalogue)
index c8582bda97f2b9704c3d53d53490296f0f8df0ac..7769ffc5c5f425ce04dc92147c77803291f0fdd5 100644 (file)
@@ -42,6 +42,10 @@ from .udn import UDNEmbedIE
 from .senateisvp import SenateISVPIE
 from .bliptv import BlipTVIE
 from .svt import SVTIE
+from .pornhub import PornHubIE
+from .xhamster import XHamsterEmbedIE
+from .vimeo import VimeoIE
+from .dailymotion import DailymotionCloudIE
 
 
 class GenericIE(InfoExtractor):
@@ -332,6 +336,15 @@ class GenericIE(InfoExtractor):
                 'skip_download': True,
             },
         },
+        # XHamster embed
+        {
+            'url': 'http://www.numisc.com/forum/showthread.php?11696-FM15-which-pumiscer-was-this-%28-vid-%29-%28-alfa-as-fuck-srx-%29&s=711f5db534502e22260dec8c5e2d66d8',
+            'info_dict': {
+                'id': 'showthread',
+                'title': '[NSFL] [FM15] which pumiscer was this ( vid ) ( alfa as fuck srx )',
+            },
+            'playlist_mincount': 7,
+        },
         # Embedded TED video
         {
             'url': 'http://en.support.wordpress.com/videos/ted-talks/',
@@ -811,6 +824,29 @@ class GenericIE(InfoExtractor):
                 'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.',
                 'uploader': 'Rogers Sportsnet',
             },
+        },
+        # Dailymotion Cloud video
+        {
+            'url': 'http://replay.publicsenat.fr/vod/le-debat/florent-kolandjian,dominique-cena,axel-decourtye,laurence-abeille,bruno-parmentier/175910',
+            'md5': '49444254273501a64675a7e68c502681',
+            'info_dict': {
+                'id': '5585de919473990de4bee11b',
+                'ext': 'mp4',
+                'title': 'Le débat',
+                'thumbnail': 're:^https?://.*\.jpe?g$',
+            }
+        },
+        # AdobeTVVideo embed
+        {
+            'url': 'https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners',
+            'md5': '43662b577c018ad707a63766462b1e87',
+            'info_dict': {
+                'id': '2456',
+                'ext': 'mp4',
+                'title': 'New experience with Acrobat DC',
+                'description': 'New experience with Acrobat DC',
+                'duration': 248.667,
+            },
         }
     ]
 
@@ -1090,18 +1126,9 @@ class GenericIE(InfoExtractor):
         if matches:
             return _playlist_from_matches(matches, ie='RtlNl')
 
-        # Look for embedded (iframe) Vimeo player
-        mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
-        if mobj:
-            player_url = unescapeHTML(mobj.group('url'))
-            surl = smuggle_url(player_url, {'Referer': url})
-            return self.url_result(surl)
-        # Look for embedded (swf embed) Vimeo player
-        mobj = re.search(
-            r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
-        if mobj:
-            return self.url_result(mobj.group(1))
+        vimeo_url = VimeoIE._extract_vimeo_url(url, webpage)
+        if vimeo_url is not None:
+            return self.url_result(vimeo_url)
 
         # Look for embedded YouTube player
         matches = re.findall(r'''(?x)
@@ -1323,6 +1350,16 @@ class GenericIE(InfoExtractor):
         if sportbox_urls:
             return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed')
 
+        # Look for embedded PornHub player
+        pornhub_url = PornHubIE._extract_url(webpage)
+        if pornhub_url:
+            return self.url_result(pornhub_url, 'PornHub')
+
+        # Look for embedded XHamster player
+        xhamster_urls = XHamsterEmbedIE._extract_urls(webpage)
+        if xhamster_urls:
+            return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed')
+
         # Look for embedded Tvigle player
         mobj = re.search(
             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage)
@@ -1490,6 +1527,20 @@ class GenericIE(InfoExtractor):
         if senate_isvp_url:
             return self.url_result(senate_isvp_url, 'SenateISVP')
 
+        # Look for Dailymotion Cloud videos
+        dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage)
+        if dmcloud_url:
+            return self.url_result(dmcloud_url, 'DailymotionCloud')
+
+        # Look for AdobeTVVideo embeds
+        mobj = re.search(
+            r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]',
+            webpage)
+        if mobj is not None:
+            return self.url_result(
+                self._proto_relative_url(unescapeHTML(mobj.group(1))),
+                'AdobeTVVideo')
+
         def check_video(vurl):
             if YoutubeIE.suitable(vurl):
                 return True
index f29df36b5bf6bd7e732ad84cbfd7d3eeb412f5ff..4bb574cf37df2421721b088ded37a3fc66c8c2ea 100644 (file)
@@ -46,7 +46,7 @@ class ImdbIE(InfoExtractor):
             format_info = info['videoPlayerObject']['video']
             formats.append({
                 'format_id': f_id,
-                'url': format_info['url'],
+                'url': format_info['videoInfoList'][0]['videoUrl'],
             })
 
         return {
index 42cb6e35f821256e90c8eef4f176812e3e0f42d0..f8cbca7b36afab1890b71806d6761bbe67d7d924 100644 (file)
@@ -8,6 +8,7 @@ from ..compat import compat_urlparse
 from ..utils import (
     determine_ext,
     int_or_none,
+    remove_end,
     unified_strdate,
     ExtractorError,
 )
@@ -39,7 +40,6 @@ class LifeNewsIE(InfoExtractor):
             'title': 'В Сети появилось видео захвата «Правым сектором» колхозных полей ',
             'description': 'Жители двух поселков Днепропетровской области не простили радикалам угрозу лишения плодородных земель и пошли в лобовую. ',
             'upload_date': '20150402',
-            'uploader': 'embed.life.ru',
         }
     }, {
         'url': 'http://lifenews.ru/news/153461',
@@ -50,7 +50,6 @@ class LifeNewsIE(InfoExtractor):
             'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве',
             'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.',
             'upload_date': '20150505',
-            'uploader': 'embed.life.ru',
         }
     }, {
         'url': 'http://lifenews.ru/video/13035',
@@ -72,20 +71,20 @@ class LifeNewsIE(InfoExtractor):
         if not videos and not iframe_link:
             raise ExtractorError('No media links available for %s' % video_id)
 
-        title = self._og_search_title(webpage)
-        TITLE_SUFFIX = ' - Первый по срочным новостям — LIFE | NEWS'
-        if title.endswith(TITLE_SUFFIX):
-            title = title[:-len(TITLE_SUFFIX)]
+        title = remove_end(
+            self._og_search_title(webpage),
+            ' - Первый по срочным новостям — LIFE | NEWS')
 
         description = self._og_search_description(webpage)
 
         view_count = self._html_search_regex(
             r'<div class=\'views\'>\s*(\d+)\s*</div>', webpage, 'view count', fatal=False)
         comment_count = self._html_search_regex(
-            r'<div class=\'comments\'>\s*<span class=\'counter\'>\s*(\d+)\s*</span>', webpage, 'comment count', fatal=False)
+            r'=\'commentCount\'[^>]*>\s*(\d+)\s*<',
+            webpage, 'comment count', fatal=False)
 
         upload_date = self._html_search_regex(
-            r'<time datetime=\'([^\']+)\'>', webpage, 'upload date', fatal=False)
+            r'<time[^>]*datetime=\'([^\']+)\'', webpage, 'upload date', fatal=False)
         if upload_date is not None:
             upload_date = unified_strdate(upload_date)
 
index 35822067f908f0567e8dcb8c9c8265df4d3421c2..857edfde263196d9bf2811568cc9f9de90eed92b 100644 (file)
@@ -40,6 +40,17 @@ class LiveLeakIE(InfoExtractor):
             'title': 'Man is Fatally Struck by Reckless Car While Packing up a Moving Truck',
             'age_limit': 18,
         }
+    }, {
+        # Covers https://github.com/rg3/youtube-dl/pull/5983
+        'url': 'http://www.liveleak.com/view?i=801_1409392012',
+        'md5': '0b3bec2d888c20728ca2ad3642f0ef15',
+        'info_dict': {
+            'id': '801_1409392012',
+            'ext': 'mp4',
+            'description': "Happened on 27.7.2014. \r\nAt 0:53 you can see people still swimming at near beach.",
+            'uploader': 'bony333',
+            'title': 'Crazy Hungarian tourist films close call waterspout in Croatia'
+        }
     }]
 
     def _real_extract(self, url):
@@ -85,7 +96,10 @@ class LiveLeakIE(InfoExtractor):
             'url': s['file'],
         } for i, s in enumerate(sources)]
         for i, s in enumerate(sources):
-            orig_url = s['file'].replace('.h264_base.mp4', '')
+            # Removing '.h264_*.mp4' gives the raw video, which is essentially
+            # the same video without the LiveLeak logo at the top (see
+            # https://github.com/rg3/youtube-dl/pull/4768)
+            orig_url = re.sub(r'\.h264_.+?\.mp4', '', s['file'])
             if s['file'] != orig_url:
                 formats.append({
                     'format_id': 'original-%s' % i,
index 3cecebf95a4acd0a388da5984aebc2822e79b32c..0f8aa5adad5b2247621ce00249f3bd03a33a104a 100644 (file)
@@ -182,7 +182,6 @@ class NiconicoIE(InfoExtractor):
         extension = xpath_text(video_info, './/movie_type')
         if not extension:
             extension = determine_ext(video_real_url)
-        video_format = extension.upper()
 
         thumbnail = (
             xpath_text(video_info, './/thumbnail_url') or
@@ -241,7 +240,7 @@ class NiconicoIE(InfoExtractor):
             'url': video_real_url,
             'title': title,
             'ext': extension,
-            'format': video_format,
+            'format_id': 'economy' if video_real_url.endswith('low') else 'normal',
             'thumbnail': thumbnail,
             'description': description,
             'uploader': uploader,
index 5bbd2dcf66294f5f0e21b6aae000f9ddecd5c051..a53e27b274eaa21ac15a1dc5077001d520832696 100644 (file)
@@ -195,7 +195,7 @@ class NocoIE(InfoExtractor):
         if episode_number:
             title += ' #' + compat_str(episode_number)
         if episode:
-            title += ' - ' + episode
+            title += ' - ' + compat_str(episode)
 
         description = show.get('show_resume') or show.get('family_resume')
 
diff --git a/youtube_dl/extractor/pinkbike.py b/youtube_dl/extractor/pinkbike.py
new file mode 100644 (file)
index 0000000..a52210f
--- /dev/null
@@ -0,0 +1,96 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    remove_end,
+    remove_start,
+    str_to_int,
+    unified_strdate,
+)
+
+
+class PinkbikeIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:(?:www\.)?pinkbike\.com/video/|es\.pinkbike\.org/i/kvid/kvid-y5\.swf\?id=)(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://www.pinkbike.com/video/402811/',
+        'md5': '4814b8ca7651034cd87e3361d5c2155a',
+        'info_dict': {
+            'id': '402811',
+            'ext': 'mp4',
+            'title': 'Brandon Semenuk - RAW 100',
+            'description': 'Official release: www.redbull.ca/rupertwalker',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 100,
+            'upload_date': '20150406',
+            'uploader': 'revelco',
+            'location': 'Victoria, British Columbia, Canada',
+            'view_count': int,
+            'comment_count': int,
+        }
+    }, {
+        'url': 'http://es.pinkbike.org/i/kvid/kvid-y5.swf?id=406629',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'http://www.pinkbike.com/video/%s' % video_id, video_id)
+
+        formats = []
+        for _, format_id, src in re.findall(
+                r'data-quality=((?:\\)?["\'])(.+?)\1[^>]+src=\1(.+?)\1', webpage):
+            height = int_or_none(self._search_regex(
+                r'^(\d+)[pP]$', format_id, 'height', default=None))
+            formats.append({
+                'url': src,
+                'format_id': format_id,
+                'height': height,
+            })
+        self._sort_formats(formats)
+
+        title = remove_end(self._og_search_title(webpage), ' Video - Pinkbike')
+        description = self._html_search_regex(
+            r'(?s)id="media-description"[^>]*>(.+?)<',
+            webpage, 'description', default=None) or remove_start(
+            self._og_search_description(webpage), title + '. ')
+        thumbnail = self._og_search_thumbnail(webpage)
+        duration = int_or_none(self._html_search_meta(
+            'video:duration', webpage, 'duration'))
+
+        uploader = self._search_regex(
+            r'un:\s*"([^"]+)"', webpage, 'uploader', fatal=False)
+        upload_date = unified_strdate(self._search_regex(
+            r'class="fullTime"[^>]+title="([^"]+)"',
+            webpage, 'upload date', fatal=False))
+
+        location = self._html_search_regex(
+            r'(?s)<dt>Location</dt>\s*<dd>(.+?)<',
+            webpage, 'location', fatal=False)
+
+        def extract_count(webpage, label):
+            return str_to_int(self._search_regex(
+                r'<span[^>]+class="stat-num"[^>]*>([\d,.]+)</span>\s*<span[^>]+class="stat-label"[^>]*>%s' % label,
+                webpage, label, fatal=False))
+
+        view_count = extract_count(webpage, 'Views')
+        comment_count = extract_count(webpage, 'Comments')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'upload_date': upload_date,
+            'uploader': uploader,
+            'location': location,
+            'view_count': view_count,
+            'comment_count': comment_count,
+            'formats': formats
+        }
index daa284ea28be63f9677471f16a0b8102f32b560e..8172bc9976755f7cc4361e1f6dba8d9d7b53d5fd 100644 (file)
@@ -19,8 +19,8 @@ from ..aes import (
 
 
 class PornHubIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?pornhub\.com/view_video\.php\?viewkey=(?P<id>[0-9a-f]+)'
-    _TEST = {
+    _VALID_URL = r'https?://(?:www\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P<id>[0-9a-z]+)'
+    _TESTS = [{
         'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
         'md5': '882f488fa1f0026f023f33576004a2ed',
         'info_dict': {
@@ -30,7 +30,17 @@ class PornHubIE(InfoExtractor):
             "title": "Seductive Indian beauty strips down and fingers her pink pussy",
             "age_limit": 18
         }
-    }
+    }, {
+        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def _extract_url(cls, webpage):
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?pornhub\.com/embed/\d+)\1', webpage)
+        if mobj:
+            return mobj.group('url')
 
     def _extract_count(self, pattern, webpage, name):
         return str_to_int(self._search_regex(
@@ -39,7 +49,8 @@ class PornHubIE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        req = compat_urllib_request.Request(url)
+        req = compat_urllib_request.Request(
+            'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id)
         req.add_header('Cookie', 'age_verified=1')
         webpage = self._download_webpage(req, video_id)
 
index 255d4abc131519ec470ccdc2b1a64b7d38d9f44b..536a42dc88a4e17bbd039289508521d1ea13e282 100644 (file)
@@ -177,6 +177,7 @@ class ProSiebenSat1IE(InfoExtractor):
         r'<header class="clearfix">\s*<h3>(.+?)</h3>',
         r'<!-- start video -->\s*<h1>(.+?)</h1>',
         r'<h1 class="att-name">\s*(.+?)</h1>',
+        r'<header class="module_header">\s*<h2>([^<]+)</h2>\s*</header>',
     ]
     _DESCRIPTION_REGEXES = [
         r'<p itemprop="description">\s*(.+?)</p>',
@@ -206,8 +207,8 @@ class ProSiebenSat1IE(InfoExtractor):
     def _extract_clip(self, url, webpage):
         clip_id = self._html_search_regex(self._CLIPID_REGEXES, webpage, 'clip id')
 
-        access_token = 'testclient'
-        client_name = 'kolibri-1.2.5'
+        access_token = 'prosieben'
+        client_name = 'kolibri-1.12.6'
         client_location = url
 
         videos_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos?%s' % compat_urllib_parse.urlencode({
@@ -275,13 +276,17 @@ class ProSiebenSat1IE(InfoExtractor):
         for source in urls_sources:
             protocol = source['protocol']
             if protocol == 'rtmp' or protocol == 'rtmpe':
-                mobj = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', source['url'])
+                mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source['url'])
                 if not mobj:
                     continue
+                path = mobj.group('path')
+                mp4colon_index = path.rfind('mp4:')
+                app = path[:mp4colon_index]
+                play_path = path[mp4colon_index:]
                 formats.append({
-                    'url': mobj.group('url'),
-                    'app': mobj.group('app'),
-                    'play_path': mobj.group('playpath'),
+                    'url': '%s/%s' % (mobj.group('url'), app),
+                    'app': app,
+                    'play_path': play_path,
                     'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf',
                     'page_url': 'http://www.prosieben.de',
                     'vbr': fix_bitrate(source['bitrate']),
index 10251f29e033ef241618ed7985e214dc0e76cd51..f3c80708c86ab2fc29fbd029b245bbe894af2dfb 100644 (file)
@@ -83,7 +83,7 @@ class SafariIE(SafariBaseIE):
                                     library/view/[^/]+|
                                     api/v1/book
                                 )/
-                                (?P<course_id>\d+)/
+                                (?P<course_id>[^/]+)/
                                     (?:chapter(?:-content)?/)?
                                 (?P<part>part\d+)\.html
     '''
@@ -100,6 +100,10 @@ class SafariIE(SafariBaseIE):
     }, {
         'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html',
         'only_matching': True,
+    }, {
+        # non-digits in course id
+        'url': 'https://www.safaribooksonline.com/library/view/create-a-nodejs/100000006A0210/part00.html',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
@@ -122,7 +126,7 @@ class SafariCourseIE(SafariBaseIE):
     IE_NAME = 'safari:course'
     IE_DESC = 'safaribooksonline.com online courses'
 
-    _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/(?:library/view/[^/]+|api/v1/book)/(?P<id>\d+)/?(?:[#?]|$)'
+    _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/(?:library/view/[^/]+|api/v1/book)/(?P<id>[^/]+)/?(?:[#?]|$)'
 
     _TESTS = [{
         'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/',
index 29bd9ce6f76247b5ac997050075c7e720d8a3b2b..ba2d5e19bc0d1de322b4b12ed5b8c0dc31157f7f 100644 (file)
@@ -6,9 +6,12 @@ import re
 from .common import InfoExtractor
 from ..compat import (
     compat_str,
-    compat_urllib_request
+    compat_urllib_request,
+    compat_urllib_parse,
+)
+from ..utils import (
+    ExtractorError,
 )
-from ..utils import ExtractorError
 
 
 class SohuIE(InfoExtractor):
@@ -26,7 +29,7 @@ class SohuIE(InfoExtractor):
         'skip': 'On available in China',
     }, {
         'url': 'http://tv.sohu.com/20150305/n409385080.shtml',
-        'md5': 'ac9a5d322b4bf9ae184d53e4711e4f1a',
+        'md5': '699060e75cf58858dd47fb9c03c42cfb',
         'info_dict': {
             'id': '409385080',
             'ext': 'mp4',
@@ -34,7 +37,7 @@ class SohuIE(InfoExtractor):
         }
     }, {
         'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml',
-        'md5': '49308ff6dafde5ece51137d04aec311e',
+        'md5': '9bf34be48f2f4dadcb226c74127e203c',
         'info_dict': {
             'id': '78693464',
             'ext': 'mp4',
@@ -48,7 +51,7 @@ class SohuIE(InfoExtractor):
             'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
         },
         'playlist': [{
-            'md5': '492923eac023ba2f13ff69617c32754a',
+            'md5': 'bdbfb8f39924725e6589c146bc1883ad',
             'info_dict': {
                 'id': '78910339_part1',
                 'ext': 'mp4',
@@ -56,7 +59,7 @@ class SohuIE(InfoExtractor):
                 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
             }
         }, {
-            'md5': 'de604848c0e8e9c4a4dde7e1347c0637',
+            'md5': '3e1f46aaeb95354fd10e7fca9fc1804e',
             'info_dict': {
                 'id': '78910339_part2',
                 'ext': 'mp4',
@@ -64,7 +67,7 @@ class SohuIE(InfoExtractor):
                 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
             }
         }, {
-            'md5': '93584716ee0657c0b205b8aa3d27aa13',
+            'md5': '8407e634175fdac706766481b9443450',
             'info_dict': {
                 'id': '78910339_part3',
                 'ext': 'mp4',
@@ -139,21 +142,42 @@ class SohuIE(InfoExtractor):
         for i in range(part_count):
             formats = []
             for format_id, format_data in formats_json.items():
+                allot = format_data['allot']
+
                 data = format_data['data']
+                clips_url = data['clipsURL']
+                su = data['su']
 
-                # URLs starts with http://newflv.sohu.ccgslb.net/ is not usable
-                # so retry until got a working URL
                 video_url = 'newflv.sohu.ccgslb.net'
+                cdnId = None
                 retries = 0
-                while 'newflv.sohu.ccgslb.net' in video_url and retries < 5:
-                    download_note = 'Download information from CDN gateway for format ' + format_id
+
+                while 'newflv.sohu.ccgslb.net' in video_url:
+                    params = {
+                        'prot': 9,
+                        'file': clips_url[i],
+                        'new': su[i],
+                        'prod': 'flash',
+                    }
+
+                    if cdnId is not None:
+                        params['idc'] = cdnId
+
+                    download_note = 'Downloading %s video URL part %d of %d' % (
+                        format_id, i + 1, part_count)
+
                     if retries > 0:
                         download_note += ' (retry #%d)' % retries
+                    part_info = self._parse_json(self._download_webpage(
+                        'http://%s/?%s' % (allot, compat_urllib_parse.urlencode(params)),
+                        video_id, download_note), video_id)
+
+                    video_url = part_info['url']
+                    cdnId = part_info.get('nid')
+
                     retries += 1
-                    cdn_info = self._download_json(
-                        'http://data.vod.itc.cn/cdnList?new=' + data['su'][i],
-                        video_id, download_note)
-                    video_url = cdn_info['url']
+                    if retries > 5:
+                        raise ExtractorError('Failed to get video URL')
 
                 formats.append({
                     'url': video_url,
index 06d6e6640637f2ff3507c3fa1c68339fbb8d98b4..bff75d6b2945584e0193b50ff8915b91fec26f1f 100644 (file)
@@ -27,7 +27,7 @@ class SpankwireIE(InfoExtractor):
             'description': 'Crazy Bitch X rated music video.',
             'uploader': 'oreusz',
             'uploader_id': '124697',
-            'upload_date': '20070508',
+            'upload_date': '20070507',
             'age_limit': 18,
         }
     }
@@ -44,7 +44,7 @@ class SpankwireIE(InfoExtractor):
         title = self._html_search_regex(
             r'<h1>([^<]+)', webpage, 'title')
         description = self._html_search_regex(
-            r'<div\s+id="descriptionContent">([^<]+)<',
+            r'(?s)<div\s+id="descriptionContent">(.+?)</div>',
             webpage, 'description', fatal=False)
         thumbnail = self._html_search_regex(
             r'playerData\.screenShot\s*=\s*["\']([^"\']+)["\']',
@@ -64,12 +64,12 @@ class SpankwireIE(InfoExtractor):
             r'<div id="viewsCounter"><span>([\d,\.]+)</span> views</div>',
             webpage, 'view count', fatal=False))
         comment_count = str_to_int(self._html_search_regex(
-            r'Comments<span[^>]+>\s*\(([\d,\.]+)\)</span>',
+            r'<span\s+id="spCommentCount"[^>]*>([\d,\.]+)</span>',
             webpage, 'comment count', fatal=False))
 
         video_urls = list(map(
             compat_urllib_parse.unquote,
-            re.findall(r'playerData\.cdnPath[0-9]{3,}\s*=\s*["\']([^"\']+)["\']', webpage)))
+            re.findall(r'playerData\.cdnPath[0-9]{3,}\s*=\s*(?:encodeURIComponent\()?["\']([^"\']+)["\']', webpage)))
         if webpage.find('flashvars\.encrypted = "true"') != -1:
             password = self._search_regex(
                 r'flashvars\.video_title = "([^"]+)',
index e6218808f29116e8d2d0d39f2a61342eef1bc823..9ead13a91dd1851085053c9be10cc4a23215dd5a 100644 (file)
@@ -4,6 +4,8 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from .pornhub import PornHubIE
+from .vimeo import VimeoIE
 
 
 class TumblrIE(InfoExtractor):
@@ -39,6 +41,17 @@ class TumblrIE(InfoExtractor):
             'timestamp': 1430931613,
         },
         'add_ie': ['Vidme'],
+    }, {
+        'url': 'http://camdamage.tumblr.com/post/98846056295/',
+        'md5': 'a9e0c8371ea1ca306d6554e3fecf50b6',
+        'info_dict': {
+            'id': '105463834',
+            'ext': 'mp4',
+            'title': 'Cam Damage-HD 720p',
+            'uploader': 'John Moyer',
+            'uploader_id': 'user32021558',
+        },
+        'add_ie': ['Vimeo'],
     }]
 
     def _real_extract(self, url):
@@ -55,6 +68,14 @@ class TumblrIE(InfoExtractor):
         if vid_me_embed_url is not None:
             return self.url_result(vid_me_embed_url, 'Vidme')
 
+        pornhub_url = PornHubIE._extract_url(webpage)
+        if pornhub_url:
+            return self.url_result(pornhub_url, 'PornHub')
+
+        vimeo_url = VimeoIE._extract_vimeo_url(url, webpage)
+        if vimeo_url:
+            return self.url_result(vimeo_url, 'Vimeo')
+
         iframe_url = self._search_regex(
             r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'',
             webpage, 'iframe url')
index 6b5d80aeed9f7458a8136f705c386d213b1e5840..3a4f393fcf6d79f3f42970db7aab853d5efedf84 100644 (file)
@@ -27,7 +27,7 @@ class TVCIE(InfoExtractor):
     @classmethod
     def _extract_url(cls, webpage):
         mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>(?:http://)?(?:www\.)?tvc\.ru/video/iframe/id/[^"]+)\1', webpage)
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:http:)?//(?:www\.)?tvc\.ru/video/iframe/id/[^"]+)\1', webpage)
         if mobj:
             return mobj.group('url')
 
index e83e31a31640fa32e4a19a48a745d279a14d3753..79863e781fd41101c76659ab3b43a85433d25665 100644 (file)
@@ -26,6 +26,7 @@ class TVPlayIE(InfoExtractor):
            viasat4play\.no/programmer|
            tv6play\.no/programmer|
            tv3play\.dk/programmer|
+           play\.novatv\.bg/programi
         )/[^/]+/(?P<id>\d+)
         '''
     _TESTS = [
@@ -173,6 +174,22 @@ class TVPlayIE(InfoExtractor):
                 'skip_download': True,
             },
         },
+        {
+            'url': 'http://play.novatv.bg/programi/zdravei-bulgariya/624952?autostart=true',
+            'info_dict': {
+                'id': '624952',
+                'ext': 'flv',
+                'title': 'Здравей, България (12.06.2015 г.) ',
+                'description': 'md5:99f3700451ac5bb71a260268b8daefd7',
+                'duration': 8838,
+                'timestamp': 1434100372,
+                'upload_date': '20150612',
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+        },
     ]
 
     def _real_extract(self, url):
index dd026748dcbb536f9f49181b0d211bf0a9157777..722eb52368825b92c88506ff33d79bf1f2f91a32 100644 (file)
@@ -5,6 +5,7 @@ from .common import InfoExtractor
 from ..compat import (
     compat_urllib_parse,
     compat_urllib_request,
+    compat_urlparse,
 )
 from ..utils import (
     ExtractorError,
@@ -26,11 +27,21 @@ class Vbox7IE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        redirect_page, urlh = self._download_webpage_handle(url, video_id)
-        new_location = self._search_regex(r'window\.location = \'(.*)\';',
-                                          redirect_page, 'redirect location')
-        redirect_url = urlh.geturl() + new_location
-        webpage = self._download_webpage(redirect_url, video_id,
+        # need to get the page 3 times for the correct jsSecretToken cookie
+        # which is necessary for the correct title
+        def get_session_id():
+            redirect_page = self._download_webpage(url, video_id)
+            session_id_url = self._search_regex(
+                r'var\s*url\s*=\s*\'([^\']+)\';', redirect_page,
+                'session id url')
+            self._download_webpage(
+                compat_urlparse.urljoin(url, session_id_url), video_id,
+                'Getting session id')
+
+        get_session_id()
+        get_session_id()
+
+        webpage = self._download_webpage(url, video_id,
                                          'Downloading redirect page')
 
         title = self._html_search_regex(r'<title>(.*)</title>',
index 7f2fb1ca8896e29e48a41a9efddaded987ba1e96..51cdc6b65143aaf4a0d2823ffa8c859c96e25972 100644 (file)
@@ -1,5 +1,7 @@
+# coding: utf-8
 from __future__ import unicode_literals
 
+import json
 import time
 import hmac
 import hashlib
@@ -11,6 +13,7 @@ from ..utils import (
     parse_age_limit,
     parse_iso8601,
 )
+from ..compat import compat_urllib_request
 from .common import InfoExtractor
 
 
@@ -23,27 +26,35 @@ class VikiBaseIE(InfoExtractor):
     _APP_VERSION = '2.2.5.1428709186'
     _APP_SECRET = '-$iJ}@p7!G@SyU/je1bEyWg}upLu-6V6-Lg9VD(]siH,r.,m-r|ulZ,U4LC/SeR)'
 
-    def _prepare_call(self, path, timestamp=None):
+    _NETRC_MACHINE = 'viki'
+
+    _token = None
+
+    def _prepare_call(self, path, timestamp=None, post_data=None):
         path += '?' if '?' not in path else '&'
         if not timestamp:
             timestamp = int(time.time())
         query = self._API_QUERY_TEMPLATE % (path, self._APP, timestamp)
+        if self._token:
+            query += '&token=%s' % self._token
         sig = hmac.new(
             self._APP_SECRET.encode('ascii'),
             query.encode('ascii'),
             hashlib.sha1
         ).hexdigest()
-        return self._API_URL_TEMPLATE % (query, sig)
+        url = self._API_URL_TEMPLATE % (query, sig)
+        return compat_urllib_request.Request(
+            url, json.dumps(post_data).encode('utf-8')) if post_data else url
 
-    def _call_api(self, path, video_id, note, timestamp=None):
+    def _call_api(self, path, video_id, note, timestamp=None, post_data=None):
         resp = self._download_json(
-            self._prepare_call(path, timestamp), video_id, note)
+            self._prepare_call(path, timestamp, post_data), video_id, note)
 
         error = resp.get('error')
         if error:
             if error == 'invalid timestamp':
                 resp = self._download_json(
-                    self._prepare_call(path, int(resp['current_timestamp'])),
+                    self._prepare_call(path, int(resp['current_timestamp']), post_data),
                     video_id, '%s (retry)' % note)
                 error = resp.get('error')
             if error:
@@ -56,6 +67,27 @@ class VikiBaseIE(InfoExtractor):
             '%s returned error: %s' % (self.IE_NAME, error),
             expected=True)
 
+    def _real_initialize(self):
+        self._login()
+
+    def _login(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            return
+
+        login_form = {
+            'login_id': username,
+            'password': password,
+        }
+
+        login = self._call_api(
+            'sessions.json', None,
+            'Logging in as %s' % username, post_data=login_form)
+
+        self._token = login.get('token')
+        if not self._token:
+            self.report_warning('Unable to get session token, login has probably failed')
+
 
 class VikiIE(VikiBaseIE):
     IE_NAME = 'viki'
index f300c7ca40eeba75f872bbd6cd79c3f0720e0955..cae90205d30b0917e359df6281646eef64b82def 100644 (file)
@@ -22,6 +22,7 @@ from ..utils import (
     unified_strdate,
     unsmuggle_url,
     urlencode_postdata,
+    unescapeHTML,
 )
 
 
@@ -173,6 +174,21 @@ class VimeoIE(VimeoBaseInfoExtractor):
         },
     ]
 
+    @staticmethod
+    def _extract_vimeo_url(url, webpage):
+        # Look for embedded (iframe) Vimeo player
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
+        if mobj:
+            player_url = unescapeHTML(mobj.group('url'))
+            surl = smuggle_url(player_url, {'Referer': url})
+            return surl
+        # Look for embedded (swf embed) Vimeo player
+        mobj = re.search(
+            r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
+        if mobj:
+            return mobj.group(1)
+
     def _verify_video_password(self, url, video_id, webpage):
         password = self._downloader.params.get('videopassword', None)
         if password is None:
index cc384adbf9837f35f90c64d0e8dc0396b0b601ec..38ff3c1a949c0511d08518b77180be653defce62 100644 (file)
@@ -13,6 +13,7 @@ from ..compat import (
 from ..utils import (
     ExtractorError,
     orderedSet,
+    str_to_int,
     unescapeHTML,
     unified_strdate,
 )
@@ -34,6 +35,7 @@ class VKIE(InfoExtractor):
                 'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*',
                 'duration': 195,
                 'upload_date': '20120212',
+                'view_count': int,
             },
         },
         {
@@ -45,7 +47,8 @@ class VKIE(InfoExtractor):
                 'uploader': 'Tom Cruise',
                 'title': 'No name',
                 'duration': 9,
-                'upload_date': '20130721'
+                'upload_date': '20130721',
+                'view_count': int,
             }
         },
         {
@@ -59,6 +62,7 @@ class VKIE(InfoExtractor):
                 'title': 'Lin Dan',
                 'duration': 101,
                 'upload_date': '20120730',
+                'view_count': int,
             }
         },
         {
@@ -73,7 +77,8 @@ class VKIE(InfoExtractor):
                 'uploader': 'Триллеры',
                 'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]',
                 'duration': 8352,
-                'upload_date': '20121218'
+                'upload_date': '20121218',
+                'view_count': int,
             },
             'skip': 'Requires vk account credentials',
         },
@@ -100,6 +105,7 @@ class VKIE(InfoExtractor):
                 'title': 'Книга Илая',
                 'duration': 6771,
                 'upload_date': '20140626',
+                'view_count': int,
             },
             'skip': 'Only works from Russia',
         },
@@ -119,8 +125,8 @@ class VKIE(InfoExtractor):
             'act': 'login',
             'role': 'al_frame',
             'expire': '1',
-            'email': username,
-            'pass': password,
+            'email': username.encode('cp1251'),
+            'pass': password.encode('cp1251'),
         }
 
         request = compat_urllib_request.Request('https://login.vk.com/?act=login',
@@ -175,25 +181,29 @@ class VKIE(InfoExtractor):
                 m_rutube.group(1).replace('\\', ''))
             return self.url_result(rutube_url)
 
-        m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.*?});', info_page)
+        m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.+?});', info_page)
         if m_opts:
-            m_opts_url = re.search(r"url\s*:\s*'([^']+)", m_opts.group(1))
+            m_opts_url = re.search(r"url\s*:\s*'((?!/\b)[^']+)", m_opts.group(1))
             if m_opts_url:
                 opts_url = m_opts_url.group(1)
                 if opts_url.startswith('//'):
                     opts_url = 'http:' + opts_url
                 return self.url_result(opts_url)
 
-        data_json = self._search_regex(r'var vars = ({.*?});', info_page, 'vars')
+        data_json = self._search_regex(r'var\s+vars\s*=\s*({.+?});', info_page, 'vars')
         data = json.loads(data_json)
 
         # Extract upload date
         upload_date = None
-        mobj = re.search(r'id="mv_date_wrap".*?Added ([a-zA-Z]+ [0-9]+), ([0-9]+) at', info_page)
+        mobj = re.search(r'id="mv_date(?:_views)?_wrap"[^>]*>([a-zA-Z]+ [0-9]+), ([0-9]+) at', info_page)
         if mobj is not None:
             mobj.group(1) + ' ' + mobj.group(2)
             upload_date = unified_strdate(mobj.group(1) + ' ' + mobj.group(2))
 
+        view_count = str_to_int(self._search_regex(
+            r'"mv_views_count_number"[^>]*>([\d,.]+) views<',
+            info_page, 'view count', fatal=False))
+
         formats = [{
             'format_id': k,
             'url': v,
@@ -210,6 +220,7 @@ class VKIE(InfoExtractor):
             'uploader': data.get('md_author'),
             'duration': data.get('duration'),
             'upload_date': upload_date,
+            'view_count': view_count,
         }
 
 
index 4527567f8fc26c45b091aa868dc77b159576b56c..b4ad513a0d18ecbae558deceb6234efaa5ce5415 100644 (file)
@@ -13,7 +13,6 @@ from ..utils import (
 
 
 class XHamsterIE(InfoExtractor):
-    """Information Extractor for xHamster"""
     _VALID_URL = r'(?P<proto>https?)://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?'
     _TESTS = [
         {
@@ -133,3 +132,36 @@ class XHamsterIE(InfoExtractor):
             'age_limit': age_limit,
             'formats': formats,
         }
+
+
+class XHamsterEmbedIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?xhamster\.com/xembed\.php\?video=(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://xhamster.com/xembed.php?video=3328539',
+        'info_dict': {
+            'id': '3328539',
+            'ext': 'mp4',
+            'title': 'Pen Masturbation',
+            'upload_date': '20140728',
+            'uploader_id': 'anonymous',
+            'duration': 5,
+            'age_limit': 18,
+        }
+    }
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return [url for _, url in re.findall(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?xhamster\.com/xembed\.php\?video=\d+)\1',
+            webpage)]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_url = self._search_regex(
+            r'href="(https?://xhamster\.com/movies/%s/[^"]+\.html[^"]*)"' % video_id,
+            webpage, 'xhamster url')
+
+        return self.url_result(video_url, 'XHamster')
index 2a45dc574263f7e651020e591fcc40bdf987367d..d8415bed49ea8bab3bbc30f7136f31a69ac357c2 100644 (file)
@@ -5,10 +5,12 @@ import re
 from .common import InfoExtractor
 from ..compat import (
     compat_urllib_parse,
+    compat_urllib_request,
 )
 from ..utils import (
     clean_html,
     ExtractorError,
+    determine_ext,
 )
 
 
@@ -25,6 +27,8 @@ class XVideosIE(InfoExtractor):
         }
     }
 
+    _ANDROID_USER_AGENT = 'Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19'
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
@@ -40,9 +44,30 @@ class XVideosIE(InfoExtractor):
         video_thumbnail = self._search_regex(
             r'url_bigthumb=(.+?)&amp', webpage, 'thumbnail', fatal=False)
 
+        formats = [{
+            'url': video_url,
+        }]
+
+        android_req = compat_urllib_request.Request(url)
+        android_req.add_header('User-Agent', self._ANDROID_USER_AGENT)
+        android_webpage = self._download_webpage(android_req, video_id, fatal=False)
+
+        if android_webpage is not None:
+            player_params_str = self._search_regex(
+                'mobileReplacePlayerDivTwoQual\(([^)]+)\)',
+                android_webpage, 'player parameters', default='')
+            player_params = list(map(lambda s: s.strip(' \''), player_params_str.split(',')))
+            if player_params:
+                formats.extend([{
+                    'url': param,
+                    'preference': -10,
+                } for param in player_params if determine_ext(param) == 'mp4'])
+
+        self._sort_formats(formats)
+
         return {
             'id': video_id,
-            'url': video_url,
+            'formats': formats,
             'title': video_title,
             'ext': 'flv',
             'thumbnail': video_thumbnail,
index 97b98bbe88715f644da6bec1709d697af2c8e0e0..ced3a10cd417840796d4240cfd1d8163f3148134 100644 (file)
 # coding: utf-8
-
 from __future__ import unicode_literals
 
-import math
-import random
-import re
-import time
+import base64
 
 from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
+from ..utils import ExtractorError
+
+from ..compat import (
+    compat_urllib_parse,
+    compat_ord,
+    compat_urllib_request,
 )
 
 
 class YoukuIE(InfoExtractor):
+    IE_NAME = 'youku'
     _VALID_URL = r'''(?x)
         (?:
             http://(?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)|
             youku:)
         (?P<id>[A-Za-z0-9]+)(?:\.html|/v\.swf|)
     '''
-    _TEST = {
-        'url': 'http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html',
-        'md5': 'ffe3f2e435663dc2d1eea34faeff5b5b',
-        'params': {
-            'test': False
+
+    _TESTS = [{
+        'url': 'http://v.youku.com/v_show/id_XMTc1ODE5Njcy.html',
+        'md5': '5f3af4192eabacc4501508d54a8cabd7',
+        'info_dict': {
+            'id': 'XMTc1ODE5Njcy_part1',
+            'title': '★Smile﹗♡ Git Fresh -Booty Music舞蹈.',
+            'ext': 'flv'
+        }
+    }, {
+        'url': 'http://player.youku.com/player.php/sid/XNDgyMDQ2NTQw/v.swf',
+        'only_matching': True,
+    }, {
+        'url': 'http://v.youku.com/v_show/id_XODgxNjg1Mzk2_ev_1.html',
+        'info_dict': {
+            'id': 'XODgxNjg1Mzk2',
+            'title': '武媚娘传奇 85',
         },
+        'playlist_count': 11,
+    }, {
+        'url': 'http://v.youku.com/v_show/id_XMTI1OTczNDM5Mg==.html',
         'info_dict': {
-            'id': 'XNDgyMDQ2NTQw_part00',
-            'ext': 'flv',
-            'title': 'youtube-dl test video "\'/\\ä↭𝕐'
+            'id': 'XMTI1OTczNDM5Mg',
+            'title': '花千骨 04',
+        },
+        'playlist_count': 13,
+        'skip': 'Available in China only',
+    }]
+
+    def construct_video_urls(self, data1, data2):
+        # get sid, token
+        def yk_t(s1, s2):
+            ls = list(range(256))
+            t = 0
+            for i in range(256):
+                t = (t + ls[i] + compat_ord(s1[i % len(s1)])) % 256
+                ls[i], ls[t] = ls[t], ls[i]
+            s = bytearray()
+            x, y = 0, 0
+            for i in range(len(s2)):
+                y = (y + 1) % 256
+                x = (x + ls[y]) % 256
+                ls[x], ls[y] = ls[y], ls[x]
+                s.append(compat_ord(s2[i]) ^ ls[(ls[x] + ls[y]) % 256])
+            return bytes(s)
+
+        sid, token = yk_t(
+            b'becaf9be', base64.b64decode(data2['ep'].encode('ascii'))
+        ).decode('ascii').split('_')
+
+        # get oip
+        oip = data2['ip']
+
+        # get fileid
+        string_ls = list(
+            'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890')
+        shuffled_string_ls = []
+        seed = data1['seed']
+        N = len(string_ls)
+        for ii in range(N):
+            seed = (seed * 0xd3 + 0x754f) % 0x10000
+            idx = seed * len(string_ls) // 0x10000
+            shuffled_string_ls.append(string_ls[idx])
+            del string_ls[idx]
+
+        fileid_dict = {}
+        for format in data1['streamtypes']:
+            streamfileid = [
+                int(i) for i in data1['streamfileids'][format].strip('*').split('*')]
+            fileid = ''.join(
+                [shuffled_string_ls[i] for i in streamfileid])
+            fileid_dict[format] = fileid[:8] + '%s' + fileid[10:]
+
+        def get_fileid(format, n):
+            fileid = fileid_dict[format] % hex(int(n))[2:].upper().zfill(2)
+            return fileid
+
+        # get ep
+        def generate_ep(format, n):
+            fileid = get_fileid(format, n)
+            ep_t = yk_t(
+                b'bf7e5f01',
+                ('%s_%s_%s' % (sid, fileid, token)).encode('ascii')
+            )
+            ep = base64.b64encode(ep_t).decode('ascii')
+            return ep
+
+        # generate video_urls
+        video_urls_dict = {}
+        for format in data1['streamtypes']:
+            video_urls = []
+            for dt in data1['segs'][format]:
+                n = str(int(dt['no']))
+                param = {
+                    'K': dt['k'],
+                    'hd': self.get_hd(format),
+                    'myp': 0,
+                    'ts': dt['seconds'],
+                    'ypp': 0,
+                    'ctype': 12,
+                    'ev': 1,
+                    'token': token,
+                    'oip': oip,
+                    'ep': generate_ep(format, n)
+                }
+                video_url = \
+                    'http://k.youku.com/player/getFlvPath/' + \
+                    'sid/' + sid + \
+                    '_' + str(int(n) + 1).zfill(2) + \
+                    '/st/' + self.parse_ext_l(format) + \
+                    '/fileid/' + get_fileid(format, n) + '?' + \
+                    compat_urllib_parse.urlencode(param)
+                video_urls.append(video_url)
+            video_urls_dict[format] = video_urls
+
+        return video_urls_dict
+
+    def get_hd(self, fm):
+        hd_id_dict = {
+            'flv': '0',
+            'mp4': '1',
+            'hd2': '2',
+            'hd3': '3',
+            '3gp': '0',
+            '3gphd': '1'
         }
-    }
-
-    def _gen_sid(self):
-        nowTime = int(time.time() * 1000)
-        random1 = random.randint(1000, 1998)
-        random2 = random.randint(1000, 9999)
-
-        return "%d%d%d" % (nowTime, random1, random2)
-
-    def _get_file_ID_mix_string(self, seed):
-        mixed = []
-        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
-        seed = float(seed)
-        for i in range(len(source)):
-            seed = (seed * 211 + 30031) % 65536
-            index = math.floor(seed / 65536 * len(source))
-            mixed.append(source[int(index)])
-            source.remove(source[int(index)])
-        # return ''.join(mixed)
-        return mixed
-
-    def _get_file_id(self, fileId, seed):
-        mixed = self._get_file_ID_mix_string(seed)
-        ids = fileId.split('*')
-        realId = []
-        for ch in ids:
-            if ch:
-                realId.append(mixed[int(ch)])
-        return ''.join(realId)
+        return hd_id_dict[fm]
+
+    def parse_ext_l(self, fm):
+        ext_dict = {
+            'flv': 'flv',
+            'mp4': 'mp4',
+            'hd2': 'flv',
+            'hd3': 'flv',
+            '3gp': 'flv',
+            '3gphd': 'mp4'
+        }
+        return ext_dict[fm]
+
+    def get_format_name(self, fm):
+        _dict = {
+            '3gp': 'h6',
+            '3gphd': 'h5',
+            'flv': 'h4',
+            'mp4': 'h3',
+            'hd2': 'h2',
+            'hd3': 'h1'
+        }
+        return _dict[fm]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-
-        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
+        video_id = self._match_id(url)
 
-        config = self._download_json(info_url, video_id)
+        def retrieve_data(req_url, note):
+            req = compat_urllib_request.Request(req_url)
 
-        error_code = config['data'][0].get('error_code')
-        if error_code:
-            # -8 means blocked outside China.
-            error = config['data'][0].get('error')  # Chinese and English, separated by newline.
-            raise ExtractorError(error or 'Server reported error %i' % error_code,
-                                 expected=True)
+            cn_verification_proxy = self._downloader.params.get('cn_verification_proxy')
+            if cn_verification_proxy:
+                req.add_header('Ytdl-request-proxy', cn_verification_proxy)
 
-        video_title = config['data'][0]['title']
-        seed = config['data'][0]['seed']
+            raw_data = self._download_json(req, video_id, note=note)
+            return raw_data['data'][0]
 
-        format = self._downloader.params.get('format', None)
-        supported_format = list(config['data'][0]['streamfileids'].keys())
+        # request basic data
+        data1 = retrieve_data(
+            'http://v.youku.com/player/getPlayList/VideoIDS/%s' % video_id,
+            'Downloading JSON metadata 1')
+        data2 = retrieve_data(
+            'http://v.youku.com/player/getPlayList/VideoIDS/%s/Pf/4/ctype/12/ev/1' % video_id,
+            'Downloading JSON metadata 2')
 
-        # TODO proper format selection
-        if format is None or format == 'best':
-            if 'hd2' in supported_format:
-                format = 'hd2'
+        error_code = data1.get('error_code')
+        if error_code:
+            error = data1.get('error')
+            if error is not None and '因版权原因无法观看此视频' in error:
+                raise ExtractorError(
+                    'Youku said: Sorry, this video is available in China only', expected=True)
             else:
-                format = 'flv'
-            ext = 'flv'
-        elif format == 'worst':
-            format = 'mp4'
-            ext = 'mp4'
-        else:
-            format = 'flv'
-            ext = 'flv'
-
-        fileid = config['data'][0]['streamfileids'][format]
-        keys = [s['k'] for s in config['data'][0]['segs'][format]]
-        # segs is usually a dictionary, but an empty *list* if an error occured.
-
-        files_info = []
-        sid = self._gen_sid()
-        fileid = self._get_file_id(fileid, seed)
-
-        # column 8,9 of fileid represent the segment number
-        # fileid[7:9] should be changed
-        for index, key in enumerate(keys):
-            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
-            download_url = 'http://k.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
-
-            info = {
-                'id': '%s_part%02d' % (video_id, index),
-                'url': download_url,
-                'uploader': None,
-                'upload_date': None,
-                'title': video_title,
-                'ext': ext,
-            }
-            files_info.append(info)
-
-        return files_info
+                msg = 'Youku server reported error %i' % error_code
+                if error is not None:
+                    msg += ': ' + error
+                raise ExtractorError(msg)
+
+        title = data1['title']
+
+        # generate video_urls_dict
+        video_urls_dict = self.construct_video_urls(data1, data2)
+
+        # construct info
+        entries = [{
+            'id': '%s_part%d' % (video_id, i + 1),
+            'title': title,
+            'formats': [],
+            # some formats are not available for all parts, we have to detect
+            # which one has all
+        } for i in range(max(len(v) for v in data1['segs'].values()))]
+        for fm in data1['streamtypes']:
+            video_urls = video_urls_dict[fm]
+            for video_url, seg, entry in zip(video_urls, data1['segs'][fm], entries):
+                entry['formats'].append({
+                    'url': video_url,
+                    'format_id': self.get_format_name(fm),
+                    'ext': self.parse_ext_l(fm),
+                    'filesize': int(seg['size']),
+                })
+
+        return {
+            '_type': 'multi_video',
+            'id': video_id,
+            'title': title,
+            'entries': entries,
+        }
index 3448bec4fdc96b361c0b071ab12832583533c14f..a3da56c1413494ffd73d523a042959af16956ff0 100644 (file)
@@ -234,6 +234,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         '44': {'ext': 'webm', 'width': 854, 'height': 480},
         '45': {'ext': 'webm', 'width': 1280, 'height': 720},
         '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
+        '59': {'ext': 'mp4', 'width': 854, 'height': 480},
+        '78': {'ext': 'mp4', 'width': 854, 'height': 480},
 
 
         # 3d videos
@@ -1504,7 +1506,7 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
 
         for pagenum in itertools.count(1):
             url_query = {
-                'search_query': query,
+                'search_query': query.encode('utf-8'),
                 'page': pagenum,
                 'spf': 'navigate',
             }
index 096ab6137460e2298a5a1fa17eb81a36de65be32..6aeca61ee5e099e80a5d98a893afb8adde72e1cd 100644 (file)
@@ -729,7 +729,7 @@ def parseOpts(overrideArguments=None):
         metavar='POLICY', dest='fixup', default='detect_or_warn',
         help='Automatically correct known faults of the file. '
              'One of never (do nothing), warn (only emit a warning), '
-             'detect_or_warn(the default; fix file if we can, warn otherwise)')
+             'detect_or_warn (the default; fix file if we can, warn otherwise)')
     postproc.add_option(
         '--prefer-avconv',
         action='store_false', dest='prefer_ffmpeg',
index 774494efd1dbc9af1901bf67d1453689e846b780..e19dbf73d5fe36c602d9ffb83cd2d02ab39cb5e1 100644 (file)
@@ -35,6 +35,11 @@ class EmbedThumbnailPP(FFmpegPostProcessor):
 
         thumbnail_filename = info['thumbnails'][-1]['filename']
 
+        if not os.path.exists(encodeFilename(thumbnail_filename)):
+            self._downloader.report_warning(
+                'Skipping embedding the thumbnail because the file is missing.')
+            return [], info
+
         if info['ext'] == 'mp3':
             options = [
                 '-c', 'copy', '-map', '0', '-map', '1',
index cc65b34e71a28cfb0947b9441d5dcc006baf47ba..fe7e0a8eea6ad81c8700f0a1d5415f7cccf1d756 100644 (file)
@@ -21,6 +21,7 @@ from ..utils import (
     shell_quote,
     subtitles_filename,
     dfxp2srt,
+    ISO639Utils,
 )
 
 
@@ -307,199 +308,6 @@ class FFmpegVideoConvertorPP(FFmpegPostProcessor):
 
 
 class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
-    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
-    _lang_map = {
-        'aa': 'aar',
-        'ab': 'abk',
-        'ae': 'ave',
-        'af': 'afr',
-        'ak': 'aka',
-        'am': 'amh',
-        'an': 'arg',
-        'ar': 'ara',
-        'as': 'asm',
-        'av': 'ava',
-        'ay': 'aym',
-        'az': 'aze',
-        'ba': 'bak',
-        'be': 'bel',
-        'bg': 'bul',
-        'bh': 'bih',
-        'bi': 'bis',
-        'bm': 'bam',
-        'bn': 'ben',
-        'bo': 'bod',
-        'br': 'bre',
-        'bs': 'bos',
-        'ca': 'cat',
-        'ce': 'che',
-        'ch': 'cha',
-        'co': 'cos',
-        'cr': 'cre',
-        'cs': 'ces',
-        'cu': 'chu',
-        'cv': 'chv',
-        'cy': 'cym',
-        'da': 'dan',
-        'de': 'deu',
-        'dv': 'div',
-        'dz': 'dzo',
-        'ee': 'ewe',
-        'el': 'ell',
-        'en': 'eng',
-        'eo': 'epo',
-        'es': 'spa',
-        'et': 'est',
-        'eu': 'eus',
-        'fa': 'fas',
-        'ff': 'ful',
-        'fi': 'fin',
-        'fj': 'fij',
-        'fo': 'fao',
-        'fr': 'fra',
-        'fy': 'fry',
-        'ga': 'gle',
-        'gd': 'gla',
-        'gl': 'glg',
-        'gn': 'grn',
-        'gu': 'guj',
-        'gv': 'glv',
-        'ha': 'hau',
-        'he': 'heb',
-        'hi': 'hin',
-        'ho': 'hmo',
-        'hr': 'hrv',
-        'ht': 'hat',
-        'hu': 'hun',
-        'hy': 'hye',
-        'hz': 'her',
-        'ia': 'ina',
-        'id': 'ind',
-        'ie': 'ile',
-        'ig': 'ibo',
-        'ii': 'iii',
-        'ik': 'ipk',
-        'io': 'ido',
-        'is': 'isl',
-        'it': 'ita',
-        'iu': 'iku',
-        'ja': 'jpn',
-        'jv': 'jav',
-        'ka': 'kat',
-        'kg': 'kon',
-        'ki': 'kik',
-        'kj': 'kua',
-        'kk': 'kaz',
-        'kl': 'kal',
-        'km': 'khm',
-        'kn': 'kan',
-        'ko': 'kor',
-        'kr': 'kau',
-        'ks': 'kas',
-        'ku': 'kur',
-        'kv': 'kom',
-        'kw': 'cor',
-        'ky': 'kir',
-        'la': 'lat',
-        'lb': 'ltz',
-        'lg': 'lug',
-        'li': 'lim',
-        'ln': 'lin',
-        'lo': 'lao',
-        'lt': 'lit',
-        'lu': 'lub',
-        'lv': 'lav',
-        'mg': 'mlg',
-        'mh': 'mah',
-        'mi': 'mri',
-        'mk': 'mkd',
-        'ml': 'mal',
-        'mn': 'mon',
-        'mr': 'mar',
-        'ms': 'msa',
-        'mt': 'mlt',
-        'my': 'mya',
-        'na': 'nau',
-        'nb': 'nob',
-        'nd': 'nde',
-        'ne': 'nep',
-        'ng': 'ndo',
-        'nl': 'nld',
-        'nn': 'nno',
-        'no': 'nor',
-        'nr': 'nbl',
-        'nv': 'nav',
-        'ny': 'nya',
-        'oc': 'oci',
-        'oj': 'oji',
-        'om': 'orm',
-        'or': 'ori',
-        'os': 'oss',
-        'pa': 'pan',
-        'pi': 'pli',
-        'pl': 'pol',
-        'ps': 'pus',
-        'pt': 'por',
-        'qu': 'que',
-        'rm': 'roh',
-        'rn': 'run',
-        'ro': 'ron',
-        'ru': 'rus',
-        'rw': 'kin',
-        'sa': 'san',
-        'sc': 'srd',
-        'sd': 'snd',
-        'se': 'sme',
-        'sg': 'sag',
-        'si': 'sin',
-        'sk': 'slk',
-        'sl': 'slv',
-        'sm': 'smo',
-        'sn': 'sna',
-        'so': 'som',
-        'sq': 'sqi',
-        'sr': 'srp',
-        'ss': 'ssw',
-        'st': 'sot',
-        'su': 'sun',
-        'sv': 'swe',
-        'sw': 'swa',
-        'ta': 'tam',
-        'te': 'tel',
-        'tg': 'tgk',
-        'th': 'tha',
-        'ti': 'tir',
-        'tk': 'tuk',
-        'tl': 'tgl',
-        'tn': 'tsn',
-        'to': 'ton',
-        'tr': 'tur',
-        'ts': 'tso',
-        'tt': 'tat',
-        'tw': 'twi',
-        'ty': 'tah',
-        'ug': 'uig',
-        'uk': 'ukr',
-        'ur': 'urd',
-        'uz': 'uzb',
-        've': 'ven',
-        'vi': 'vie',
-        'vo': 'vol',
-        'wa': 'wln',
-        'wo': 'wol',
-        'xh': 'xho',
-        'yi': 'yid',
-        'yo': 'yor',
-        'za': 'zha',
-        'zh': 'zho',
-        'zu': 'zul',
-    }
-
-    @classmethod
-    def _conver_lang_code(cls, code):
-        """Convert language code from ISO 639-1 to ISO 639-2/T"""
-        return cls._lang_map.get(code[:2])
-
     def run(self, information):
         if information['ext'] not in ['mp4', 'mkv']:
             self._downloader.to_screen('[ffmpeg] Subtitles can only be embedded in mp4 or mkv files')
@@ -525,7 +333,7 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
             opts += ['-c:s', 'mov_text']
         for (i, lang) in enumerate(sub_langs):
             opts.extend(['-map', '%d:0' % (i + 1)])
-            lang_code = self._conver_lang_code(lang)
+            lang_code = ISO639Utils.short2long(lang)
             if lang_code is not None:
                 opts.extend(['-metadata:s:s:%d' % i, 'language=%s' % lang_code])
 
index 52d198fa3c2eb36a1a3d41620cd645b90d52f854..a2746b2d1f0106ab9d9e651250a070df34af2d11 100644 (file)
@@ -1841,7 +1841,10 @@ def srt_subtitles_timecode(seconds):
 
 
 def dfxp2srt(dfxp_data):
-    _x = functools.partial(xpath_with_ns, ns_map={'ttml': 'http://www.w3.org/ns/ttml'})
+    _x = functools.partial(xpath_with_ns, ns_map={
+        'ttml': 'http://www.w3.org/ns/ttml',
+        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
+    })
 
     def parse_node(node):
         str_or_empty = functools.partial(str_or_none, default='')
@@ -1849,9 +1852,9 @@ def dfxp2srt(dfxp_data):
         out = str_or_empty(node.text)
 
         for child in node:
-            if child.tag in (_x('ttml:br'), 'br'):
+            if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                 out += '\n' + str_or_empty(child.tail)
-            elif child.tag in (_x('ttml:span'), 'span'):
+            elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
                 out += str_or_empty(parse_node(child))
             else:
                 out += str_or_empty(xml.etree.ElementTree.tostring(child))
@@ -1860,7 +1863,7 @@ def dfxp2srt(dfxp_data):
 
     dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
     out = []
-    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
+    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
 
     if not paras:
         raise ValueError('Invalid dfxp/TTML subtitle')
@@ -1879,6 +1882,208 @@ def dfxp2srt(dfxp_data):
     return ''.join(out)
 
 
+class ISO639Utils(object):
+    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
+    _lang_map = {
+        'aa': 'aar',
+        'ab': 'abk',
+        'ae': 'ave',
+        'af': 'afr',
+        'ak': 'aka',
+        'am': 'amh',
+        'an': 'arg',
+        'ar': 'ara',
+        'as': 'asm',
+        'av': 'ava',
+        'ay': 'aym',
+        'az': 'aze',
+        'ba': 'bak',
+        'be': 'bel',
+        'bg': 'bul',
+        'bh': 'bih',
+        'bi': 'bis',
+        'bm': 'bam',
+        'bn': 'ben',
+        'bo': 'bod',
+        'br': 'bre',
+        'bs': 'bos',
+        'ca': 'cat',
+        'ce': 'che',
+        'ch': 'cha',
+        'co': 'cos',
+        'cr': 'cre',
+        'cs': 'ces',
+        'cu': 'chu',
+        'cv': 'chv',
+        'cy': 'cym',
+        'da': 'dan',
+        'de': 'deu',
+        'dv': 'div',
+        'dz': 'dzo',
+        'ee': 'ewe',
+        'el': 'ell',
+        'en': 'eng',
+        'eo': 'epo',
+        'es': 'spa',
+        'et': 'est',
+        'eu': 'eus',
+        'fa': 'fas',
+        'ff': 'ful',
+        'fi': 'fin',
+        'fj': 'fij',
+        'fo': 'fao',
+        'fr': 'fra',
+        'fy': 'fry',
+        'ga': 'gle',
+        'gd': 'gla',
+        'gl': 'glg',
+        'gn': 'grn',
+        'gu': 'guj',
+        'gv': 'glv',
+        'ha': 'hau',
+        'he': 'heb',
+        'hi': 'hin',
+        'ho': 'hmo',
+        'hr': 'hrv',
+        'ht': 'hat',
+        'hu': 'hun',
+        'hy': 'hye',
+        'hz': 'her',
+        'ia': 'ina',
+        'id': 'ind',
+        'ie': 'ile',
+        'ig': 'ibo',
+        'ii': 'iii',
+        'ik': 'ipk',
+        'io': 'ido',
+        'is': 'isl',
+        'it': 'ita',
+        'iu': 'iku',
+        'ja': 'jpn',
+        'jv': 'jav',
+        'ka': 'kat',
+        'kg': 'kon',
+        'ki': 'kik',
+        'kj': 'kua',
+        'kk': 'kaz',
+        'kl': 'kal',
+        'km': 'khm',
+        'kn': 'kan',
+        'ko': 'kor',
+        'kr': 'kau',
+        'ks': 'kas',
+        'ku': 'kur',
+        'kv': 'kom',
+        'kw': 'cor',
+        'ky': 'kir',
+        'la': 'lat',
+        'lb': 'ltz',
+        'lg': 'lug',
+        'li': 'lim',
+        'ln': 'lin',
+        'lo': 'lao',
+        'lt': 'lit',
+        'lu': 'lub',
+        'lv': 'lav',
+        'mg': 'mlg',
+        'mh': 'mah',
+        'mi': 'mri',
+        'mk': 'mkd',
+        'ml': 'mal',
+        'mn': 'mon',
+        'mr': 'mar',
+        'ms': 'msa',
+        'mt': 'mlt',
+        'my': 'mya',
+        'na': 'nau',
+        'nb': 'nob',
+        'nd': 'nde',
+        'ne': 'nep',
+        'ng': 'ndo',
+        'nl': 'nld',
+        'nn': 'nno',
+        'no': 'nor',
+        'nr': 'nbl',
+        'nv': 'nav',
+        'ny': 'nya',
+        'oc': 'oci',
+        'oj': 'oji',
+        'om': 'orm',
+        'or': 'ori',
+        'os': 'oss',
+        'pa': 'pan',
+        'pi': 'pli',
+        'pl': 'pol',
+        'ps': 'pus',
+        'pt': 'por',
+        'qu': 'que',
+        'rm': 'roh',
+        'rn': 'run',
+        'ro': 'ron',
+        'ru': 'rus',
+        'rw': 'kin',
+        'sa': 'san',
+        'sc': 'srd',
+        'sd': 'snd',
+        'se': 'sme',
+        'sg': 'sag',
+        'si': 'sin',
+        'sk': 'slk',
+        'sl': 'slv',
+        'sm': 'smo',
+        'sn': 'sna',
+        'so': 'som',
+        'sq': 'sqi',
+        'sr': 'srp',
+        'ss': 'ssw',
+        'st': 'sot',
+        'su': 'sun',
+        'sv': 'swe',
+        'sw': 'swa',
+        'ta': 'tam',
+        'te': 'tel',
+        'tg': 'tgk',
+        'th': 'tha',
+        'ti': 'tir',
+        'tk': 'tuk',
+        'tl': 'tgl',
+        'tn': 'tsn',
+        'to': 'ton',
+        'tr': 'tur',
+        'ts': 'tso',
+        'tt': 'tat',
+        'tw': 'twi',
+        'ty': 'tah',
+        'ug': 'uig',
+        'uk': 'ukr',
+        'ur': 'urd',
+        'uz': 'uzb',
+        've': 'ven',
+        'vi': 'vie',
+        'vo': 'vol',
+        'wa': 'wln',
+        'wo': 'wol',
+        'xh': 'xho',
+        'yi': 'yid',
+        'yo': 'yor',
+        'za': 'zha',
+        'zh': 'zho',
+        'zu': 'zul',
+    }
+
+    @classmethod
+    def short2long(cls, code):
+        """Convert language code from ISO 639-1 to ISO 639-2/T"""
+        return cls._lang_map.get(code[:2])
+
+    @classmethod
+    def long2short(cls, code):
+        """Convert language code from ISO 639-2/T to ISO 639-1"""
+        for short_name, long_name in cls._lang_map.items():
+            if long_name == code:
+                return short_name
+
+
 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
     def __init__(self, proxies=None):
         # Set default handlers
index 9cf84ff712da103b9c3ce0b3e2f0c9386082e574..34a13cb815d2c11f85f33f725d2f9bcac2a11865 100644 (file)
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '2015.06.04.1'
+__version__ = '2015.06.15'