Merge remote-tracking branch 'michael-k/links'
author Philipp Hagemeister <phihag@phihag.de>
Wed, 12 Nov 2014 14:31:54 +0000 (15:31 +0100)
committer Philipp Hagemeister <phihag@phihag.de>
Wed, 12 Nov 2014 14:31:54 +0000 (15:31 +0100)
43 files changed:
README.md
test/helper.py
test/test_compat.py
test/test_download.py
test/test_utils.py
test/test_youtube_signature.py
youtube_dl/YoutubeDL.py
youtube_dl/extractor/__init__.py
youtube_dl/extractor/abc.py
youtube_dl/extractor/allocine.py
youtube_dl/extractor/bandcamp.py
youtube_dl/extractor/brightcove.py
youtube_dl/extractor/byutv.py
youtube_dl/extractor/channel9.py
youtube_dl/extractor/cinemassacre.py
youtube_dl/extractor/cnn.py
youtube_dl/extractor/common.py
youtube_dl/extractor/gamespot.py
youtube_dl/extractor/generic.py
youtube_dl/extractor/goshgay.py
youtube_dl/extractor/grooveshark.py
youtube_dl/extractor/heise.py
youtube_dl/extractor/imdb.py
youtube_dl/extractor/izlesene.py
youtube_dl/extractor/mtv.py
youtube_dl/extractor/myspass.py
youtube_dl/extractor/ndr.py
youtube_dl/extractor/niconico.py
youtube_dl/extractor/played.py
youtube_dl/extractor/streamcloud.py
youtube_dl/extractor/swrmediathek.py
youtube_dl/extractor/tapely.py
youtube_dl/extractor/ted.py
youtube_dl/extractor/trutube.py
youtube_dl/extractor/ustream.py
youtube_dl/extractor/vice.py [new file with mode: 0644]
youtube_dl/extractor/wimp.py
youtube_dl/extractor/wrzuta.py
youtube_dl/extractor/youjizz.py
youtube_dl/extractor/youtube.py
youtube_dl/options.py
youtube_dl/utils.py
youtube_dl/version.py

index 9784e9c2a23bfe3409a817b6e8f170ea1594d5f4..c284db4955130065504aabe046ac4c0526917ba4 100644 (file)
--- a/README.md
+++ b/README.md
@@ -131,17 +131,19 @@ which means you can modify it, redistribute it or use it however you like.
                                      %(upload_date)s for the upload date
                                      (YYYYMMDD), %(extractor)s for the provider
                                      (youtube, metacafe, etc), %(id)s for the
-                                     video id, %(playlist)s for the playlist the
+                                     video id, %(playlist_title)s,
+                                     %(playlist_id)s, or %(playlist)s (=title if
+                                     present, ID otherwise) for the playlist the
                                      video is in, %(playlist_index)s for the
-                                     position in the playlist and %% for a
-                                     literal percent. %(height)s and %(width)s
-                                     for the width and height of the video
-                                     format. %(resolution)s for a textual
+                                     position in the playlist. %(height)s and
+                                     %(width)s for the height and width of the
+                                     video format. %(resolution)s for a textual
                                      description of the resolution of the video
-                                     format. Use - to output to stdout. Can also
-                                     be used to download to a different
-                                     directory, for example with -o '/my/downloa
-                                     ds/%(uploader)s/%(title)s-%(id)s.%(ext)s' .
+                                     format. %% for a literal percent. Use - to
+                                     output to stdout. Can also be used to
+                                     download to a different directory, for
+                                     example with -o '/my/downloads/%(uploader)s
+                                     /%(title)s-%(id)s.%(ext)s' .
     --autonumber-size NUMBER         Specifies the number of digits in
                                      %(autonumber)s when it is present in output
                                      filename template or --auto-number option
@@ -239,8 +241,13 @@ which means you can modify it, redistribute it or use it however you like.
                                      "worst", "worstvideo" and "worstaudio". By
                                      default, youtube-dl will pick the best
                                      quality. Use commas to download multiple
-                                     audio formats, such as  -f
-                                     136/137/mp4/bestvideo,140/m4a/bestaudio
+                                     audio formats, such as -f
+                                     136/137/mp4/bestvideo,140/m4a/bestaudio.
+                                     You can merge the video and audio of two
+                                     formats into a single file using -f <video-
+                                     format>+<audio-format> (requires ffmpeg or
+                                     avconv), for example -f
+                                     bestvideo+bestaudio.
     --all-formats                    download all available video formats
     --prefer-free-formats            prefer free video formats unless a specific
                                      one is requested
index fb8618120dcbbcdcfe6d576442e5bea189cb2521..325f72f0a35c5c4fe6ac3a1464858b200c152aad 100644 (file)
@@ -57,7 +57,7 @@ class FakeYDL(YoutubeDL):
         # Different instances of the downloader can't share the same dictionary
         # some test set the "sublang" parameter, which would break the md5 checks.
         params = get_params(override=override)
-        super(FakeYDL, self).__init__(params)
+        super(FakeYDL, self).__init__(params, auto_init=False)
         self.result = []
         
     def to_screen(self, s, skip_eol=None):
index d1a33ddc9f2b92a75f0d33671b9652a0813d8f18..4a7fc3606295a4a01b6a67c4e0d22c83240d4342 100644 (file)
@@ -37,7 +37,7 @@ class TestCompat(unittest.TestCase):
         all_names = youtube_dl.compat.__all__
         present_names = set(filter(
             lambda c: '_' in c and not c.startswith('_'),
-            dir(youtube_dl.compat)))
+            dir(youtube_dl.compat))) - set(['unicode_literals'])
         self.assertEqual(all_names, sorted(present_names))
 
 if __name__ == '__main__':
index 88ce29871d87700fa183cf66d918139e83e76fc9..87aced97cfe6e25750183e2fd94d4c2392703d9b 100644 (file)
@@ -96,7 +96,7 @@ def generator(test_case):
             params.setdefault('extract_flat', True)
             params.setdefault('skip_download', True)
 
-        ydl = YoutubeDL(params)
+        ydl = YoutubeDL(params, auto_init=False)
         ydl.add_default_info_extractors()
         finished_hook_called = set()
         def _hook(status):
index 338701f4c33679938dc8f943587aff8a59e1ad46..e59547784578e8ed7eef6b88c571629c44ccec43 100644 (file)
@@ -20,7 +20,6 @@ from youtube_dl.utils import (
     encodeFilename,
     find_xpath_attr,
     fix_xml_ampersands,
-    get_meta_content,
     orderedSet,
     OnDemandPagedList,
     InAdvancePagedList,
@@ -155,17 +154,6 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(find_xpath_attr(doc, './/node', 'x', 'a'), doc[1])
         self.assertEqual(find_xpath_attr(doc, './/node', 'y', 'c'), doc[2])
 
-    def test_meta_parser(self):
-        testhtml = '''
-        <head>
-            <meta name="description" content="foo &amp; bar">
-            <meta content='Plato' name='author'/>
-        </head>
-        '''
-        get_meta = lambda name: get_meta_content(name, testhtml)
-        self.assertEqual(get_meta('description'), 'foo & bar')
-        self.assertEqual(get_meta('author'), 'Plato')
-
     def test_xpath_with_ns(self):
         testxml = '''<root xmlns:media="http://example.com/">
             <media:song>
index df2cb09f2a87dcacbb97de9193265f9bf1e852af..13d228cd85e9e260942635a36652c4cdc010dc8d 100644 (file)
@@ -14,7 +14,7 @@ import re
 import string
 
 from youtube_dl.extractor import YoutubeIE
-from youtube_dl.utils import compat_str, compat_urlretrieve
+from youtube_dl.compat import compat_str, compat_urlretrieve
 
 _TESTS = [
     (
index 8732f3db4e3a7c0eb91182954f0c3a7801dc681c..5c875b4972f61e2ce925c40d13a94fa53fb059d3 100755 (executable)
@@ -658,6 +658,8 @@ class YoutubeDL(object):
                 extra = {
                     'n_entries': n_entries,
                     'playlist': playlist,
+                    'playlist_id': ie_result.get('id'),
+                    'playlist_title': ie_result.get('title'),
                     'playlist_index': i + playliststart,
                     'extractor': ie_result['extractor'],
                     'webpage_url': ie_result['webpage_url'],
@@ -836,6 +838,13 @@ class YoutubeDL(object):
                         formats_info = (self.select_format(format_1, formats),
                             self.select_format(format_2, formats))
                         if all(formats_info):
+                            # The first format must contain the video and the
+                            # second the audio
+                            if formats_info[0].get('vcodec') == 'none':
+                                self.report_error('The first format must '
+                                    'contain the video, try using '
+                                    '"-f %s+%s"' % (format_2, format_1))
+                                return
                             selected_format = {
                                 'requested_formats': formats_info,
                                 'format': rf,
@@ -1297,11 +1306,13 @@ class YoutubeDL(object):
             self.report_warning(
                 'Your Python is broken! Update to a newer and supported version')
 
+        stdout_encoding = getattr(
+            sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
         encoding_str = (
             '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
                 locale.getpreferredencoding(),
                 sys.getfilesystemencoding(),
-                sys.stdout.encoding,
+                stdout_encoding,
                 self.get_encoding()))
         write_string(encoding_str, encoding=None)
 
index 3f85c99cdd1a0795327eaa5b52191637912a9225..3c1807f1583c78322b5f5600d3ba4f4b71a1a752 100644 (file)
@@ -421,6 +421,7 @@ from .vesti import VestiIE
 from .vevo import VevoIE
 from .vgtv import VGTVIE
 from .vh1 import VH1IE
+from .vice import ViceIE
 from .viddler import ViddlerIE
 from .videobam import VideoBamIE
 from .videodetective import VideoDetectiveIE
index 69f89320ce7e30a5fdf65acff01df1668620334f..dc0fb85d6048962505d1d207ae590940d69f52e6 100644 (file)
@@ -11,13 +11,13 @@ class ABCIE(InfoExtractor):
     _VALID_URL = r'http://www\.abc\.net\.au/news/[^/]+/[^/]+/(?P<id>\d+)'
 
     _TEST = {
-        'url': 'http://www.abc.net.au/news/2014-07-25/bringing-asylum-seekers-to-australia-would-give/5624716',
-        'md5': 'dad6f8ad011a70d9ddf887ce6d5d0742',
+        'url': 'http://www.abc.net.au/news/2014-11-05/australia-to-staff-ebola-treatment-centre-in-sierra-leone/5868334',
+        'md5': 'cb3dd03b18455a661071ee1e28344d9f',
         'info_dict': {
-            'id': '5624716',
+            'id': '5868334',
             'ext': 'mp4',
-            'title': 'Bringing asylum seekers to Australia would give them right to asylum claims: professor',
-            'description': 'md5:ba36fa5e27e5c9251fd929d339aea4af',
+            'title': 'Australia to help staff Ebola treatment centre in Sierra Leone',
+            'description': 'md5:809ad29c67a05f54eb41f2a105693a67',
         },
     }
 
index 7bd7978841d06747145feeda56624de84747fcc1..398e93bfb4f8472a23d2b8669e7f83f867933244 100644 (file)
@@ -22,7 +22,7 @@ class AllocineIE(InfoExtractor):
             'id': '19546517',
             'ext': 'mp4',
             'title': 'Astérix - Le Domaine des Dieux Teaser VF',
-            'description': 'md5:4a754271d9c6f16c72629a8a993ee884',
+            'description': 'md5:abcd09ce503c6560512c14ebfdb720d2',
             'thumbnail': 're:http://.*\.jpg',
         },
     }, {
index c13446665d2fb0e202973a26f5c7499c325719d1..1b8da43cae83253531e6f542af4ad0b22e588502 100644 (file)
@@ -110,20 +110,25 @@ class BandcampAlbumIE(InfoExtractor):
         'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
         'playlist': [
             {
-                'file': '1353101989.mp3',
                 'md5': '39bc1eded3476e927c724321ddf116cf',
                 'info_dict': {
+                    'id': '1353101989',
+                    'ext': 'mp3',
                     'title': 'Intro',
                 }
             },
             {
-                'file': '38097443.mp3',
                 'md5': '1a2c32e2691474643e912cc6cd4bffaa',
                 'info_dict': {
+                    'id': '38097443',
+                    'ext': 'mp3',
                     'title': 'Kero One - Keep It Alive (Blazo remix)',
                 }
             },
         ],
+        'info_dict': {
+            'title': 'Jazz Format Mixtape vol.1',
+        },
         'params': {
             'playlistend': 2
         },
index ad22cbafd6a541599c289cfd7dcb1b27ec7a8e95..a6920685e1ff93aeaecb0e551215a8343a50e8b0 100644 (file)
@@ -14,6 +14,7 @@ from ..utils import (
     compat_str,
     compat_urllib_request,
     compat_parse_qs,
+    compat_urllib_parse_urlparse,
 
     determine_ext,
     ExtractorError,
@@ -23,7 +24,7 @@ from ..utils import (
 
 
 class BrightcoveIE(InfoExtractor):
-    _VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*\?(?P<query>.*)'
+    _VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*?\?(?P<query>.*)'
     _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s'
 
     _TESTS = [
@@ -260,11 +261,19 @@ class BrightcoveIE(InfoExtractor):
             formats = []
             for rend in renditions:
                 url = rend['defaultURL']
+                if not url:
+                    continue
                 if rend['remote']:
-                    # This type of renditions are served through akamaihd.net,
-                    # but they don't use f4m manifests
-                    url = url.replace('control/', '') + '?&v=3.3.0&fp=13&r=FEEFJ&g=RTSJIMBMPFPB'
-                    ext = 'flv'
+                    url_comp = compat_urllib_parse_urlparse(url)
+                    if url_comp.path.endswith('.m3u8'):
+                        formats.extend(
+                            self._extract_m3u8_formats(url, info['id'], 'mp4'))
+                        continue
+                    elif 'akamaihd.net' in url_comp.netloc:
+                        # Renditions of this type are served through
+                        # akamaihd.net, but they don't use f4m manifests
+                        url = url.replace('control/', '') + '?&v=3.3.0&fp=13&r=FEEFJ&g=RTSJIMBMPFPB'
+                        ext = 'flv'
                 else:
                     ext = determine_ext(url)
                 size = rend.get('size')
index cf19b7b0cf952c3b14d9ef5b91f541332d3e5e69..6252be05b7f4b57787152b4edae5378675a96847 100644 (file)
@@ -10,12 +10,12 @@ from ..utils import ExtractorError
 class BYUtvIE(InfoExtractor):
     _VALID_URL = r'^https?://(?:www\.)?byutv.org/watch/[0-9a-f-]+/(?P<video_id>[^/?#]+)'
     _TEST = {
-        'url': 'http://www.byutv.org/watch/44e80f7b-e3ba-43ba-8c51-b1fd96c94a79/granite-flats-talking',
+        'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d/studio-c-season-5-episode-5',
         'info_dict': {
-            'id': 'granite-flats-talking',
+            'id': 'studio-c-season-5-episode-5',
             'ext': 'mp4',
-            'description': 'md5:4e9a7ce60f209a33eca0ac65b4918e1c',
-            'title': 'Talking',
+            'description': 'md5:5438d33774b6bdc662f9485a340401cc',
+            'title': 'Season 5 Episode 5',
             'thumbnail': 're:^https?://.*promo.*'
         },
         'params': {
index 4f000292b7c4273c40df11252852986df08f5e01..16d800512a092e3d9d064cf2ac7800a830af64ec 100644 (file)
@@ -27,7 +27,7 @@ class Channel9IE(InfoExtractor):
                 'title': 'Developer Kick-Off Session: Stuff We Love',
                 'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
                 'duration': 4576,
-                'thumbnail': 'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
+                'thumbnail': 'http://video.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
                 'session_code': 'KOS002',
                 'session_day': 'Day 1',
                 'session_room': 'Arena 1A',
@@ -43,7 +43,7 @@ class Channel9IE(InfoExtractor):
                 'title': 'Self-service BI with Power BI - nuclear testing',
                 'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
                 'duration': 1540,
-                'thumbnail': 'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
+                'thumbnail': 'http://video.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
                 'authors': [ 'Mike Wilmot' ],
             },
         }
@@ -94,7 +94,7 @@ class Channel9IE(InfoExtractor):
 
     def _extract_title(self, html):
         title = self._html_search_meta('title', html, 'title')
-        if title is None:           
+        if title is None:
             title = self._og_search_title(html)
             TITLE_SUFFIX = ' (Channel 9)'
             if title is not None and title.endswith(TITLE_SUFFIX):
@@ -115,7 +115,7 @@ class Channel9IE(InfoExtractor):
         return self._html_search_meta('description', html, 'description')
 
     def _extract_duration(self, html):
-        m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
+        m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
         return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None
 
     def _extract_slides(self, html):
@@ -167,7 +167,7 @@ class Channel9IE(InfoExtractor):
         return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)
 
     def _extract_content(self, html, content_path):
-        # Look for downloadable content        
+        # Look for downloadable content
         formats = self._formats_from_html(html)
         slides = self._extract_slides(html)
         zip_ = self._extract_zip(html)
@@ -258,16 +258,17 @@ class Channel9IE(InfoExtractor):
 
         webpage = self._download_webpage(url, content_path, 'Downloading web page')
 
-        page_type_m = re.search(r'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage)
-        if page_type_m is None:
-            raise ExtractorError('Search.PageType not found, don\'t know how to process this page', expected=True)
-
-        page_type = page_type_m.group('pagetype')
-        if page_type == 'List':         # List page, may contain list of 'item'-like objects
+        page_type_m = re.search(r'<meta name="WT.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage)
+        if page_type_m is not None:
+            page_type = page_type_m.group('pagetype')
+            if page_type == 'Entry':      # Any 'item'-like page, may contain downloadable content
+                return self._extract_entry_item(webpage, content_path)
+            elif page_type == 'Session':  # Event session page, may contain downloadable content
+                return self._extract_session(webpage, content_path)
+            elif page_type == 'Event':
+                return self._extract_list(content_path)
+            else:
+                raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)
+
+        else: # Assuming list
             return self._extract_list(content_path)
-        elif page_type == 'Entry.Item': # Any 'item'-like page, may contain downloadable content
-            return self._extract_entry_item(webpage, content_path)
-        elif page_type == 'Session':    # Event session page, may contain downloadable content
-            return self._extract_session(webpage, content_path)
-        else:
-            raise ExtractorError('Unexpected Search.PageType %s' % page_type, expected=True)
\ No newline at end of file
index 806e378b1b6da0eebe01bdd3992d53814090bf59..31fe906b448669ae94c185546a5402314b975224 100644 (file)
@@ -42,11 +42,12 @@ class CinemassacreIE(InfoExtractor):
 
         webpage = self._download_webpage(url, display_id)
         video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d')
-        mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage)
+        mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=(?P<full_video_id>(?:Cinemassacre-)?(?P<video_id>.+?)))"', webpage)
         if not mobj:
             raise ExtractorError('Can\'t extract embed url and video id')
         playerdata_url = mobj.group('embed_url')
         video_id = mobj.group('video_id')
+        full_video_id = mobj.group('full_video_id')
 
         video_title = self._html_search_regex(
             r'<title>(?P<title>.+?)\|', webpage, 'title')
@@ -60,37 +61,52 @@ class CinemassacreIE(InfoExtractor):
         vidurl = self._search_regex(
             r'\'vidurl\'\s*:\s*"([^\']+)"', playerdata, 'vidurl').replace('\\/', '/')
 
-        videolist_url = self._search_regex(
-            r"file\s*:\s*'(http.+?/jwplayer\.smil)'", playerdata, 'jwplayer.smil')
-        videolist = self._download_xml(videolist_url, video_id, 'Downloading videolist XML')
+        videolist_url = None
 
-        formats = []
-        baseurl = vidurl[:vidurl.rfind('/')+1]
-        for video in videolist.findall('.//video'):
-            src = video.get('src')
-            if not src:
-                continue
-            file_ = src.partition(':')[-1]
-            width = int_or_none(video.get('width'))
-            height = int_or_none(video.get('height'))
-            bitrate = int_or_none(video.get('system-bitrate'))
-            format = {
-                'url': baseurl + file_,
-                'format_id': src.rpartition('.')[0].rpartition('_')[-1],
-            }
-            if width or height:
-                format.update({
-                    'tbr': bitrate // 1000 if bitrate else None,
-                    'width': width,
-                    'height': height,
-                })
-            else:
-                format.update({
-                    'abr': bitrate // 1000 if bitrate else None,
-                    'vcodec': 'none',
-                })
-            formats.append(format)
-        self._sort_formats(formats)
+        mobj = re.search(r"'videoserver'\s*:\s*'(?P<videoserver>[^']+)'", playerdata)
+        if mobj:
+            videoserver = mobj.group('videoserver')
+            mobj = re.search(r'\'vidid\'\s*:\s*"(?P<vidid>[^\']+)"', playerdata)
+            vidid = mobj.group('vidid') if mobj else full_video_id
+            videolist_url = 'http://%s/vod/smil:%s.smil/jwplayer.smil' % (videoserver, vidid)
+        else:
+            mobj = re.search(r"file\s*:\s*'(?P<smil>http.+?/jwplayer\.smil)'", playerdata)
+            if mobj:
+                videolist_url = mobj.group('smil')
+
+        if videolist_url:
+            videolist = self._download_xml(videolist_url, video_id, 'Downloading videolist XML')
+            formats = []
+            baseurl = vidurl[:vidurl.rfind('/')+1]
+            for video in videolist.findall('.//video'):
+                src = video.get('src')
+                if not src:
+                    continue
+                file_ = src.partition(':')[-1]
+                width = int_or_none(video.get('width'))
+                height = int_or_none(video.get('height'))
+                bitrate = int_or_none(video.get('system-bitrate'))
+                format = {
+                    'url': baseurl + file_,
+                    'format_id': src.rpartition('.')[0].rpartition('_')[-1],
+                }
+                if width or height:
+                    format.update({
+                        'tbr': bitrate // 1000 if bitrate else None,
+                        'width': width,
+                        'height': height,
+                    })
+                else:
+                    format.update({
+                        'abr': bitrate // 1000 if bitrate else None,
+                        'vcodec': 'none',
+                    })
+                formats.append(format)
+            self._sort_formats(formats)
+        else:
+            formats = [{
+                'url': vidurl,
+            }]
 
         return {
             'id': video_id,
index 78877b1cf1ee5bbf2dce05c28762e066b48a0178..3826ce7e1ee29bde4d640cb62bf2c127a83eb7bc 100644 (file)
@@ -16,9 +16,10 @@ class CNNIE(InfoExtractor):
 
     _TESTS = [{
         'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn',
-        'file': 'sports_2013_06_09_nadal-1-on-1.cnn.mp4',
         'md5': '3e6121ea48df7e2259fe73a0628605c4',
         'info_dict': {
+            'id': 'sports_2013_06_09_nadal-1-on-1.cnn',
+            'ext': 'mp4',
             'title': 'Nadal wins 8th French Open title',
             'description': 'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.',
             'duration': 135,
@@ -27,9 +28,10 @@ class CNNIE(InfoExtractor):
     },
     {
         "url": "http://edition.cnn.com/video/?/video/us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+Top+Stories%29",
-        "file": "us_2013_08_21_sot-student-gives-epic-speech.georgia-institute-of-technology.mp4",
         "md5": "b5cc60c60a3477d185af8f19a2a26f4e",
         "info_dict": {
+            'id': 'us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology',
+            'ext': 'mp4',
             "title": "Student's epic speech stuns new freshmen",
             "description": "A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from \"2001: A Space Odyssey.\"",
             "upload_date": "20130821",
index 9e1d62c2b908f0c8ac826a5d16fba10c4645b8b2..b77f0e51904d1539a43c056dafbc82a587041744 100644 (file)
@@ -404,7 +404,7 @@ class InfoExtractor(object):
             video_info['title'] = playlist_title
         return video_info
 
-    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
+    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
         """
         Perform a regex search on the given string, using a single or a list of
         patterns returning the first matching group.
@@ -425,8 +425,11 @@ class InfoExtractor(object):
             _name = name
 
         if mobj:
-            # return the first matching group
-            return next(g for g in mobj.groups() if g is not None)
+            if group is None:
+                # return the first matching group
+                return next(g for g in mobj.groups() if g is not None)
+            else:
+                return mobj.group(group)
         elif default is not _NO_DEFAULT:
             return default
         elif fatal:
@@ -436,11 +439,11 @@ class InfoExtractor(object):
                 'please report this issue on http://yt-dl.org/bug' % _name)
             return None
 
-    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
+    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
         """
         Like _search_regex, but strips HTML tags and unescapes entities.
         """
-        res = self._search_regex(pattern, string, name, default, fatal, flags)
+        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
         if res:
             return clean_html(res).strip()
         else:
@@ -534,9 +537,9 @@ class InfoExtractor(object):
             display_name = name
         return self._html_search_regex(
             r'''(?ix)<meta
-                    (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
-                    [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
-            html, display_name, fatal=fatal, **kwargs)
+                    (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
+                    [^>]+content=(["\'])(?P<content>.*?)\1''' % re.escape(name),
+            html, display_name, fatal=fatal, group='content', **kwargs)
 
     def _dc_search_uploader(self, html):
         return self._html_search_meta('dc.creator', html, 'uploader')
index 3d67b9d60242760ff3e32c9fbbbcab39542f01da..d570e3f6a85ca399d81328e3afedee4f98158e5f 100644 (file)
@@ -8,12 +8,11 @@ from ..utils import (
     compat_urllib_parse,
     compat_urlparse,
     unescapeHTML,
-    get_meta_content,
 )
 
 
 class GameSpotIE(InfoExtractor):
-    _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P<page_id>\d+)/?'
+    _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?'
     _TEST = {
         'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/',
         'md5': 'b2a30deaa8654fcccd43713a6b6a4825',
@@ -26,10 +25,10 @@ class GameSpotIE(InfoExtractor):
     }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        page_id = mobj.group('page_id')
+        page_id = self._match_id(url)
         webpage = self._download_webpage(url, page_id)
-        data_video_json = self._search_regex(r'data-video=["\'](.*?)["\']', webpage, 'data video')
+        data_video_json = self._search_regex(
+            r'data-video=["\'](.*?)["\']', webpage, 'data video')
         data_video = json.loads(unescapeHTML(data_video_json))
 
         # Transform the manifest url to a link to the mp4 files
@@ -41,7 +40,8 @@ class GameSpotIE(InfoExtractor):
         http_path = f4m_path[1:].split('/', 1)[1]
         http_template = re.sub(QUALITIES_RE, r'%s', http_path)
         http_template = http_template.replace('.csmil/manifest.f4m', '')
-        http_template = compat_urlparse.urljoin('http://video.gamespotcdn.com/', http_template)
+        http_template = compat_urlparse.urljoin(
+            'http://video.gamespotcdn.com/', http_template)
         formats = []
         for q in qualities:
             formats.append({
@@ -52,8 +52,9 @@ class GameSpotIE(InfoExtractor):
 
         return {
             'id': data_video['guid'],
+            'display_id': page_id,
             'title': compat_urllib_parse.unquote(data_video['title']),
             'formats': formats,
-            'description': get_meta_content('description', webpage),
+            'description': self._html_search_meta('description', webpage),
             'thumbnail': self._og_search_thumbnail(webpage),
         }
index babd581eaadf7d2e0dce3e623aad7f7ed748e119..01d6a57f82aedbf8e350ce753872676b559d24ba 100644 (file)
@@ -100,6 +100,22 @@ class GenericIE(InfoExtractor):
                 'uploader': 'Championat',
             },
         },
+        {
+            # https://github.com/rg3/youtube-dl/issues/3541
+            'add_ie': ['Brightcove'],
+            'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
+            'info_dict': {
+                'id': '3866516442001',
+                'ext': 'mp4',
+                'title': 'Leer mij vrouwen kennen: Aflevering 1',
+                'description': 'Leer mij vrouwen kennen: Aflevering 1',
+                'uploader': 'SBS Broadcasting',
+            },
+            'skip': 'Restricted to Netherlands',
+            'params': {
+                'skip_download': True,  # m3u8 download
+            },
+        },
         # Direct link to a video
         {
             'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
@@ -560,6 +576,7 @@ class GenericIE(InfoExtractor):
             return {
                 'id': video_id,
                 'title': os.path.splitext(url_basename(url))[0],
+                'direct': True,
                 'formats': [{
                     'format_id': m.group('format_id'),
                     'url': url,
index 7bca21ad0fe81c71444bfe64fb23ea3ad62c6c5b..18474cbb72684ac6327957fb3385fb9de3bc4480 100644 (file)
@@ -1,15 +1,11 @@
 # -*- coding: utf-8 -*-
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
 from ..utils import (
     compat_urlparse,
-    str_to_int,
     ExtractorError,
 )
-import json
 
 
 class GoshgayIE(InfoExtractor):
@@ -27,36 +23,27 @@ class GoshgayIE(InfoExtractor):
     }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = self._match_id(url)
 
         webpage = self._download_webpage(url, video_id)
-        title = self._search_regex(r'class="video-title"><h1>(.+?)<', webpage, 'title')
+        title = self._og_search_title(webpage)
+        thumbnail = self._og_search_thumbnail(webpage)
+        family_friendly = self._html_search_meta(
+            'isFamilyFriendly', webpage, default='false')
+        config_url = self._search_regex(
+            r"'config'\s*:\s*'([^']+)'", webpage, 'config URL')
 
-        player_config = self._search_regex(
-            r'(?s)jwplayer\("player"\)\.setup\(({.+?})\)', webpage, 'config settings')
-        player_vars = json.loads(player_config.replace("'", '"'))
-        width = str_to_int(player_vars.get('width'))
-        height = str_to_int(player_vars.get('height'))
-        config_uri = player_vars.get('config')
+        config = self._download_xml(
+            config_url, video_id, 'Downloading player config XML')
 
-        if config_uri is None:
-            raise ExtractorError('Missing config URI')
-        node = self._download_xml(config_uri, video_id, 'Downloading player config XML',
-                                  errnote='Unable to download XML')
-        if node is None:
+        if config is None:
             raise ExtractorError('Missing config XML')
-        if node.tag != 'config':
+        if config.tag != 'config':
             raise ExtractorError('Missing config attribute')
-        fns = node.findall('file')
-        imgs = node.findall('image')
-        if len(fns) != 1:
+        fns = config.findall('file')
+        if len(fns) < 1:
             raise ExtractorError('Missing media URI')
         video_url = fns[0].text
-        if len(imgs) < 1:
-            thumbnail = None
-        else:
-            thumbnail = imgs[0].text
 
         url_comp = compat_urlparse.urlparse(url)
         ref = "%s://%s%s" % (url_comp[0], url_comp[1], url_comp[2])
@@ -65,9 +52,7 @@ class GoshgayIE(InfoExtractor):
             'id': video_id,
             'url': video_url,
             'title': title,
-            'width': width,
-            'height': height,
             'thumbnail': thumbnail,
             'http_referer': ref,
-            'age_limit': 18,
+            'age_limit': 0 if family_friendly == 'true' else 18,
         }
index 726adff773305844a5e8950b4467509646b79d1d..fff74a70a891fc163ff488408e4df348564b8a29 100644 (file)
@@ -8,12 +8,13 @@ import re
 
 
 from .common import InfoExtractor
-from ..utils import ExtractorError, compat_urllib_request, compat_html_parser
-
-from ..utils import (
+from ..compat import (
+    compat_html_parser,
     compat_urllib_parse,
+    compat_urllib_request,
     compat_urlparse,
 )
+from ..utils import ExtractorError
 
 
 class GroovesharkHtmlParser(compat_html_parser.HTMLParser):
index d41c0413f015e262237f66161d4021c32f56ff0e..278d9f527fd41c8e1e2c180a9ae455a23fbef1fc 100644 (file)
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
 
 from .common import InfoExtractor
 from ..utils import (
-    get_meta_content,
+    determine_ext,
     int_or_none,
     parse_iso8601,
 )
@@ -25,11 +25,11 @@ class HeiseIE(InfoExtractor):
             'title': (
                 "Podcast: c't uplink 3.3 – Owncloud / Tastaturen / Peilsender Smartphone"
             ),
-            'format_id': 'mp4_720',
+            'format_id': 'mp4_720p',
             'timestamp': 1411812600,
             'upload_date': '20140927',
             'description': 'In uplink-Episode 3.3 geht es darum, wie man sich von Cloud-Anbietern emanzipieren kann, worauf man beim Kauf einer Tastatur achten sollte und was Smartphones über uns verraten.',
-            'thumbnail': 're:https?://.*\.jpg$',
+            'thumbnail': 're:^https?://.*\.jpe?g$',
         }
     }
 
@@ -49,11 +49,12 @@ class HeiseIE(InfoExtractor):
         info = {
             'id': video_id,
             'thumbnail': self._og_search_thumbnail(webpage),
-            'timestamp': parse_iso8601(get_meta_content('date', webpage)),
+            'timestamp': parse_iso8601(
+                self._html_search_meta('date', webpage)),
             'description': self._og_search_description(webpage),
         }
 
-        title = get_meta_content('fulltitle', webpage)
+        title = self._html_search_meta('fulltitle', webpage)
         if title:
             info['title'] = title
         else:
@@ -64,9 +65,12 @@ class HeiseIE(InfoExtractor):
             label = source_node.attrib['label']
             height = int_or_none(self._search_regex(
                 r'^(.*?_)?([0-9]+)p$', label, 'height', default=None))
+            video_url = source_node.attrib['file']
+            ext = determine_ext(video_url, '')
             formats.append({
-                'url': source_node.attrib['file'],
+                'url': video_url,
                 'format_note': label,
+                'format_id': '%s_%s' % (ext, label),
                 'height': height,
             })
         self._sort_formats(formats)
index 4536db3bfca1e1244e70089bea30de9687d923f0..6108ed5526ba37f2e5b8d14b604974ca3c3f8273 100644 (file)
@@ -6,7 +6,6 @@ import json
 from .common import InfoExtractor
 from ..utils import (
     compat_urlparse,
-    get_element_by_attribute,
 )
 
 
@@ -27,10 +26,11 @@ class ImdbIE(InfoExtractor):
     }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = self._match_id(url)
         webpage = self._download_webpage('http://www.imdb.com/video/imdb/vi%s' % video_id, video_id)
-        descr = get_element_by_attribute('itemprop', 'description', webpage)
+        descr = self._html_search_regex(
+            r'(?s)<span itemprop="description">(.*?)</span>',
+            webpage, 'description', fatal=False)
         available_formats = re.findall(
             r'case \'(?P<f_id>.*?)\' :$\s+url = \'(?P<path>.*?)\'', webpage,
             flags=re.MULTILINE)
@@ -73,9 +73,7 @@ class ImdbListIE(InfoExtractor):
     }
     
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        list_id = mobj.group('id')
-
+        list_id = self._match_id(url)
         webpage = self._download_webpage(url, list_id)
         entries = [
             self.url_result('http://www.imdb.com' + m, 'Imdb')
index 07ef682ee38052088d07f3f232c245ded77b2193..d16d483eeb0d533debe041b7cd6c7b4826d41dde 100644 (file)
@@ -5,11 +5,11 @@ import re
 
 from .common import InfoExtractor
 from ..utils import (
-    get_element_by_id,
-    parse_iso8601,
     determine_ext,
-    int_or_none,
     float_or_none,
+    get_element_by_id,
+    int_or_none,
+    parse_iso8601,
     str_to_int,
 )
 
@@ -30,7 +30,7 @@ class IzleseneIE(InfoExtractor):
                 'description': 'md5:253753e2655dde93f59f74b572454f6d',
                 'thumbnail': 're:^http://.*\.jpg',
                 'uploader_id': 'pelikzzle',
-                'timestamp': 1404298698,
+                'timestamp': 1404302298,
                 'upload_date': '20140702',
                 'duration': 95.395,
                 'age_limit': 0,
@@ -46,7 +46,7 @@ class IzleseneIE(InfoExtractor):
                 'description': 'Tarkan Dortmund 2006 Konseri',
                 'thumbnail': 're:^http://.*\.jpg',
                 'uploader_id': 'parlayankiz',
-                'timestamp': 1163318593,
+                'timestamp': 1163322193,
                 'upload_date': '20061112',
                 'duration': 253.666,
                 'age_limit': 0,
@@ -55,10 +55,9 @@ class IzleseneIE(InfoExtractor):
     ]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-        url = 'http://www.izlesene.com/video/%s' % video_id
+        video_id = self._match_id(url)
 
+        url = 'http://www.izlesene.com/video/%s' % video_id
         webpage = self._download_webpage(url, video_id)
 
         title = self._og_search_title(webpage)
index 228b42d2b940d8eadd0fa3d5e61d0836fd19b7b7..f0e28e66ace8a5f59093eb2c9857968bcabf2dd7 100644 (file)
@@ -33,7 +33,7 @@ class MTVServicesInfoExtractor(InfoExtractor):
         m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp\..+?/.*)$', rtmp_video_url)
         if not m:
             return rtmp_video_url
-        base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
+        base = 'http://viacommtvstrmfs.fplive.net/'
         return base + m.group('finalid')
 
     def _get_feed_url(self, uri):
index 4fa0575f8a282aa6f8f561a7f18bc0129fceea8c..51e540814be209856a9a71f891e55eeb4ba559c2 100644 (file)
@@ -13,9 +13,10 @@ class MySpassIE(InfoExtractor):
     _VALID_URL = r'http://www\.myspass\.de/.*'
     _TEST = {
         'url': 'http://www.myspass.de/myspass/shows/tvshows/absolute-mehrheit/Absolute-Mehrheit-vom-17022013-Die-Highlights-Teil-2--/11741/',
-        'file': '11741.mp4',
         'md5': '0b49f4844a068f8b33f4b7c88405862b',
         'info_dict': {
+            'id': '11741',
+            'ext': 'mp4',
             "description": "Wer kann in die Fu\u00dfstapfen von Wolfgang Kubicki treten und die Mehrheit der Zuschauer hinter sich versammeln? Wird vielleicht sogar die Absolute Mehrheit geknackt und der Jackpot von 200.000 Euro mit nach Hause genommen?",
             "title": "Absolute Mehrheit vom 17.02.2013 - Die Highlights, Teil 2",
         },
index add4b3e5d3b1fcdd4580f8a6fddcd115556ae1cb..c31ff8160851f824042804a06d905f85d3217416 100644 (file)
@@ -67,7 +67,7 @@ class NDRIE(InfoExtractor):
 
         thumbnail = None
 
-        video_url = re.search(r'''3: \{src:'(?P<video>.+?)\.hi\.mp4', type:"video/mp4"},''', page)
+        video_url = re.search(r'''3: \{src:'(?P<video>.+?)\.(lo|hi|hq)\.mp4', type:"video/mp4"},''', page)
         if video_url:
             thumbnails = re.findall(r'''\d+: \{src: "([^"]+)"(?: \|\| '[^']+')?, quality: '([^']+)'}''', page)
             if thumbnails:
index 62d5707fee87302b5f69544f768b1a56a5e3976b..3b5784e8f5cb5ae8014c4f51d2d451d989b87860 100644 (file)
@@ -12,6 +12,7 @@ from ..utils import (
     unified_strdate,
     parse_duration,
     int_or_none,
+    ExtractorError,
 )
 
 
@@ -108,6 +109,9 @@ class NiconicoIE(InfoExtractor):
                 flv_info_request, video_id,
                 note='Downloading flv info', errnote='Unable to download flv info')
 
+        if 'deleted=' in flv_info_webpage:
+            raise ExtractorError('The video has been deleted.',
+                expected=True)
         video_real_url = compat_urlparse.parse_qs(flv_info_webpage)['url'][0]
 
         # Start extracting information
@@ -171,7 +175,8 @@ class NiconicoPlaylistIE(InfoExtractor):
         entries = [{
             '_type': 'url',
             'ie_key': NiconicoIE.ie_key(),
-            'url': 'http://www.nicovideo.jp/watch/%s' % entry['item_id'],
+            'url': ('http://www.nicovideo.jp/watch/%s' %
+                entry['item_data']['video_id']),
         } for entry in entries]
 
         return {
index 645a1e06dc8f3678b30d4e41b38ea13396c7a3e0..17880471d9d160f6d3315ca9c6eadeada8ce91a7 100644 (file)
@@ -6,6 +6,7 @@ import os.path
 
 from .common import InfoExtractor
 from ..utils import (
+    ExtractorError,
     compat_urllib_parse,
     compat_urllib_request,
 )
@@ -29,6 +30,12 @@ class PlayedIE(InfoExtractor):
         video_id = self._match_id(url)
 
         orig_webpage = self._download_webpage(url, video_id)
+
+        m_error = re.search(
+            r'(?s)Reason for deletion:.*?<b class="err"[^>]*>(?P<msg>[^<]+)</b>', orig_webpage)
+        if m_error:
+            raise ExtractorError(m_error.group('msg'), expected=True)
+
         fields = re.findall(
             r'type="hidden" name="([^"]+)"\s+value="([^"]+)">', orig_webpage)
         data = dict(fields)
index 172def221e1277298dc355a2cfdbea3ae4f9fdce..c1178f26de0b961ad68eb6d1ddb89550746f4dd7 100644 (file)
@@ -13,7 +13,7 @@ from ..utils import (
 
 class StreamcloudIE(InfoExtractor):
     IE_NAME = 'streamcloud.eu'
-    _VALID_URL = r'https?://streamcloud\.eu/(?P<id>[a-zA-Z0-9_-]+)/(?P<fname>[^#?]*)\.html'
+    _VALID_URL = r'https?://streamcloud\.eu/(?P<id>[a-zA-Z0-9_-]+)(?:/(?P<fname>[^#?]*)\.html)?'
 
     _TEST = {
         'url': 'http://streamcloud.eu/skp9j99s4bpz/youtube-dl_test_video_____________-BaW_jenozKc.mp4.html',
@@ -27,8 +27,8 @@ class StreamcloudIE(InfoExtractor):
     }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = self._match_id(url)
+        url = 'http://streamcloud.eu/%s' % video_id
 
         orig_webpage = self._download_webpage(url, video_id)
 
index 13c6ea67728d040a9e1f17111031952492d921b5..bf430d870125a78788d364b0ea615983ae170948 100644 (file)
@@ -80,7 +80,7 @@ class SWRMediathekIE(InfoExtractor):
 
             if media_type == 'Video':
                 fmt.update({
-                    'format_note': ['144p', '288p', '544p'][quality-1],
+                    'format_note': ['144p', '288p', '544p', '720p'][quality-1],
                     'vcodec': codec,
                 })
             elif media_type == 'Audio':
@@ -101,4 +101,4 @@ class SWRMediathekIE(InfoExtractor):
             'uploader': attr['channel_title'],
             'uploader_id': attr['channel_idkey'],
             'formats': formats,
-        }
\ No newline at end of file
+        }
index 77e0562425f0577869c01cb10e690fa9bef7d230..283e11350b212db0c857f1ccdb8982519a78cfbb 100644 (file)
@@ -50,6 +50,7 @@ class TapelyIE(InfoExtractor):
         request = compat_urllib_request.Request(playlist_url)
         request.add_header('X-Requested-With', 'XMLHttpRequest')
         request.add_header('Accept', 'application/json')
+        request.add_header('Referer', url)
 
         playlist = self._download_json(request, display_id)
 
index cd4af96fdb02b2f4ea392fe19482371b6a02acad..8550380779168a80b95e526f8921059e2eddf8f4 100644 (file)
@@ -38,6 +38,7 @@ class TEDIE(SubtitlesInfoExtractor):
                 'actively fooling us.'),
             'uploader': 'Dan Dennett',
             'width': 854,
+            'duration': 1308,
         }
     }, {
         'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
@@ -57,6 +58,7 @@ class TEDIE(SubtitlesInfoExtractor):
             'title': 'Be passionate. Be courageous. Be your best.',
             'uploader': 'Gabby Giffords and Mark Kelly',
             'description': 'md5:5174aed4d0f16021b704120360f72b92',
+            'duration': 1128,
         },
     }, {
         'url': 'http://www.ted.com/playlists/who_are_the_hackers',
@@ -178,6 +180,7 @@ class TEDIE(SubtitlesInfoExtractor):
             'description': self._og_search_description(webpage),
             'subtitles': video_subtitles,
             'formats': formats,
+            'duration': talk_info.get('duration'),
         }
 
     def _get_available_subtitles(self, video_id, talk_info):
index a73f3c43a2637e0cee677fc83d20e761e585af17..e7b79243a8fb9f091087f5c452c8192c49c81af2 100644 (file)
@@ -29,7 +29,7 @@ class TruTubeIE(InfoExtractor):
 
         # filehd is always 404
         video_url = xpath_text(config, './file', 'video URL', fatal=True)
-        title = xpath_text(config, './title', 'title')
+        title = xpath_text(config, './title', 'title').strip()
         thumbnail = xpath_text(config, './image', ' thumbnail')
 
         return {
index cee1ea8f6fd5ed50e14de086cb013628b015cb0e..875450908eb15856643dfbd7b085b107f5e1ca1e 100644 (file)
@@ -5,7 +5,6 @@ import re
 from .common import InfoExtractor
 from ..utils import (
     compat_urlparse,
-    get_meta_content,
 )
 
 
@@ -79,7 +78,7 @@ class UstreamChannelIE(InfoExtractor):
         m = re.match(self._VALID_URL, url)
         display_id = m.group('slug')
         webpage = self._download_webpage(url, display_id)
-        channel_id = get_meta_content('ustream:channel_id', webpage)
+        channel_id = self._html_search_meta('ustream:channel_id', webpage)
 
         BASE = 'http://www.ustream.tv'
         next_url = '/ajax/socialstream/videos/%s/1.json' % channel_id
diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py
new file mode 100644 (file)
index 0000000..f11ca82
--- /dev/null
@@ -0,0 +1,45 @@
+from __future__ import unicode_literals
+import re
+
+from .common import InfoExtractor
+from .ooyala import OoyalaIE
+from ..utils import ExtractorError
+
+
+class ViceIE(InfoExtractor):
+    """Extractor for vice.com pages that embed an Ooyala player."""
+    _VALID_URL = r'http://www\.vice\.com/.*?/(?P<name>.+)'
+
+    _TEST = {
+        'url': 'http://www.vice.com/Fringes/cowboy-capitalists-part-1',
+        'info_dict': {
+            'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp',
+            'ext': 'mp4',
+            'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov',
+        },
+        'params': {
+            # Requires ffmpeg (m3u8 manifest)
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        """Resolve a vice.com page to the Ooyala video embedded in it.
+
+        Raises ExtractorError (expected=True) when no Ooyala embed code
+        is found in the page.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        name = mobj.group('name')
+        webpage = self._download_webpage(url, name)
+        # Keep the try body minimal: only a failed embed code search means
+        # the page has no video.
+        try:
+            embed_code = self._search_regex(
+                r'embedCode=([^&\'"]+)', webpage,
+                'ooyala embed code')
+        except ExtractorError:
+            raise ExtractorError('The page doesn\'t contain a video', expected=True)
+        ooyala_url = OoyalaIE._url_for_embed_code(embed_code)
+        return self.url_result(ooyala_url, ie='Ooyala')
+
index 3377a543ee7a93d59cd139389bb1c4c35b687277..d6dec25ca9e7bb9de539e89c147e22b7381e3719 100644 (file)
@@ -37,7 +37,7 @@ class WimpIE(InfoExtractor):
         video_id = mobj.group(1)
         webpage = self._download_webpage(url, video_id)
         video_url = self._search_regex(
-            r"'file'\s*:\s*'([^']+)'", webpage, 'video URL')
+            r"[\"']file[\"']\s*[:,]\s*[\"'](.+?)[\"']", webpage, 'video URL')
         if YoutubeIE.suitable(video_url):
             self.to_screen('Found YouTube video')
             return {
index 34dd6d9528ee9d4746b798e10cfeecf19e3c8277..c427649211079715a5510eef3eaf35981bdb1034 100644 (file)
@@ -27,15 +27,15 @@ class WrzutaIE(InfoExtractor):
             'description': 'md5:7fb5ef3c21c5893375fda51d9b15d9cd',
         },
     }, {
-        'url': 'http://w729.wrzuta.pl/audio/9oXJqdcndqv/david_guetta_amp_showtek_ft._vassy_-_bad',
-        'md5': '1e546a18e1c22ac6e9adce17b8961ff5',
+        'url': 'http://jolka85.wrzuta.pl/audio/063jOPX5ue2/liber_natalia_szroeder_-_teraz_ty',
+        'md5': 'bc78077859bea7bcfe4295d7d7fc9025',
         'info_dict': {
-            'id': '9oXJqdcndqv',
+            'id': '063jOPX5ue2',
             'ext': 'ogg',
-            'title': 'David Guetta & Showtek ft. Vassy - Bad',
-            'duration': 270,
-            'uploader_id': 'w729',
-            'description': 'md5:4628f01c666bbaaecefa83476cfa794a',
+            'title': 'Liber & Natalia Szroeder - Teraz Ty',
+            'duration': 203,
+            'uploader_id': 'jolka85',
+            'description': 'md5:2d2b6340f9188c8c4cd891580e481096',
         },
     }]
 
@@ -49,16 +49,17 @@ class WrzutaIE(InfoExtractor):
 
         quality = qualities(['SD', 'MQ', 'HQ', 'HD'])
 
-        audio_table = {'flv': 'mp3', 'webm': 'ogg'}
+        audio_table = {'flv': 'mp3', 'webm': 'ogg', '???': 'mp3'}
 
         embedpage = self._download_json('http://www.wrzuta.pl/npp/embed/%s/%s' % (uploader, video_id), video_id)
 
         formats = []
         for media in embedpage['url']:
+            fmt = media['type'].split('@')[0]
             if typ == 'audio':
-                ext = audio_table[media['type'].split('@')[0]]
+                ext = audio_table.get(fmt, fmt)
             else:
-                ext = media['type'].split('@')[0]
+                ext = fmt
 
             formats.append({
                 'format_id': '%s_%s' % (ext, media['quality'].lower()),
index b86331e3cfa39ec8d3f287e829900b414892beee..c642075dcfabbfb025d64b92e392d614578f42b1 100644 (file)
@@ -9,40 +9,30 @@ from ..utils import (
 
 
 class YouJizzIE(InfoExtractor):
-    _VALID_URL = r'^https?://(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+)\.html$'
+    _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/[^/#?]+-(?P<id>[0-9]+)\.html(?:$|[?#])'
     _TEST = {
         'url': 'http://www.youjizz.com/videos/zeichentrick-1-2189178.html',
-        'file': '2189178.flv',
         'md5': '07e15fa469ba384c7693fd246905547c',
         'info_dict': {
+            'id': '2189178',
+            'ext': 'flv',
             "title": "Zeichentrick 1",
             "age_limit": 18,
         }
     }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-
-        video_id = mobj.group('videoid')
-
-        # Get webpage content
+        video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
-
         age_limit = self._rta_search(webpage)
-
-        # Get the video title
-        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
-            webpage, 'title').strip()
-
-        # Get the embed page
-        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
-        if result is None:
-            raise ExtractorError('ERROR: unable to extract embed page')
-
-        embed_page_url = result.group(0).strip()
-        video_id = result.group('videoid')
-
-        webpage = self._download_webpage(embed_page_url, video_id)
+        video_title = self._html_search_regex(
+            r'<title>\s*(.*)\s*</title>', webpage, 'title')
+
+        embed_page_url = self._search_regex(
+            r'(https?://www.youjizz.com/videos/embed/[0-9]+)',
+            webpage, 'embed page')
+        webpage = self._download_webpage(
+            embed_page_url, video_id, note='downloading embed page')
 
         # Get the video URL
         m_playlist = re.search(r'so.addVariable\("playlist", ?"(?P<playlist>.+?)"\);', webpage)
index aad8ffbf45a71accd0e696ab485b13e13888f003..5406de50b465e95ee2481aecc61d0e7747bf379e 100644 (file)
@@ -684,7 +684,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
         # Get video info
         self.report_video_info_webpage_download(video_id)
         if re.search(r'player-age-gate-content">', video_webpage) is not None:
-            self.report_age_confirmation()
             age_gate = True
             # We simulate the access to the video from www.youtube.com/v/{video_id}
             # this can be viewed without login into Youtube
@@ -692,12 +691,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                 'video_id': video_id,
                 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                 'sts': self._search_regex(
-                    r'"sts"\s*:\s*(\d+)', video_webpage, 'sts'),
+                    r'"sts"\s*:\s*(\d+)', video_webpage, 'sts', default=''),
             })
             video_info_url = proto + '://www.youtube.com/get_video_info?' + data
-            video_info_webpage = self._download_webpage(video_info_url, video_id,
-                                    note=False,
-                                    errnote='unable to download video info webpage')
+            video_info_webpage = self._download_webpage(
+                video_info_url, video_id,
+                note='Refetching age-gated info webpage',
+                errnote='unable to download video info webpage')
             video_info = compat_parse_qs(video_info_webpage)
         else:
             age_gate = False
@@ -1043,6 +1043,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
         'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
         'info_dict': {
             'title': 'ytdl test PL',
+            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
         },
         'playlist_count': 3,
     }, {
index 997e92ad77e53b44a03e1ae8268a9460a60255f4..c182abfdc8f211b7c7520547926ba09b40bf1071 100644 (file)
@@ -261,7 +261,16 @@ def parseOpts(overrideArguments=None):
     video_format.add_option(
         '-f', '--format',
         action='store', dest='format', metavar='FORMAT', default=None,
-        help='video format code, specify the order of preference using slashes: -f 22/17/18 .  -f mp4 , -f m4a and  -f flv  are also supported. You can also use the special names "best", "bestvideo", "bestaudio", "worst", "worstvideo" and "worstaudio". By default, youtube-dl will pick the best quality. Use commas to download multiple audio formats, such as  -f  136/137/mp4/bestvideo,140/m4a/bestaudio')
+        help='video format code, specify the order of preference using'
+            ' slashes: -f 22/17/18 .  -f mp4 , -f m4a and  -f flv  are also'
+            ' supported. You can also use the special names "best",'
+            ' "bestvideo", "bestaudio", "worst", "worstvideo" and'
+            ' "worstaudio". By default, youtube-dl will pick the best quality.'
+            ' Use commas to download multiple audio formats, such as'
+            ' -f  136/137/mp4/bestvideo,140/m4a/bestaudio.'
+            ' You can merge the video and audio of two formats into a single'
+            ' file using -f <video-format>+<audio-format> (requires ffmpeg or'
+            ' avconv), for example -f bestvideo+bestaudio.')
     video_format.add_option(
         '--all-formats',
         action='store_const', dest='format', const='all',
@@ -481,10 +490,12 @@ def parseOpts(overrideArguments=None):
               '%(format_id)s for the unique id of the format (like Youtube\'s itags: "137"), '
               '%(upload_date)s for the upload date (YYYYMMDD), '
               '%(extractor)s for the provider (youtube, metacafe, etc), '
-              '%(id)s for the video id, %(playlist)s for the playlist the video is in, '
-              '%(playlist_index)s for the position in the playlist and %% for a literal percent. '
+              '%(id)s for the video id, '
+              '%(playlist_title)s, %(playlist_id)s, or %(playlist)s (=title if present, ID otherwise) for the playlist the video is in, '
+              '%(playlist_index)s for the position in the playlist. '
               '%(height)s and %(width)s for the width and height of the video format. '
               '%(resolution)s for a textual description of the resolution of the video format. '
+              '%% for a literal percent. '
               'Use - to output to stdout. Can also be used to download to a different directory, '
               'for example with -o \'/my/downloads/%(uploader)s/%(title)s-%(id)s.%(ext)s\' .'))
     filesystem.add_option(
index 233286de840983142f4a89a5ea2cd3db6507fcae..d87e212ae67958cc89e2db7002072fc332477469 100644 (file)
@@ -34,7 +34,6 @@ from .compat import (
     compat_chr,
     compat_getenv,
     compat_html_entities,
-    compat_html_parser,
     compat_parse_qs,
     compat_str,
     compat_urllib_error,
@@ -152,127 +151,32 @@ def xpath_text(node, xpath, name=None, fatal=False):
     return n.text
 
 
-compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
-class BaseHTMLParser(compat_html_parser.HTMLParser):
-    def __init(self):
-        compat_html_parser.HTMLParser.__init__(self)
-        self.html = None
-
-    def loads(self, html):
-        self.html = html
-        self.feed(html)
-        self.close()
-
-class AttrParser(BaseHTMLParser):
-    """Modified HTMLParser that isolates a tag with the specified attribute"""
-    def __init__(self, attribute, value):
-        self.attribute = attribute
-        self.value = value
-        self.result = None
-        self.started = False
-        self.depth = {}
-        self.watch_startpos = False
-        self.error_count = 0
-        BaseHTMLParser.__init__(self)
-
-    def error(self, message):
-        if self.error_count > 10 or self.started:
-            raise compat_html_parser.HTMLParseError(message, self.getpos())
-        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
-        self.error_count += 1
-        self.goahead(1)
-
-    def handle_starttag(self, tag, attrs):
-        attrs = dict(attrs)
-        if self.started:
-            self.find_startpos(None)
-        if self.attribute in attrs and attrs[self.attribute] == self.value:
-            self.result = [tag]
-            self.started = True
-            self.watch_startpos = True
-        if self.started:
-            if not tag in self.depth: self.depth[tag] = 0
-            self.depth[tag] += 1
-
-    def handle_endtag(self, tag):
-        if self.started:
-            if tag in self.depth: self.depth[tag] -= 1
-            if self.depth[self.result[0]] == 0:
-                self.started = False
-                self.result.append(self.getpos())
-
-    def find_startpos(self, x):
-        """Needed to put the start position of the result (self.result[1])
-        after the opening tag with the requested id"""
-        if self.watch_startpos:
-            self.watch_startpos = False
-            self.result.append(self.getpos())
-    handle_entityref = handle_charref = handle_data = handle_comment = \
-    handle_decl = handle_pi = unknown_decl = find_startpos
-
-    def get_result(self):
-        if self.result is None:
-            return None
-        if len(self.result) != 3:
-            return None
-        lines = self.html.split('\n')
-        lines = lines[self.result[1][0]-1:self.result[2][0]]
-        lines[0] = lines[0][self.result[1][1]:]
-        if len(lines) == 1:
-            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
-        lines[-1] = lines[-1][:self.result[2][1]]
-        return '\n'.join(lines).strip()
-# Hack for https://github.com/rg3/youtube-dl/issues/662
-if sys.version_info < (2, 7, 3):
-    AttrParser.parse_endtag = (lambda self, i:
-        i + len("</scr'+'ipt>")
-        if self.rawdata[i:].startswith("</scr'+'ipt>")
-        else compat_html_parser.HTMLParser.parse_endtag(self, i))
-
 def get_element_by_id(id, html):
     """Return the content of the tag with the specified ID in the passed HTML document"""
     return get_element_by_attribute("id", id, html)
 
+
 def get_element_by_attribute(attribute, value, html):
     """Return the content of the tag with the specified attribute in the passed HTML document"""
-    parser = AttrParser(attribute, value)
-    try:
-        parser.loads(html)
-    except compat_html_parser.HTMLParseError:
-        pass
-    return parser.get_result()
 
-class MetaParser(BaseHTMLParser):
-    """
-    Modified HTMLParser that isolates a meta tag with the specified name 
-    attribute.
-    """
-    def __init__(self, name):
-        BaseHTMLParser.__init__(self)
-        self.name = name
-        self.content = None
-        self.result = None
-
-    def handle_starttag(self, tag, attrs):
-        if tag != 'meta':
-            return
-        attrs = dict(attrs)
-        if attrs.get('name') == self.name:
-            self.result = attrs.get('content')
+    m = re.search(r'''(?xs)
+        <([a-zA-Z0-9:._-]+)
+         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
+         \s+%s=['"]?%s['"]?
+         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
+        \s*>
+        (?P<content>.*?)
+        </\1>
+    ''' % (re.escape(attribute), re.escape(value)), html)
 
-    def get_result(self):
-        return self.result
+    if not m:
+        return None
+    res = m.group('content')
 
-def get_meta_content(name, html):
-    """
-    Return the content attribute from the meta tag with the given name attribute.
-    """
-    parser = MetaParser(name)
-    try:
-        parser.loads(html)
-    except compat_html_parser.HTMLParseError:
-        pass
-    return parser.get_result()
+    if res.startswith('"') or res.startswith("'"):
+        res = res[1:-1]
+
+    return unescapeHTML(res)
 
 
 def clean_html(html):
index 327560c76bb1bc37584524a51240a97dc7b51428..473916f7ada46fc6c9e66daa718ff354cd362634 100644 (file)
@@ -1,2 +1,2 @@
 
-__version__ = '2014.11.02.1'
+__version__ = '2014.11.12.1'