Merge remote-tracking branch 'rzhxeo/embedly'
authorPhilipp Hagemeister <phihag@phihag.de>
Wed, 5 Mar 2014 13:01:53 +0000 (14:01 +0100)
committerPhilipp Hagemeister <phihag@phihag.de>
Wed, 5 Mar 2014 13:01:53 +0000 (14:01 +0100)
Conflicts:
youtube_dl/extractor/generic.py

61 files changed:
README.md
devscripts/release.sh
test/test_all_urls.py
test/test_download.py
test/test_playlists.py
test/test_utils.py
test/test_youtube_lists.py
youtube_dl/YoutubeDL.py
youtube_dl/__init__.py
youtube_dl/downloader/f4m.py
youtube_dl/downloader/http.py
youtube_dl/extractor/__init__.py
youtube_dl/extractor/academicearth.py
youtube_dl/extractor/br.py [new file with mode: 0644]
youtube_dl/extractor/breakcom.py
youtube_dl/extractor/canal13cl.py [new file with mode: 0644]
youtube_dl/extractor/ceskatelevize.py [new file with mode: 0644]
youtube_dl/extractor/cinemassacre.py
youtube_dl/extractor/collegehumor.py
youtube_dl/extractor/common.py
youtube_dl/extractor/crunchyroll.py
youtube_dl/extractor/dailymotion.py
youtube_dl/extractor/facebook.py
youtube_dl/extractor/fourtube.py
youtube_dl/extractor/gdcvault.py [new file with mode: 0644]
youtube_dl/extractor/generic.py
youtube_dl/extractor/iprima.py
youtube_dl/extractor/lifenews.py
youtube_dl/extractor/mailru.py [new file with mode: 0644]
youtube_dl/extractor/metacafe.py
youtube_dl/extractor/mit.py
youtube_dl/extractor/mixcloud.py
youtube_dl/extractor/nbc.py
youtube_dl/extractor/normalboots.py
youtube_dl/extractor/novamov.py
youtube_dl/extractor/nowvideo.py
youtube_dl/extractor/orf.py
youtube_dl/extractor/podomatic.py
youtube_dl/extractor/prosiebensat1.py [new file with mode: 0644]
youtube_dl/extractor/rtlnow.py
youtube_dl/extractor/soundcloud.py
youtube_dl/extractor/ted.py
youtube_dl/extractor/testurl.py
youtube_dl/extractor/theplatform.py
youtube_dl/extractor/tinypic.py
youtube_dl/extractor/trutube.py [new file with mode: 0644]
youtube_dl/extractor/tvigle.py [new file with mode: 0644]
youtube_dl/extractor/veoh.py
youtube_dl/extractor/vesti.py
youtube_dl/extractor/vevo.py
youtube_dl/extractor/videobam.py [new file with mode: 0644]
youtube_dl/extractor/vimeo.py
youtube_dl/extractor/vine.py
youtube_dl/extractor/vk.py
youtube_dl/extractor/worldstarhiphop.py
youtube_dl/extractor/xhamster.py
youtube_dl/extractor/xtube.py
youtube_dl/extractor/youtube.py
youtube_dl/extractor/zdf.py
youtube_dl/utils.py
youtube_dl/version.py

index 49b62f13f84f32537d7a7d0323c5282866b02994..ccd94b2dcd65e70dbb7e262650081b7f7f6e156f 100644 (file)
--- a/README.md
+++ b/README.md
@@ -124,8 +124,12 @@ which means you can modify it, redistribute it or use it however you like.
                                      video id, %(playlist)s for the playlist the
                                      video is in, %(playlist_index)s for the
                                      position in the playlist and %% for a
-                                     literal percent. Use - to output to stdout.
-                                     Can also be used to download to a different
+                                     literal percent. %(height)s and %(width)s
+                                     for the width and height of the video
+                                     format. %(resolution)s for a textual
+                                     description of the resolution of the video
+                                     format. Use - to output to stdout. Can also
+                                     be used to download to a different
                                      directory, for example with -o '/my/downloa
                                      ds/%(uploader)s/%(title)s-%(id)s.%(ext)s' .
     --autonumber-size NUMBER         Specifies the number of digits in
index 323acf8cfa92cc7662c21fac44795790867901f4..72e708c7f79c24f06797c4de10e6334ecf6bb9ec 100755 (executable)
@@ -14,9 +14,9 @@
 
 set -e
 
-skip_tests=false
-if [ "$1" = '--skip-test' ]; then
-    skip_tests=true
+skip_tests=true
+if [ "$1" = '--run-tests' ]; then
+    skip_tests=false
     shift
 fi
 
index 7a78005a3a32f1e25609ac95e6004c39706e72a2..5c17a39fb556da63020dcfab17d29b9862d90488 100644 (file)
@@ -71,6 +71,10 @@ class TestAllURLsMatching(unittest.TestCase):
     def test_youtube_truncated(self):
         self.assertMatch('http://www.youtube.com/watch?', ['youtube:truncated_url'])
 
+    def test_youtube_search_matching(self):
+        self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url'])
+        self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url'])
+
     def test_justin_tv_channelid_matching(self):
         self.assertTrue(JustinTVIE.suitable(u"justin.tv/vanillatv"))
         self.assertTrue(JustinTVIE.suitable(u"twitch.tv/vanillatv"))
index 7587a18aa18fee1cdf61a735246c62dbf91e691d..bbbb6b78ab326a292b193891cee926f88d1ab3ad 100644 (file)
@@ -18,6 +18,7 @@ from test.helper import (
 import hashlib
 import io
 import json
+import re
 import socket
 
 import youtube_dl.YoutubeDL
@@ -72,9 +73,7 @@ def generator(test_case):
         if 'playlist' not in test_case:
             info_dict = test_case.get('info_dict', {})
             if not test_case.get('file') and not (info_dict.get('id') and info_dict.get('ext')):
-                print_skipping('The output file cannot be know, the "file" '
-                    'key is missing or the info_dict is incomplete')
-                return
+                raise Exception('Test definition incorrect. The output file cannot be known. Are both \'id\' and \'ext\' keys present?')
         if 'skip' in test_case:
             print_skipping(test_case['skip'])
             return
@@ -137,12 +136,21 @@ def generator(test_case):
                 with io.open(info_json_fn, encoding='utf-8') as infof:
                     info_dict = json.load(infof)
                 for (info_field, expected) in tc.get('info_dict', {}).items():
-                    if isinstance(expected, compat_str) and expected.startswith('md5:'):
-                        got = 'md5:' + md5(info_dict.get(info_field))
-                    else:
+                    if isinstance(expected, compat_str) and expected.startswith('re:'):
                         got = info_dict.get(info_field)
-                    self.assertEqual(expected, got,
-                        u'invalid value for field %s, expected %r, got %r' % (info_field, expected, got))
+                        match_str = expected[len('re:'):]
+                        match_rex = re.compile(match_str)
+
+                        self.assertTrue(
+                            isinstance(got, compat_str) and match_rex.match(got),
+                            u'field %s (value: %r) should match %r' % (info_field, got, match_str))
+                    else:
+                        if isinstance(expected, compat_str) and expected.startswith('md5:'):
+                            got = 'md5:' + md5(info_dict.get(info_field))
+                        else:
+                            got = info_dict.get(info_field)
+                        self.assertEqual(expected, got,
+                            u'invalid value for field %s, expected %r, got %r' % (info_field, expected, got))
 
                 # If checkable fields are missing from the test case, print the info_dict
                 test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value))
index 25bec9f1c69c4ef3093dedef7c54d145f9e866ab..4bd815a0e9694cb0df821cc0e12efed8bd3fc715 100644 (file)
@@ -36,6 +36,7 @@ from youtube_dl.extractor import (
     RutubeChannelIE,
     GoogleSearchIE,
     GenericIE,
+    TEDIE,
 )
 
 
@@ -170,12 +171,12 @@ class TestPlaylists(unittest.TestCase):
     def test_AcademicEarthCourse(self):
         dl = FakeYDL()
         ie = AcademicEarthCourseIE(dl)
-        result = ie.extract('http://academicearth.org/courses/building-dynamic-websites/')
+        result = ie.extract('http://academicearth.org/playlists/laws-of-nature/')
         self.assertIsPlaylist(result)
-        self.assertEqual(result['id'], 'building-dynamic-websites')
-        self.assertEqual(result['title'], 'Building Dynamic Websites')
-        self.assertEqual(result['description'], u"Today's websites are increasingly dynamic. Pages are no longer static HTML files but instead generated by scripts and database calls. User interfaces are more seamless, with technologies like Ajax replacing traditional page reloads. This course teaches students how to build dynamic websites with Ajax and with Linux, Apache, MySQL, and PHP (LAMP), one of today's most popular frameworks. Students learn how to set up domain names with DNS, how to structure pages with XHTML and CSS, how to program in JavaScript and PHP, how to configure Apache and MySQL, how to design and query databases with SQL, how to use Ajax with both XML and JSON, and how to build mashups. The course explores issues of security, scalability, and cross-browser support and also discusses enterprise-level deployments of websites, including third-party hosting, virtualization, colocation in data centers, firewalling, and load-balancing.")
-        self.assertEqual(len(result['entries']), 10)
+        self.assertEqual(result['id'], 'laws-of-nature')
+        self.assertEqual(result['title'], 'Laws of Nature')
+        self.assertEqual(result['description'],u'Introduce yourself to the laws of nature with these free online college lectures from Yale, Harvard, and MIT.')# u"Today's websites are increasingly dynamic. Pages are no longer static HTML files but instead generated by scripts and database calls. User interfaces are more seamless, with technologies like Ajax replacing traditional page reloads. This course teaches students how to build dynamic websites with Ajax and with Linux, Apache, MySQL, and PHP (LAMP), one of today's most popular frameworks. Students learn how to set up domain names with DNS, how to structure pages with XHTML and CSS, how to program in JavaScript and PHP, how to configure Apache and MySQL, how to design and query databases with SQL, how to use Ajax with both XML and JSON, and how to build mashups. The course explores issues of security, scalability, and cross-browser support and also discusses enterprise-level deployments of websites, including third-party hosting, virtualization, colocation in data centers, firewalling, and load-balancing.")
+        self.assertEqual(len(result['entries']), 4)
         
     def test_ivi_compilation(self):
         dl = FakeYDL()
@@ -259,5 +260,14 @@ class TestPlaylists(unittest.TestCase):
         self.assertEqual(result['title'], 'Zero Punctuation')
         self.assertTrue(len(result['entries']) > 10)
 
+    def test_ted_playlist(self):
+        dl = FakeYDL()
+        ie = TEDIE(dl)
+        result = ie.extract('http://www.ted.com/playlists/who_are_the_hackers')
+        self.assertIsPlaylist(result)
+        self.assertEqual(result['id'], '10')
+        self.assertEqual(result['title'], 'Who are the hackers?')
+        self.assertTrue(len(result['entries']) >= 6)
+
 if __name__ == '__main__':
     unittest.main()
index 84553b94386236352e9ad856782c4b3428f3587b..4e3c37fb4c7e3a762ad2f049db6fdf1297145824 100644 (file)
@@ -9,6 +9,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 
 # Various small unit tests
+import io
 import xml.etree.ElementTree
 
 #from youtube_dl.utils import htmlentity_transform
@@ -21,6 +22,7 @@ from youtube_dl.utils import (
     orderedSet,
     PagedList,
     parse_duration,
+    read_batch_urls,
     sanitize_filename,
     shell_quote,
     smuggle_url,
@@ -250,5 +252,14 @@ class TestUtil(unittest.TestCase):
     def test_struct_unpack(self):
         self.assertEqual(struct_unpack(u'!B', b'\x00'), (0,))
 
+    def test_read_batch_urls(self):
+        f = io.StringIO(u'''\xef\xbb\xbf foo
+            bar\r
+            baz
+            # More after this line\r
+            ; or after this
+            bam''')
+        self.assertEqual(read_batch_urls(f), [u'foo', u'bar', u'baz', u'bam'])
+
 if __name__ == '__main__':
     unittest.main()
index 38ac989ce706a347e575f62a9dd6b60b8fece8c9..7d3b9c7054726bb773e3ac390222706c0bcec2af 100644 (file)
@@ -16,6 +16,7 @@ from youtube_dl.extractor import (
     YoutubeChannelIE,
     YoutubeShowIE,
     YoutubeTopListIE,
+    YoutubeSearchURLIE,
 )
 
 
@@ -118,6 +119,8 @@ class TestYoutubeLists(unittest.TestCase):
         self.assertEqual(original_video['id'], 'rjFaenf1T-Y')
 
     def test_youtube_toptracks(self):
+        print('Skipping: The playlist page gives error 500')
+        return
         dl = FakeYDL()
         ie = YoutubePlaylistIE(dl)
         result = ie.extract('https://www.youtube.com/playlist?list=MCUS')
@@ -131,5 +134,14 @@ class TestYoutubeLists(unittest.TestCase):
         entries = result['entries']
         self.assertTrue(len(entries) >= 5)
 
+    def test_youtube_search_url(self):
+        dl = FakeYDL()
+        ie = YoutubeSearchURLIE(dl)
+        result = ie.extract('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video')
+        entries = result['entries']
+        self.assertIsPlaylist(result)
+        self.assertEqual(result['title'], 'youtube-dl test video')
+        self.assertTrue(len(entries) >= 5)
+
 if __name__ == '__main__':
     unittest.main()
index 42cbcf6994567e09b3f35eec3f3f0a71494eb4f5..62ccad20c55b891e76580a4ed9aed0ef34078d1e 100644 (file)
@@ -409,6 +409,13 @@ class YoutubeDL(object):
             template_dict['autonumber'] = autonumber_templ % self._num_downloads
             if template_dict.get('playlist_index') is not None:
                 template_dict['playlist_index'] = '%05d' % template_dict['playlist_index']
+            if template_dict.get('resolution') is None:
+                if template_dict.get('width') and template_dict.get('height'):
+                    template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
+                elif template_dict.get('height'):
+                    res = '%sp' % template_dict['height']
+                elif template_dict.get('width'):
+                    res = '?x%d' % template_dict['width']
 
             sanitize = lambda k, v: sanitize_filename(
                 compat_str(v),
@@ -675,6 +682,9 @@ class YoutubeDL(object):
             info_dict['playlist'] = None
             info_dict['playlist_index'] = None
 
+        if 'display_id' not in info_dict and 'id' in info_dict:
+            info_dict['display_id'] = info_dict['id']
+
         # This extractors handle format selection themselves
         if info_dict['extractor'] in ['Youku']:
             if download:
index 70608066c83b836a14825622d0eef6b0042e5395..d39eb830f4a9ff5c5ec3b559a74ce2eb6b5c9806 100644 (file)
@@ -46,12 +46,17 @@ __authors__  = (
     'Andreas Schmitz',
     'Michael Kaiser',
     'Niklas Laxström',
+    'David Triendl',
+    'Anthony Weems',
+    'David Wagner',
+    'Juan C. Olivares',
 )
 
 __license__ = 'Public Domain'
 
 import codecs
 import getpass
+import io
 import locale
 import optparse
 import os
@@ -70,6 +75,7 @@ from .utils import (
     get_cachedir,
     MaxDownloadsReached,
     preferredencoding,
+    read_batch_urls,
     SameFileError,
     setproctitle,
     std_headers,
@@ -424,6 +430,8 @@ def parseOpts(overrideArguments=None):
                   '%(extractor)s for the provider (youtube, metacafe, etc), '
                   '%(id)s for the video id, %(playlist)s for the playlist the video is in, '
                   '%(playlist_index)s for the position in the playlist and %% for a literal percent. '
+                  '%(height)s and %(width)s for the width and height of the video format. '
+                  '%(resolution)s for a textual description of the resolution of the video format. '
                   'Use - to output to stdout. Can also be used to download to a different directory, '
                   'for example with -o \'/my/downloads/%(uploader)s/%(title)s-%(id)s.%(ext)s\' .'))
     filesystem.add_option('--autonumber-size',
@@ -551,21 +559,19 @@ def _real_main(argv=None):
         sys.exit(0)
 
     # Batch file verification
-    batchurls = []
+    batch_urls = []
     if opts.batchfile is not None:
         try:
             if opts.batchfile == '-':
                 batchfd = sys.stdin
             else:
-                batchfd = open(opts.batchfile, 'r')
-            batchurls = batchfd.readlines()
-            batchurls = [x.strip() for x in batchurls]
-            batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
+                batchfd = io.open(opts.batchfile, 'r', encoding='utf-8', errors='ignore')
+            batch_urls = read_batch_urls(batchfd)
             if opts.verbose:
-                write_string(u'[debug] Batch file urls: ' + repr(batchurls) + u'\n')
+                write_string(u'[debug] Batch file urls: ' + repr(batch_urls) + u'\n')
         except IOError:
             sys.exit(u'ERROR: batch file could not be read')
-    all_urls = batchurls + args
+    all_urls = batch_urls + args
     all_urls = [url.strip() for url in all_urls]
     _enc = preferredencoding()
     all_urls = [url.decode(_enc, 'ignore') if isinstance(url, bytes) else url for url in all_urls]
index 2a870a758fa32dd475eaf725f69dedbf3c45903b..4e6abfe108f8e77da090ac420873229e45b3d0f9 100644 (file)
@@ -12,7 +12,6 @@ from .http import HttpFD
 from ..utils import (
     struct_pack,
     struct_unpack,
-    compat_urllib_request,
     compat_urlparse,
     format_bytes,
     encodeFilename,
@@ -117,8 +116,8 @@ class FlvReader(io.BytesIO):
         self.read_unsigned_char()
         # flags
         self.read(3)
-        # BootstrapinfoVersion
-        bootstrap_info_version = self.read_unsigned_int()
+
+        self.read_unsigned_int()  # BootstrapinfoVersion
         # Profile,Live,Update,Reserved
         self.read(1)
         # time scale
@@ -127,15 +126,15 @@ class FlvReader(io.BytesIO):
         self.read_unsigned_long_long()
         # SmpteTimeCodeOffset
         self.read_unsigned_long_long()
-        # MovieIdentifier
-        movie_identifier = self.read_string()
+
+        self.read_string()  # MovieIdentifier
         server_count = self.read_unsigned_char()
         # ServerEntryTable
         for i in range(server_count):
             self.read_string()
         quality_count = self.read_unsigned_char()
         # QualityEntryTable
-        for i in range(server_count):
+        for i in range(quality_count):
             self.read_string()
         # DrmData
         self.read_string()
index 748f9f3adfe0555024cce272b8eb872c55558d6c..240ecb606e5ee4b76599f2e380b6ea30d767c5da 100644 (file)
@@ -85,6 +85,7 @@ class HttpFD(FileDownloader):
                         else:
                             # The length does not match, we start the download over
                             self.report_unable_to_resume()
+                            resume_len = 0
                             open_mode = 'wb'
                             break
             # Retry
index 229bdc595f81f2a193cbf2661333491201684b97..f35ee49416653ec50c95919ece259ff7eee90abb 100644 (file)
@@ -19,12 +19,15 @@ from .bbccouk import BBCCoUkIE
 from .blinkx import BlinkxIE
 from .bliptv import BlipTVIE, BlipTVUserIE
 from .bloomberg import BloombergIE
+from .br import BRIE
 from .breakcom import BreakIE
 from .brightcove import BrightcoveIE
 from .c56 import C56IE
+from .canal13cl import Canal13clIE
 from .canalplus import CanalplusIE
 from .canalc2 import Canalc2IE
 from .cbs import CBSIE
+from .ceskatelevize import CeskaTelevizeIE
 from .channel9 import Channel9IE
 from .chilloutzone import ChilloutzoneIE
 from .cinemassacre import CinemassacreIE
@@ -88,6 +91,7 @@ from .funnyordie import FunnyOrDieIE
 from .gamekings import GamekingsIE
 from .gamespot import GameSpotIE
 from .gametrailers import GametrailersIE
+from .gdcvault import GDCVaultIE
 from .generic import GenericIE
 from .googleplus import GooglePlusIE
 from .googlesearch import GoogleSearchIE
@@ -132,11 +136,12 @@ from .lynda import (
 )
 from .m6 import M6IE
 from .macgamestore import MacGameStoreIE
+from .mailru import MailRuIE
 from .malemotion import MalemotionIE
 from .mdr import MDRIE
 from .metacafe import MetacafeIE
 from .metacritic import MetacriticIE
-from .mit import TechTVMITIE, MITIE
+from .mit import TechTVMITIE, MITIE, OCWMITIE
 from .mixcloud import MixcloudIE
 from .mpora import MporaIE
 from .mofosex import MofosexIE
@@ -151,7 +156,10 @@ from .myspass import MySpassIE
 from .myvideo import MyVideoIE
 from .naver import NaverIE
 from .nba import NBAIE
-from .nbc import NBCNewsIE
+from .nbc import (
+    NBCIE,
+    NBCNewsIE,
+)
 from .ndr import NDRIE
 from .ndtv import NDTVIE
 from .newgrounds import NewgroundsIE
@@ -160,7 +168,7 @@ from .nhl import NHLIE, NHLVideocenterIE
 from .niconico import NiconicoIE
 from .ninegag import NineGagIE
 from .normalboots import NormalbootsIE
-from .novamov import NovamovIE
+from .novamov import NovaMovIE
 from .nowness import NownessIE
 from .nowvideo import NowVideoIE
 from .ooyala import OoyalaIE
@@ -171,6 +179,7 @@ from .podomatic import PodomaticIE
 from .pornhd import PornHdIE
 from .pornhub import PornHubIE
 from .pornotube import PornotubeIE
+from .prosiebensat1 import ProSiebenSat1IE
 from .pyvideo import PyvideoIE
 from .radiofrance import RadioFranceIE
 from .rbmaradio import RBMARadioIE
@@ -225,10 +234,12 @@ from .tinypic import TinyPicIE
 from .toutv import TouTvIE
 from .traileraddict import TrailerAddictIE
 from .trilulilu import TriluliluIE
+from .trutube import TruTubeIE
 from .tube8 import Tube8IE
 from .tudou import TudouIE
 from .tumblr import TumblrIE
 from .tutv import TutvIE
+from .tvigle import TvigleIE
 from .tvp import TvpIE
 from .unistra import UnistraIE
 from .ustream import UstreamIE, UstreamChannelIE
@@ -239,6 +250,7 @@ from .vesti import VestiIE
 from .vevo import VevoIE
 from .vice import ViceIE
 from .viddler import ViddlerIE
+from .videobam import VideoBamIE
 from .videodetective import VideoDetectiveIE
 from .videofyme import VideofyMeIE
 from .videopremium import VideoPremiumIE
@@ -273,19 +285,20 @@ from .youku import YoukuIE
 from .youporn import YouPornIE
 from .youtube import (
     YoutubeIE,
+    YoutubeChannelIE,
+    YoutubeFavouritesIE,
+    YoutubeHistoryIE,
     YoutubePlaylistIE,
-    YoutubeSearchIE,
+    YoutubeRecommendedIE,
     YoutubeSearchDateIE,
-    YoutubeUserIE,
-    YoutubeChannelIE,
+    YoutubeSearchIE,
+    YoutubeSearchURLIE,
     YoutubeShowIE,
     YoutubeSubscriptionsIE,
-    YoutubeRecommendedIE,
+    YoutubeTopListIE,
     YoutubeTruncatedURLIE,
+    YoutubeUserIE,
     YoutubeWatchLaterIE,
-    YoutubeFavouritesIE,
-    YoutubeHistoryIE,
-    YoutubeTopListIE,
 )
 from .zdf import ZDFIE
 
index 72f81d01a4976767033ea236eff4d8a5e2e43d33..59d3bbba413c3c256a3f77917708fb171e337b14 100644 (file)
@@ -5,7 +5,7 @@ from .common import InfoExtractor
 
 
 class AcademicEarthCourseIE(InfoExtractor):
-    _VALID_URL = r'^https?://(?:www\.)?academicearth\.org/(?:courses|playlists)/(?P<id>[^?#/]+)'
+    _VALID_URL = r'^https?://(?:www\.)?academicearth\.org/playlists/(?P<id>[^?#/]+)'
     IE_NAME = 'AcademicEarth:Course'
 
     def _real_extract(self, url):
@@ -14,12 +14,12 @@ class AcademicEarthCourseIE(InfoExtractor):
 
         webpage = self._download_webpage(url, playlist_id)
         title = self._html_search_regex(
-            r'<h1 class="playlist-name">(.*?)</h1>', webpage, u'title')
+            r'<h1 class="playlist-name"[^>]*?>(.*?)</h1>', webpage, u'title')
         description = self._html_search_regex(
-            r'<p class="excerpt">(.*?)</p>',
+            r'<p class="excerpt"[^>]*?>(.*?)</p>',
             webpage, u'description', fatal=False)
         urls = re.findall(
-            r'<h3 class="lecture-title"><a target="_blank" href="([^"]+)">',
+            r'<li class="lecture-preview">\s*?<a target="_blank" href="([^"]+)">',
             webpage)
         entries = [self.url_result(u) for u in urls]
 
diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py
new file mode 100644 (file)
index 0000000..5fcc108
--- /dev/null
@@ -0,0 +1,80 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class BRIE(InfoExtractor):
+    IE_DESC = "Bayerischer Rundfunk Mediathek"
+    _VALID_URL = r"^https?://(?:www\.)?br\.de/mediathek/video/(?:sendungen/)?(?P<id>[a-z0-9\-]+)\.html$"
+    _BASE_URL = "http://www.br.de"
+
+    _TEST = {
+        "url": "http://www.br.de/mediathek/video/anselm-gruen-114.html",
+        "md5": "c4f83cf0f023ba5875aba0bf46860df2",
+        "info_dict": {
+            "id": "2c8d81c5-6fb7-4a74-88d4-e768e5856532",
+            "ext": "mp4",
+            "title": "Feiern und Verzichten",
+            "description": "Anselm Grün: Feiern und Verzichten",
+            "uploader": "BR/Birgit Baier",
+            "upload_date": "20140301"
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('id')
+        page = self._download_webpage(url, display_id)
+        xml_url = self._search_regex(
+            r"return BRavFramework\.register\(BRavFramework\('avPlayer_(?:[a-f0-9-]{36})'\)\.setup\({dataURL:'(/mediathek/video/[a-z0-9/~_.-]+)'}\)\);", page, "XMLURL")
+        xml = self._download_xml(self._BASE_URL + xml_url, None)
+
+        videos = [{
+            "id": xml_video.get("externalId"),
+            "title": xml_video.find("title").text,
+            "formats": self._extract_formats(xml_video.find("assets")),
+            "thumbnails": self._extract_thumbnails(xml_video.find("teaserImage/variants")),
+            "description": " ".join(xml_video.find("shareTitle").text.splitlines()),
+            "uploader": xml_video.find("author").text,
+            "upload_date": "".join(reversed(xml_video.find("broadcastDate").text.split("."))),
+            "webpage_url": xml_video.find("permalink").text,
+        } for xml_video in xml.findall("video")]
+
+        if len(videos) > 1:
+            self._downloader.report_warning(
+                'found multiple videos; please '
+                'report this with the video URL to http://yt-dl.org/bug')
+        if not videos:
+            raise ExtractorError('No video entries found')
+        return videos[0]
+
+    def _extract_formats(self, assets):
+        formats = [{
+            "url": asset.find("downloadUrl").text,
+            "ext": asset.find("mediaType").text,
+            "format_id": asset.get("type"),
+            "width": int(asset.find("frameWidth").text),
+            "height": int(asset.find("frameHeight").text),
+            "tbr": int(asset.find("bitrateVideo").text),
+            "abr": int(asset.find("bitrateAudio").text),
+            "vcodec": asset.find("codecVideo").text,
+            "container": asset.find("mediaType").text,
+            "filesize": int(asset.find("size").text),
+        } for asset in assets.findall("asset")
+            if asset.find("downloadUrl") is not None]
+
+        self._sort_formats(formats)
+        return formats
+
+    def _extract_thumbnails(self, variants):
+        thumbnails = [{
+            "url": self._BASE_URL + variant.find("url").text,
+            "width": int(variant.find("width").text),
+            "height": int(variant.find("height").text),
+        } for variant in variants.findall("variant")]
+        thumbnails.sort(key=lambda x: x["width"] * x["height"], reverse=True)
+        return thumbnails
index 8ec6dda490c9c886463502c795bce4b361b9c326..85635d1cc62fb7b234741c1bf67390ceaf2127d7 100644 (file)
@@ -23,8 +23,8 @@ class BreakIE(InfoExtractor):
         video_id = mobj.group(1).split("-")[-1]
         embed_url = 'http://www.break.com/embed/%s' % video_id
         webpage = self._download_webpage(embed_url, video_id)
-        info_json = self._search_regex(r'var embedVars = ({.*?});', webpage,
-                                       'info json', flags=re.DOTALL)
+        info_json = self._search_regex(r'var embedVars = ({.*})\s*?</script>',
+            webpage, 'info json', flags=re.DOTALL)
         info = json.loads(info_json)
         video_url = info['videoUri']
         m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', video_url)
diff --git a/youtube_dl/extractor/canal13cl.py b/youtube_dl/extractor/canal13cl.py
new file mode 100644 (file)
index 0000000..93241fe
--- /dev/null
@@ -0,0 +1,48 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class Canal13clIE(InfoExtractor):
+    _VALID_URL = r'^http://(?:www\.)?13\.cl/(?:[^/?#]+/)*(?P<id>[^/?#]+)'
+    _TEST = {
+        'url': 'http://www.13.cl/t13/nacional/el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda',
+        'md5': '4cb1fa38adcad8fea88487a078831755',
+        'info_dict': {
+            'id': '1403022125',
+            'display_id': 'el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda',
+            'ext': 'mp4',
+            'title': 'El "círculo de hierro" de Michelle Bachelet en su regreso a La Moneda',
+            'description': '(Foto: Agencia Uno) En nueve días más, Michelle Bachelet va a asumir por segunda vez como presidenta de la República. Entre aquellos que la acompañarán hay caras que se repiten y otras que se consolidan en su entorno de colaboradores más cercanos.',
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, display_id)
+
+        title = self._html_search_meta(
+            'twitter:title', webpage, 'title', fatal=True)
+        description = self._html_search_meta(
+            'twitter:description', webpage, 'description')
+        url = self._html_search_regex(
+            r'articuloVideo = \"(.*?)\"', webpage, 'url')
+        real_id = self._search_regex(
+            r'[^0-9]([0-9]{7,})[^0-9]', url, 'id', default=display_id)
+        thumbnail = self._html_search_regex(
+            r'articuloImagen = \"(.*?)\"', webpage, 'thumbnail')
+
+        return {
+            'id': real_id,
+            'display_id': display_id,
+            'url': url,
+            'title': title,
+            'description': description,
+            'ext': 'mp4',
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py
new file mode 100644 (file)
index 0000000..90a3ddd
--- /dev/null
@@ -0,0 +1,126 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_request,
+    compat_urllib_parse,
+    compat_urllib_parse_urlparse,
+    ExtractorError,
+)
+
+
class CeskaTelevizeIE(InfoExtractor):
    """Extractor for Česká televize (iVysílání) programme pages.

    Streams are served via RTMP, so downloading the resulting formats
    requires rtmpdump.
    """
    _VALID_URL = r'https?://www\.ceskatelevize\.cz/(porady|ivysilani)/(.+/)?(?P<id>[^?#]+)'

    _TESTS = [
        {
            'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/213512120230004-spanelska-chripka',
            'info_dict': {
                'id': '213512120230004',
                'ext': 'flv',
                'title': 'První republika: Španělská chřipka',
                'duration': 3107.4,
            },
            'params': {
                'skip_download': True,  # requires rtmpdump
            },
            'skip': 'Works only from Czech Republic.',
        },
        {
            'url': 'http://www.ceskatelevize.cz/ivysilani/1030584952-tsatsiki-maminka-a-policajt',
            'info_dict': {
                'id': '20138143440',
                'ext': 'flv',
                'title': 'Tsatsiki, maminka a policajt',
                'duration': 6754.1,
            },
            'params': {
                'skip_download': True,  # requires rtmpdump
            },
            'skip': 'Works only from Czech Republic.',
        },
        {
            'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/bonus/14716-zpevacka-z-duparny-bobina',
            'info_dict': {
                'id': '14716',
                'ext': 'flv',
                'title': 'První republika: Zpěvačka z Dupárny Bobina',
                'duration': 90,
            },
            'params': {
                'skip_download': True,  # requires rtmpdump
            },
        },
    ]

    def _real_extract(self, url):
        # Normalize programme URLs to the iVysilani player URL so that the
        # playlist AJAX endpoint accepts the request path.
        url = url.replace('/porady/', '/ivysilani/').replace('/video/', '')

        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        webpage = self._download_webpage(url, video_id)

        NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.'
        if '%s</p>' % NOT_AVAILABLE_STRING in webpage:
            raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)

        typ = self._html_search_regex(r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],', webpage, 'type')
        episode_id = self._html_search_regex(r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],', webpage, 'episode_id')

        data = {
            'playlist[0][type]': typ,
            'playlist[0][id]': episode_id,
            'requestUrl': compat_urllib_parse_urlparse(url).path,
            'requestSource': 'iVysilani',
        }

        req = compat_urllib_request.Request('http://www.ceskatelevize.cz/ivysilani/ajax/get-playlist-url',
                                            data=compat_urllib_parse.urlencode(data))

        req.add_header('Content-type', 'application/x-www-form-urlencoded')
        # The endpoint refuses non-local/non-AJAX-looking requests.
        req.add_header('x-addr', '127.0.0.1')
        req.add_header('X-Requested-With', 'XMLHttpRequest')
        req.add_header('Referer', url)

        playlistpage = self._download_json(req, video_id)

        req = compat_urllib_request.Request(compat_urllib_parse.unquote(playlistpage['url']))
        req.add_header('Referer', url)

        playlist = self._download_xml(req, video_id)

        formats = []
        # Fix: `duration` was previously only bound inside the loop, so an
        # empty or all-advertisement playlist crashed with NameError below.
        duration = None
        for i in playlist.find('smilRoot/body'):
            # Skip advertisement blocks; only real content carries duration.
            if 'AD' not in i.attrib['id']:
                base_url = i.attrib['base']
                parsedurl = compat_urllib_parse_urlparse(base_url)
                duration = i.attrib['duration']

                for video in i.findall('video'):
                    if video.attrib['label'] != 'AD':
                        format_id = video.attrib['label']
                        play_path = video.attrib['src']
                        vbr = int(video.attrib['system-bitrate'])

                        formats.append({
                            'format_id': format_id,
                            'url': base_url,
                            'vbr': vbr,
                            'play_path': play_path,
                            'app': parsedurl.path[1:] + '?' + parsedurl.query,
                            'rtmp_live': True,
                            'ext': 'flv',
                        })

        self._sort_formats(formats)

        return {
            'id': episode_id,
            'title': self._html_search_regex(r'<title>(.+?) — iVysílání — Česká televize</title>', webpage, 'title'),
            'duration': float(duration) if duration is not None else None,
            'formats': formats,
        }
index f0d08cebfce87b006b339508f655eba95a4bc1ef..bfbffefdc7bb78cc7003d886d0e528fd6f5e2269 100644 (file)
@@ -1,4 +1,5 @@
 # encoding: utf-8
+from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
@@ -8,73 +9,63 @@ from ..utils import (
 
 
 class CinemassacreIE(InfoExtractor):
-    _VALID_URL = r'(?:http://)?(?:www\.)?(?P<url>cinemassacre\.com/(?P<date_Y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/.+?)(?:[/?].*)?'
-    _TESTS = [{
-        u'url': u'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/',
-        u'file': u'19911.flv',
-        u'info_dict': {
-            u'upload_date': u'20121110',
-            u'title': u'“Angry Video Game Nerd: The Movie” – Trailer',
-            u'description': u'md5:fb87405fcb42a331742a0dce2708560b',
-        },
-        u'params': {
-            # rtmp download
-            u'skip_download': True,
-        },
-    },
-    {
-        u'url': u'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940',
-        u'file': u'521be8ef82b16.flv',
-        u'info_dict': {
-            u'upload_date': u'20131002',
-            u'title': u'The Mummy’s Hand (1940)',
-        },
-        u'params': {
-            # rtmp download
-            u'skip_download': True,
+    _VALID_URL = r'http://(?:www\.)?cinemassacre\.com/(?P<date_Y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/.+?'
+    _TESTS = [
+        {
+            'url': 'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/',
+            'file': '19911.mp4',
+            'md5': 'fde81fbafaee331785f58cd6c0d46190',
+            'info_dict': {
+                'upload_date': '20121110',
+                'title': '“Angry Video Game Nerd: The Movie” – Trailer',
+                'description': 'md5:fb87405fcb42a331742a0dce2708560b',
+            },
         },
-    }]
+        {
+            'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940',
+            'file': '521be8ef82b16.mp4',
+            'md5': 'd72f10cd39eac4215048f62ab477a511',
+            'info_dict': {
+                'upload_date': '20131002',
+                'title': 'The Mummy’s Hand (1940)',
+            },
+        }
+    ]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
 
-        webpage_url = u'http://' + mobj.group('url')
-        webpage = self._download_webpage(webpage_url, None) # Don't know video id yet
+        webpage = self._download_webpage(url, None)  # Don't know video id yet
         video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d')
         mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?id=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage)
         if not mobj:
-            raise ExtractorError(u'Can\'t extract embed url and video id')
-        playerdata_url = mobj.group(u'embed_url')
-        video_id = mobj.group(u'video_id')
+            raise ExtractorError('Can\'t extract embed url and video id')
+        playerdata_url = mobj.group('embed_url')
+        video_id = mobj.group('video_id')
 
         video_title = self._html_search_regex(r'<title>(?P<title>.+?)\|',
-            webpage, u'title')
+            webpage, 'title')
         video_description = self._html_search_regex(r'<div class="entry-content">(?P<description>.+?)</div>',
-            webpage, u'description', flags=re.DOTALL, fatal=False)
+            webpage, 'description', flags=re.DOTALL, fatal=False)
         if len(video_description) == 0:
             video_description = None
 
         playerdata = self._download_webpage(playerdata_url, video_id)
-        url = self._html_search_regex(r'\'streamer\': \'(?P<url>[^\']+)\'', playerdata, u'url')
 
-        sd_file = self._html_search_regex(r'\'file\': \'(?P<sd_file>[^\']+)\'', playerdata, u'sd_file')
-        hd_file = self._html_search_regex(r'\'?file\'?: "(?P<hd_file>[^"]+)"', playerdata, u'hd_file')
-        video_thumbnail = self._html_search_regex(r'\'image\': \'(?P<thumbnail>[^\']+)\'', playerdata, u'thumbnail', fatal=False)
+        sd_url = self._html_search_regex(r'file: \'(?P<sd_file>[^\']+)\', label: \'SD\'', playerdata, 'sd_file')
+        hd_url = self._html_search_regex(r'file: \'(?P<hd_file>[^\']+)\', label: \'HD\'', playerdata, 'hd_file')
+        video_thumbnail = self._html_search_regex(r'image: \'(?P<thumbnail>[^\']+)\'', playerdata, 'thumbnail', fatal=False)
 
         formats = [
             {
-                'url': url,
-                'play_path': 'mp4:' + sd_file,
-                'rtmp_live': True, # workaround
-                'ext': 'flv',
+                'url': sd_url,
+                'ext': 'mp4',
                 'format': 'sd',
                 'format_id': 'sd',
             },
             {
-                'url': url,
-                'play_path': 'mp4:' + hd_file,
-                'rtmp_live': True, # workaround
-                'ext': 'flv',
+                'url': hd_url,
+                'ext': 'mp4',
                 'format': 'hd',
                 'format_id': 'hd',
             },
index 10c925dfe1c7d44d7d1f6abe6a72b79d12230abf..6b9fa420903e4e1165a191e88e56886f8cb6058e 100644 (file)
@@ -40,7 +40,7 @@ class CollegeHumorIE(InfoExtractor):
             'id': 'W5gMp3ZjYg4',
             'ext': 'mp4',
             'title': 'Funny Dogs Protecting Babies Compilation 2014 [NEW HD]',
-            'uploader': 'Funnyplox TV',
+            'uploader': 'FunnyPlox TV',
             'uploader_id': 'funnyploxtv',
             'description': 'md5:7ded37421526d54afdf005e25bc2b7a3',
             'upload_date': '20140128',
index 84fca8ba0b2577696877c117a13fcc0a5ce40735..7ee95fe391ad9cac97c9cf0b0364c5a2a88b6f84 100644 (file)
@@ -88,6 +88,10 @@ class InfoExtractor(object):
 
     The following fields are optional:
 
+    display_id      An alternative identifier for the video, not necessarily
+                    unique, but available before title. Typically, id is
+                    something like "4234987", title "Dancing naked mole rats",
+                    and display_id "dancing-naked-mole-rats"
     thumbnails:     A list of dictionaries (with the entries "resolution" and
                     "url") for the varying thumbnails
     thumbnail:      Full URL to a video thumbnail image.
@@ -432,14 +436,14 @@ class InfoExtractor(object):
         if secure: regexes = self._og_regexes('video:secure_url') + regexes
         return self._html_search_regex(regexes, html, name, **kargs)
 
-    def _html_search_meta(self, name, html, display_name=None):
+    def _html_search_meta(self, name, html, display_name=None, fatal=False):
         if display_name is None:
             display_name = name
         return self._html_search_regex(
             r'''(?ix)<meta
                     (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
                     [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
-            html, display_name, fatal=False)
+            html, display_name, fatal=fatal)
 
     def _dc_search_uploader(self, html):
         return self._html_search_meta('dc.creator', html, 'uploader')
index 920728e01f1c2b2a6e6b75c3b6740e84b3043897..026a9177e754de7d606961e6e4793af86da49fe2 100644 (file)
@@ -1,7 +1,11 @@
 # encoding: utf-8
 from __future__ import unicode_literals
 
-import re, base64, zlib
+import re
+import json
+import base64
+import zlib
+
 from hashlib import sha1
 from math import pow, sqrt, floor
 from .common import InfoExtractor
@@ -19,13 +23,15 @@ from ..aes import (
     inc,
 )
 
+
 class CrunchyrollIE(InfoExtractor):
-    _VALID_URL = r'(?:https?://)?(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)'
-    _TESTS = [{
+    _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)'
+    _TEST = {
         'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513',
-        'file': '645513.flv',
         #'md5': 'b1639fd6ddfaa43788c85f6d1dddd412',
         'info_dict': {
+            'id': '645513',
+            'ext': 'flv',
             'title': 'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is Born!',
             'description': 'md5:2d17137920c64f2f49981a7797d275ef',
             'thumbnail': 'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg',
@@ -36,7 +42,7 @@ class CrunchyrollIE(InfoExtractor):
             # rtmp
             'skip_download': True,
         },
-    }]
+    }
 
     _FORMAT_IDS = {
         '360': ('60', '106'),
@@ -68,7 +74,7 @@ class CrunchyrollIE(InfoExtractor):
             shaHash = bytes_to_intlist(sha1(prefix + str(num4).encode('ascii')).digest())
             # Extend 160 Bit hash to 256 Bit
             return shaHash + [0] * 12
-        
+
         key = obfuscate_key(id)
         class Counter:
             __value = iv
@@ -80,9 +86,8 @@ class CrunchyrollIE(InfoExtractor):
         return zlib.decompress(decrypted_data)
 
     def _convert_subtitles_to_srt(self, subtitles):
-        i=1
         output = ''
-        for start, end, text in re.findall(r'<event [^>]*?start="([^"]+)" [^>]*?end="([^"]+)" [^>]*?text="([^"]+)"[^>]*?>', subtitles):
+        for i, (start, end, text) in enumerate(re.findall(r'<event [^>]*?start="([^"]+)" [^>]*?end="([^"]+)" [^>]*?text="([^"]+)"[^>]*?>', subtitles), 1):
             start = start.replace('.', ',')
             end = end.replace('.', ',')
             text = clean_html(text)
@@ -90,7 +95,6 @@ class CrunchyrollIE(InfoExtractor):
             if not text:
                 continue
             output += '%d\n%s --> %s\n%s\n\n' % (i, start, end, text)
-            i+=1
         return output
 
     def _real_extract(self,url):
@@ -108,6 +112,12 @@ class CrunchyrollIE(InfoExtractor):
         if note_m:
             raise ExtractorError(note_m)
 
+        mobj = re.search(r'Page\.messaging_box_controller\.addItems\(\[(?P<msg>{.+?})\]\)', webpage)
+        if mobj:
+            msg = json.loads(mobj.group('msg'))
+            if msg.get('type') == 'error':
+                raise ExtractorError('crunchyroll returned error: %s' % msg['message_body'], expected=True)
+
         video_title = self._html_search_regex(r'<h1[^>]*>(.+?)</h1>', webpage, 'video_title', flags=re.DOTALL)
         video_title = re.sub(r' {2,}', ' ', video_title)
         video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, 'video_description', default='')
@@ -123,7 +133,7 @@ class CrunchyrollIE(InfoExtractor):
         playerdata_req.data = compat_urllib_parse.urlencode({'current_page': webpage_url})
         playerdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
         playerdata = self._download_webpage(playerdata_req, video_id, note='Downloading media info')
-        
+
         stream_id = self._search_regex(r'<media_id>([^<]+)', playerdata, 'stream_id')
         video_thumbnail = self._search_regex(r'<episode_image_url>([^<]+)', playerdata, 'thumbnail', fatal=False)
 
@@ -161,7 +171,7 @@ class CrunchyrollIE(InfoExtractor):
             data = base64.b64decode(data)
 
             subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8')
-            lang_code = self._search_regex(r'lang_code=\'([^\']+)', subtitle, 'subtitle_lang_code', fatal=False)
+            lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False)
             if not lang_code:
                 continue
             subtitles[lang_code] = self._convert_subtitles_to_srt(subtitle)
index 6685c94a3d6b283e0b7f2240ebfcf35ce462edc2..10b97d8cac732c0902aa67cb738122fbf80c3d83 100644 (file)
@@ -12,6 +12,7 @@ from ..utils import (
     get_element_by_id,
     orderedSet,
     str_to_int,
+    int_or_none,
 
     ExtractorError,
 )
@@ -124,7 +125,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
             if video_url is not None:
                 m_size = re.search(r'H264-(\d+)x(\d+)', video_url)
                 if m_size is not None:
-                    width, height = m_size.group(1), m_size.group(2)
+                    width, height = map(int_or_none, (m_size.group(1), m_size.group(2)))
                 else:
                     width, height = None, None
                 formats.append({
index 8f9154c0e6864d04ef6c4e0f441ef4a68e6d30d2..f6b5f589a3cc9f0647fa41e628adf426d7d56ca6 100644 (file)
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 import json
 import re
 import socket
@@ -26,20 +28,21 @@ class FacebookIE(InfoExtractor):
     _LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1'
     _CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1'
     _NETRC_MACHINE = 'facebook'
-    IE_NAME = u'facebook'
+    IE_NAME = 'facebook'
     _TEST = {
-        u'url': u'https://www.facebook.com/photo.php?v=120708114770723',
-        u'file': u'120708114770723.mp4',
-        u'md5': u'48975a41ccc4b7a581abd68651c1a5a8',
-        u'info_dict': {
-            u"duration": 279,
-            u"title": u"PEOPLE ARE AWESOME 2013"
+        'url': 'https://www.facebook.com/photo.php?v=120708114770723',
+        'md5': '48975a41ccc4b7a581abd68651c1a5a8',
+        'info_dict': {
+            'id': '120708114770723',
+            'ext': 'mp4',
+            'duration': 279,
+            'title': 'PEOPLE ARE AWESOME 2013'
         }
     }
 
     def report_login(self):
         """Report attempt to log in."""
-        self.to_screen(u'Logging in')
+        self.to_screen('Logging in')
 
     def _login(self):
         (useremail, password) = self._get_login_info()
@@ -50,9 +53,11 @@ class FacebookIE(InfoExtractor):
         login_page_req.add_header('Cookie', 'locale=en_US')
         self.report_login()
         login_page = self._download_webpage(login_page_req, None, note=False,
-            errnote=u'Unable to download login page')
-        lsd = self._search_regex(r'"lsd":"(\w*?)"', login_page, u'lsd')
-        lgnrnd = self._search_regex(r'name="lgnrnd" value="([^"]*?)"', login_page, u'lgnrnd')
+            errnote='Unable to download login page')
+        lsd = self._search_regex(
+            r'<input type="hidden" name="lsd" value="([^"]*)"',
+            login_page, 'lsd')
+        lgnrnd = self._search_regex(r'name="lgnrnd" value="([^"]*?)"', login_page, 'lgnrnd')
 
         login_form = {
             'email': useremail,
@@ -70,22 +75,22 @@ class FacebookIE(InfoExtractor):
         try:
             login_results = compat_urllib_request.urlopen(request).read()
             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
-                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
+                self._downloader.report_warning('unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                 return
 
             check_form = {
-                'fb_dtsg': self._search_regex(r'"fb_dtsg":"(.*?)"', login_results, u'fb_dtsg'),
-                'nh': self._search_regex(r'name="nh" value="(\w*?)"', login_results, u'nh'),
+                'fb_dtsg': self._search_regex(r'"fb_dtsg":"(.*?)"', login_results, 'fb_dtsg'),
+                'nh': self._search_regex(r'name="nh" value="(\w*?)"', login_results, 'nh'),
                 'name_action_selected': 'dont_save',
-                'submit[Continue]': self._search_regex(r'<input value="(.*?)" name="submit\[Continue\]"', login_results, u'continue'),
+                'submit[Continue]': self._search_regex(r'<input value="(.*?)" name="submit\[Continue\]"', login_results, 'continue'),
             }
             check_req = compat_urllib_request.Request(self._CHECKPOINT_URL, compat_urllib_parse.urlencode(check_form))
             check_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
             check_response = compat_urllib_request.urlopen(check_req).read()
             if re.search(r'id="checkpointSubmitButton"', check_response) is not None:
-                self._downloader.report_warning(u'Unable to confirm login, you have to login in your brower and authorize the login.')
+                self._downloader.report_warning('Unable to confirm login, you have to login in your brower and authorize the login.')
         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
+            self._downloader.report_warning('unable to log in: %s' % compat_str(err))
             return
 
     def _real_initialize(self):
@@ -94,7 +99,7 @@ class FacebookIE(InfoExtractor):
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
+            raise ExtractorError('Invalid URL: %s' % url)
         video_id = mobj.group('id')
 
         url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
@@ -107,10 +112,10 @@ class FacebookIE(InfoExtractor):
             m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage)
             if m_msg is not None:
                 raise ExtractorError(
-                    u'The video is not available, Facebook said: "%s"' % m_msg.group(1),
+                    'The video is not available, Facebook said: "%s"' % m_msg.group(1),
                     expected=True)
             else:
-                raise ExtractorError(u'Cannot parse data')
+                raise ExtractorError('Cannot parse data')
         data = dict(json.loads(m.group(1)))
         params_raw = compat_urllib_parse.unquote(data['params'])
         params = json.loads(params_raw)
@@ -119,12 +124,12 @@ class FacebookIE(InfoExtractor):
         if not video_url:
             video_url = video_data['sd_src']
         if not video_url:
-            raise ExtractorError(u'Cannot find video URL')
+            raise ExtractorError('Cannot find video URL')
         video_duration = int(video_data['video_duration'])
         thumbnail = video_data['thumbnail_src']
 
         video_title = self._html_search_regex(
-            r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, u'title')
+            r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, 'title')
 
         info = {
             'id': video_id,
index 8db7fc6cba1c13b4a1b6ded218e41e0e30dc8ae4..7d56b9be93a0332e70381c3a46b748c6d39e5b6a 100644 (file)
@@ -8,8 +8,8 @@ from ..utils import (
     unified_strdate,
     str_to_int,
     parse_duration,
+    clean_html,
 )
-from youtube_dl.utils import clean_html
 
 
 class FourTubeIE(InfoExtractor):
diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py
new file mode 100644 (file)
index 0000000..89d5994
--- /dev/null
@@ -0,0 +1,134 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_parse,
+    compat_urllib_request,
+)
+
class GDCVaultIE(InfoExtractor):
    """Extractor for GDC Vault talk recordings (gdcvault.com/play/...).

    Newer talks are plain MP4 downloads; older ones are two RTMP streams
    (slide deck + speaker camera). Some talks require a member login,
    handled via --username/--password.
    """
    _VALID_URL = r'https?://(?:www\.)?gdcvault\.com/play/(?P<id>\d+)/(?P<name>(\w|-)+)'
    _TESTS = [
        {
            'url': 'http://www.gdcvault.com/play/1019721/Doki-Doki-Universe-Sweet-Simple',
            'md5': '7ce8388f544c88b7ac11c7ab1b593704',
            'info_dict': {
                'id': '1019721',
                'ext': 'mp4',
                'title': 'Doki-Doki Universe: Sweet, Simple and Genuine (GDC Next 10)'
            }
        },
        {
            'url': 'http://www.gdcvault.com/play/1015683/Embracing-the-Dark-Art-of',
            'info_dict': {
                'id': '1015683',
                'ext': 'flv',
                'title': 'Embracing the Dark Art of Mathematical Modeling in AI'
            },
            'params': {
                'skip_download': True,  # Requires rtmpdump
            }
        },
    ]

    def _parse_mp4(self, xml_description):
        """Build HTTP MP4 formats from the talk's XML description.

        Returns a list of format dicts, or None when the description has
        no <mp4video> node (i.e. the talk is only available as RTMP/FLV).
        """
        video_formats = []
        mp4_video = xml_description.find('./metadata/mp4video')
        if mp4_video is None:
            return None

        # The scheme+host part of the sample URL is the root for all bitrates.
        mobj = re.match(r'(?P<root>https?://.*?/).*', mp4_video.text)
        video_root = mobj.group('root')
        mbr_videos = xml_description.findall('./metadata/MBRVideos/MBRVideo')
        for mbr_video in mbr_videos:  # renamed from `format` (shadowed builtin)
            mobj = re.match(r'mp4\:(?P<path>.*)', mbr_video.find('streamName').text)
            url = video_root + mobj.group('path')
            vbr = mbr_video.find('bitrate').text
            video_formats.append({
                'url': url,
                'vbr': int(vbr),
            })
        return video_formats

    def _parse_flv(self, xml_description):
        """Build the two RTMP formats (slides + speaker) for older talks."""
        video_formats = []
        akami_url = xml_description.find('./metadata/akamaiHost').text
        slide_video_path = xml_description.find('./metadata/slideVideo').text
        video_formats.append({
            'url': 'rtmp://' + akami_url + '/' + slide_video_path,
            'format_note': 'slide deck video',
            'quality': -2,
            'preference': -2,
            'format_id': 'slides',
        })
        speaker_video_path = xml_description.find('./metadata/speakerVideo').text
        video_formats.append({
            'url': 'rtmp://' + akami_url + '/' + speaker_video_path,
            'format_note': 'speaker video',
            'quality': -1,
            'preference': -1,
            'format_id': 'speaker',
        })
        return video_formats

    def _login(self, webpage_url, video_id):
        """Log in and re-fetch the talk page.

        Returns the authenticated page HTML, or None when no credentials
        were supplied.
        """
        (username, password) = self._get_login_info()
        if username is None or password is None:
            self.report_warning('It looks like ' + webpage_url + ' requires a login. Try specifying a username and password and try again.')
            return None

        mobj = re.match(r'(?P<root_url>https?://.*?/).*', webpage_url)
        login_url = mobj.group('root_url') + 'api/login.php'
        logout_url = mobj.group('root_url') + 'logout'

        login_form = {
            'email': username,
            'password': password,
        }

        request = compat_urllib_request.Request(login_url, compat_urllib_parse.urlencode(login_form))
        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        self._download_webpage(request, video_id, 'Logging in')
        start_page = self._download_webpage(webpage_url, video_id, 'Getting authenticated video page')
        # Log out again so the session cookie is not left authenticated.
        self._download_webpage(logout_url, video_id, 'Logging out')

        return start_page

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        webpage_url = 'http://www.gdcvault.com/play/' + video_id
        start_page = self._download_webpage(webpage_url, video_id)

        xml_root = self._html_search_regex(
            r'<iframe src="(?P<xml_root>.*?)player.html.*?".*?</iframe>',
            start_page, 'xml root', default=None, fatal=False)

        if xml_root is None:
            # Probably need to authenticate
            login_page = self._login(webpage_url, video_id)
            if login_page is None:
                self.report_warning('Could not login.')
            else:
                # Fix: previously start_page was overwritten with None on a
                # failed login, crashing the searches below with a confusing
                # TypeError; keep the original page unless login succeeded.
                start_page = login_page
                # Grab the url from the authenticated page
                xml_root = self._html_search_regex(
                    r'<iframe src="(?P<xml_root>.*?)player.html.*?".*?</iframe>',
                    start_page, 'xml root')

        if xml_root is None:
            # Without the player iframe there is nothing to download; fail
            # with a clear message instead of a TypeError on concatenation.
            from ..utils import ExtractorError  # not in this module's header imports
            raise ExtractorError(
                'Could not find the XML description URL (a login may be required)',
                expected=True)

        xml_name = self._html_search_regex(
            r'<iframe src=".*?\?xml=(?P<xml_file>.+?\.xml).*?".*?</iframe>',
            start_page, 'xml filename', default=None, fatal=False)
        if xml_name is None:
            # Fallback to the older format
            xml_name = self._html_search_regex(
                r'<iframe src=".*?\?xmlURL=xml/(?P<xml_file>.+?\.xml).*?".*?</iframe>',
                start_page, 'xml filename')

        xml_description_url = xml_root + 'xml/' + xml_name
        xml_description = self._download_xml(xml_description_url, video_id)

        video_title = xml_description.find('./metadata/title').text
        video_formats = self._parse_mp4(xml_description)
        if video_formats is None:
            video_formats = self._parse_flv(xml_description)

        return {
            'id': video_id,
            'title': video_title,
            'formats': video_formats,
        }
index e84c022a587cf7cae22f3d6ec33fdbf15f07098b..dd60bc418e2214c2c71f2f41a5e629a9ccc74192 100644 (file)
@@ -83,10 +83,10 @@ class GenericIE(InfoExtractor):
         # Direct link to a video
         {
             'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
-            'file': 'trailer.mp4',
             'md5': '67d406c2bcb6af27fa886f31aa934bbe',
             'info_dict': {
                 'id': 'trailer',
+                'ext': 'mp4',
                 'title': 'trailer',
                 'upload_date': '20100513',
             }
@@ -94,7 +94,6 @@ class GenericIE(InfoExtractor):
         # ooyala video
         {
             'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
-            'file': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ.mp4',
             'md5': '5644c6ca5d5782c1d0d350dad9bd840c',
             'info_dict': {
                 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
@@ -102,6 +101,22 @@ class GenericIE(InfoExtractor):
                 'title': '2cc213299525360.mov',  # that's what we get
             },
         },
+        # google redirect
+        {
+            'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
+            'info_dict': {
+                'id': 'cmQHVoWB5FY',
+                'ext': 'mp4',
+                'upload_date': '20130224',
+                'uploader_id': 'TheVerge',
+                'description': 'Chris Ziegler takes a look at the Alcatel OneTouch Fire and the ZTE Open; two of the first Firefox OS handsets to be officially announced.',
+                'uploader': 'The Verge',
+                'title': 'First Firefox OS phones side-by-side',
+            },
+            'params': {
+                'skip_download': False,
+            }
+        },
         # embed.ly video
         {
             'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
@@ -375,11 +390,17 @@ class GenericIE(InfoExtractor):
         if mobj is not None:
             return self.url_result(mobj.group(1), 'Mpora')
 
-        # Look for embedded Novamov player
+        # Look for embedded NovaMov player
         mobj = re.search(
             r'<iframe[^>]+?src=(["\'])(?P<url>http://(?:(?:embed|www)\.)?novamov\.com/embed\.php.+?)\1', webpage)
         if mobj is not None:
-            return self.url_result(mobj.group('url'), 'Novamov')
+            return self.url_result(mobj.group('url'), 'NovaMov')
+
+        # Look for embedded NowVideo player
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>http://(?:(?:embed|www)\.)?nowvideo\.(?:ch|sx|eu)/embed\.php.+?)\1', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'NowVideo')
 
         # Look for embedded Facebook player
         mobj = re.search(
@@ -387,6 +408,11 @@ class GenericIE(InfoExtractor):
         if mobj is not None:
             return self.url_result(mobj.group('url'), 'Facebook')
 
+        # Look for embedded VK player
+        mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'VK')
+
         # Look for embedded Huffington Post player
         mobj = re.search(
             r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
@@ -425,6 +451,18 @@ class GenericIE(InfoExtractor):
         if mobj is None:
             # HTML5 video
             mobj = re.search(r'<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage, flags=re.DOTALL)
+        if mobj is None:
+            mobj = re.search(
+                r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
+                r'(?:[a-z-]+="[^"]+"\s+)*?content="[0-9]{,2};url=\'([^\']+)\'"',
+                webpage)
+            if mobj:
+                new_url = mobj.group(1)
+                self.report_following_redirect(new_url)
+                return {
+                    '_type': 'url',
+                    'url': new_url,
+                }
         if mobj is None:
             raise ExtractorError('Unsupported URL: %s' % url)
 
index dde4829981a7adab396a040ccc1566277c39194b..7956e7624db36d74cd4120c1764d0190f8d1dff5 100644 (file)
@@ -10,7 +10,7 @@ from ..utils import compat_urllib_request
 
 
 class IPrimaIE(InfoExtractor):
-    _VALID_URL = r'https?://play\.iprima\.cz/(?P<videogroup>.+)/(?P<videoid>.+)'
+    _VALID_URL = r'https?://play\.iprima\.cz/[^?#]+/(?P<id>[^?#]+)'
 
     _TESTS = [{
         'url': 'http://play.iprima.cz/particka/particka-92',
@@ -22,20 +22,32 @@ class IPrimaIE(InfoExtractor):
             'thumbnail': 'http://play.iprima.cz/sites/default/files/image_crops/image_620x349/3/491483_particka-92_image_620x349.jpg',
         },
         'params': {
-            'skip_download': True,
+            'skip_download': True,  # requires rtmpdump
         },
-    },
-    ]
+    }, {
+        'url': 'http://play.iprima.cz/particka/tchibo-particka-jarni-moda',
+        'info_dict': {
+            'id': '9718337',
+            'ext': 'flv',
+            'title': 'Tchibo Partička - Jarní móda',
+            'description': 'md5:589f8f59f414220621ff8882eb3ce7be',
+            'thumbnail': 're:^http:.*\.jpg$',
+        },
+        'params': {
+            'skip_download': True,  # requires rtmpdump
+        },
+    }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('videoid')
+        video_id = mobj.group('id')
 
         webpage = self._download_webpage(url, video_id)
 
-        player_url = 'http://embed.livebox.cz/iprimaplay/player-embed-v2.js?__tok%s__=%s' % (
-                         floor(random()*1073741824),
-                         floor(random()*1073741824))
+        player_url = (
+            'http://embed.livebox.cz/iprimaplay/player-embed-v2.js?__tok%s__=%s' %
+            (floor(random()*1073741824), floor(random()*1073741824))
+        )
 
         req = compat_urllib_request.Request(player_url)
         req.add_header('Referer', url)
@@ -44,18 +56,20 @@ class IPrimaIE(InfoExtractor):
         base_url = ''.join(re.findall(r"embed\['stream'\] = '(.+?)'.+'(\?auth=)'.+'(.+?)';", playerpage)[1])
 
         zoneGEO = self._html_search_regex(r'"zoneGEO":(.+?),', webpage, 'zoneGEO')
-
         if zoneGEO != '0':
-            base_url = base_url.replace('token', 'token_'+zoneGEO)
+            base_url = base_url.replace('token', 'token_' + zoneGEO)
 
         formats = []
         for format_id in ['lq', 'hq', 'hd']:
-            filename = self._html_search_regex(r'"%s_id":(.+?),' % format_id, webpage, 'filename')
+            filename = self._html_search_regex(
+                r'"%s_id":(.+?),' % format_id, webpage, 'filename')
 
             if filename == 'null':
                 continue
 
-            real_id = self._search_regex(r'Prima-[0-9]{10}-([0-9]+)_', filename, 'real video id')
+            real_id = self._search_regex(
+                r'Prima-(?:[0-9]{10}|WEB)-([0-9]+)[-_]',
+                filename, 'real video id')
 
             if format_id == 'lq':
                 quality = 0
@@ -63,13 +77,13 @@ class IPrimaIE(InfoExtractor):
                 quality = 1
             elif format_id == 'hd':
                 quality = 2
-                filename = 'hq/'+filename
+                filename = 'hq/' + filename
 
             formats.append({
                 'format_id': format_id,
                 'url': base_url,
                 'quality': quality,
-                'play_path': 'mp4:'+filename.replace('"', '')[:-4],
+                'play_path': 'mp4:' + filename.replace('"', '')[:-4],
                 'rtmp_live': True,
                 'ext': 'flv',
             })
index 7b7185f9adb69f37dee1e4c4b468de8a5a95a556..7a431a274abc5b189af8ee8779f6024f430704c7 100644 (file)
@@ -6,7 +6,8 @@ import re
 from .common import InfoExtractor
 from ..utils import (
     int_or_none,
-    unified_strdate
+    unified_strdate,
+    ExtractorError,
 )
 
 
@@ -32,13 +33,11 @@ class LifeNewsIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
 
-        webpage = self._download_webpage('http://lifenews.ru/mobile/news/%s' % video_id, video_id, 'Downloading page')
+        webpage = self._download_webpage('http://lifenews.ru/news/%s' % video_id, video_id, 'Downloading page')
 
-        video_url = self._html_search_regex(
-            r'<video.*?src="([^"]+)".*?></video>', webpage, 'video URL')
-
-        thumbnail = self._html_search_regex(
-            r'<video.*?poster="([^"]+)".*?"></video>', webpage, 'video thumbnail')
+        videos = re.findall(r'<video.*?poster="(?P<poster>[^"]+)".*?src="(?P<video>[^"]+)".*?></video>', webpage)
+        if not videos:
+            raise ExtractorError('No media links available for %s' % video_id)
 
         title = self._og_search_title(webpage)
         TITLE_SUFFIX = ' - Первый по срочным новостям — LIFE | NEWS'
@@ -50,20 +49,26 @@ class LifeNewsIE(InfoExtractor):
         view_count = self._html_search_regex(
             r'<div class=\'views\'>(\d+)</div>', webpage, 'view count', fatal=False)
         comment_count = self._html_search_regex(
-            r'<div class=\'comments\'>(\d+)</div>', webpage, 'comment count', fatal=False)
+            r'<div class=\'comments\'>\s*<span class=\'counter\'>(\d+)</span>', webpage, 'comment count', fatal=False)
 
         upload_date = self._html_search_regex(
             r'<time datetime=\'([^\']+)\'>', webpage, 'upload date',fatal=False)
         if upload_date is not None:
             upload_date = unified_strdate(upload_date)
 
-        return {
-            'id': video_id,
-            'url': video_url,
-            'thumbnail': thumbnail,
-            'title': title,
-            'description': description,
-            'view_count': int_or_none(view_count),
-            'comment_count': int_or_none(comment_count),
-            'upload_date': upload_date,
-        }
\ No newline at end of file
+        def make_entry(video_id, media, video_number=None):
+            return {
+                'id': video_id,
+                'url': media[1],
+                'thumbnail': media[0],
+                'title': title if video_number is None else '%s-video%s' % (title, video_number),
+                'description': description,
+                'view_count': int_or_none(view_count),
+                'comment_count': int_or_none(comment_count),
+                'upload_date': upload_date,
+            }
+
+        if len(videos) == 1:
+            return make_entry(video_id, videos[0])
+        else:
+            return [make_entry(video_id, media, video_number+1) for video_number, media in enumerate(videos)]
\ No newline at end of file
diff --git a/youtube_dl/extractor/mailru.py b/youtube_dl/extractor/mailru.py
new file mode 100644 (file)
index 0000000..f819c09
--- /dev/null
@@ -0,0 +1,66 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+import datetime
+
+from .common import InfoExtractor
+
+
+class MailRuIE(InfoExtractor):
+    IE_NAME = 'mailru'
+    IE_DESC = 'Видео@Mail.Ru'
+    _VALID_URL = r'http://(?:www\.)?my\.mail\.ru/video/.*#video=/?(?P<id>[^/]+/[^/]+/[^/]+/\d+)'
+
+    _TEST = {
+        'url': 'http://my.mail.ru/video/top#video=/mail/sonypicturesrus/75/76',
+        'md5': 'dea205f03120046894db4ebb6159879a',
+        'info_dict': {
+            'id': '46301138',
+            'ext': 'mp4',
+            'title': 'Новый Человек-Паук. Высокое напряжение. Восстание Электро',
+            'upload_date': '20140224',
+            'uploader': 'sonypicturesrus',
+            'uploader_id': 'sonypicturesrus@mail.ru',
+            'duration': 184,
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        video_data = self._download_json(
+            'http://videoapi.my.mail.ru/videos/%s.json?new=1' % video_id, video_id, 'Downloading video JSON')
+
+        author = video_data['author']
+        uploader = author['name']
+        uploader_id = author['id']
+
+        movie = video_data['movie']
+        content_id = str(movie['contentId'])
+        title = movie['title']
+        thumbnail = movie['poster']
+        duration = movie['duration']
+
+        upload_date = datetime.datetime.fromtimestamp(video_data['timestamp']).strftime('%Y%m%d')
+        view_count = video_data['views_count']
+
+        formats = [
+            {
+                'url': video['url'],
+                'format_id': video['name'],
+            } for video in video_data['videos']
+        ]
+
+        return {
+            'id': content_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'upload_date': upload_date,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'duration': duration,
+            'view_count': view_count,
+            'formats': formats,
+        }
\ No newline at end of file
index 99d3c83a5e4c3a31d71e9f487e13930af498dc1f..30103119785ed9ca530fc7fe2904603f5f64a450 100644 (file)
@@ -166,6 +166,7 @@ class MetacafeIE(InfoExtractor):
 
         video_title = self._html_search_regex(r'(?im)<title>(.*) - Video</title>', webpage, u'title')
         description = self._og_search_description(webpage)
+        thumbnail = self._og_search_thumbnail(webpage)
         video_uploader = self._html_search_regex(
                 r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);',
                 webpage, u'uploader nickname', fatal=False)
@@ -183,6 +184,7 @@ class MetacafeIE(InfoExtractor):
             'uploader': video_uploader,
             'upload_date':  None,
             'title':    video_title,
+            'thumbnail':thumbnail,
             'ext':      video_ext,
             'age_limit': age_limit,
         }
index 76b717fe5dbac08b8b103a1e44192a6fbf6d2a55..807b1dc89b608333e06c1fbab2e9d806fb7d090f 100644 (file)
@@ -1,24 +1,30 @@
+from __future__ import unicode_literals
+
 import re
 import json
 
 from .common import InfoExtractor
+from .youtube import YoutubeIE
 from ..utils import (
+    compat_urlparse,
     clean_html,
+    ExtractorError,
     get_element_by_id,
 )
 
 
 class TechTVMITIE(InfoExtractor):
-    IE_NAME = u'techtv.mit.edu'
+    IE_NAME = 'techtv.mit.edu'
     _VALID_URL = r'https?://techtv\.mit\.edu/(videos|embeds)/(?P<id>\d+)'
 
     _TEST = {
-        u'url': u'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set',
-        u'file': u'25418.mp4',
-        u'md5': u'1f8cb3e170d41fd74add04d3c9330e5f',
-        u'info_dict': {
-            u'title': u'MIT DNA Learning Center Set',
-            u'description': u'md5:82313335e8a8a3f243351ba55bc1b474',
+        'url': 'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set',
+        'md5': '1f8cb3e170d41fd74add04d3c9330e5f',
+        'info_dict': {
+            'id': '25418',
+            'ext': 'mp4',
+            'title': 'MIT DNA Learning Center Set',
+            'description': 'md5:82313335e8a8a3f243351ba55bc1b474',
         },
     }
 
@@ -27,12 +33,12 @@ class TechTVMITIE(InfoExtractor):
         video_id = mobj.group('id')
         raw_page = self._download_webpage(
             'http://techtv.mit.edu/videos/%s' % video_id, video_id)
-        clean_page = re.compile(u'<!--.*?-->', re.S).sub(u'', raw_page)
+        clean_page = re.compile(r'<!--.*?-->', re.S).sub('', raw_page)
 
-        base_url = self._search_regex(r'ipadUrl: \'(.+?cloudfront.net/)',
-            raw_page, u'base url')
-        formats_json = self._search_regex(r'bitrates: (\[.+?\])', raw_page,
-            u'video formats')
+        base_url = self._search_regex(
+            r'ipadUrl: \'(.+?cloudfront.net/)', raw_page, 'base url')
+        formats_json = self._search_regex(
+            r'bitrates: (\[.+?\])', raw_page, 'video formats')
         formats_mit = json.loads(formats_json)
         formats = [
             {
@@ -48,28 +54,31 @@ class TechTVMITIE(InfoExtractor):
 
         title = get_element_by_id('edit-title', clean_page)
         description = clean_html(get_element_by_id('edit-description', clean_page))
-        thumbnail = self._search_regex(r'playlist:.*?url: \'(.+?)\'',
-            raw_page, u'thumbnail', flags=re.DOTALL)
+        thumbnail = self._search_regex(
+            r'playlist:.*?url: \'(.+?)\'',
+            raw_page, 'thumbnail', flags=re.DOTALL)
 
-        return {'id': video_id,
-                'title': title,
-                'formats': formats,
-                'description': description,
-                'thumbnail': thumbnail,
-                }
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'description': description,
+            'thumbnail': thumbnail,
+        }
 
 
 class MITIE(TechTVMITIE):
-    IE_NAME = u'video.mit.edu'
+    IE_NAME = 'video.mit.edu'
     _VALID_URL = r'https?://video\.mit\.edu/watch/(?P<title>[^/]+)'
 
     _TEST = {
-        u'url': u'http://video.mit.edu/watch/the-government-is-profiling-you-13222/',
-        u'file': u'21783.mp4',
-        u'md5': u'7db01d5ccc1895fc5010e9c9e13648da',
-        u'info_dict': {
-            u'title': u'The Government is Profiling You',
-            u'description': u'md5:ad5795fe1e1623b73620dbfd47df9afd',
+        'url': 'http://video.mit.edu/watch/the-government-is-profiling-you-13222/',
+        'md5': '7db01d5ccc1895fc5010e9c9e13648da',
+        'info_dict': {
+            'id': '21783',
+            'ext': 'mp4',
+            'title': 'The Government is Profiling You',
+            'description': 'md5:ad5795fe1e1623b73620dbfd47df9afd',
         },
     }
 
@@ -77,7 +86,73 @@ class MITIE(TechTVMITIE):
         mobj = re.match(self._VALID_URL, url)
         page_title = mobj.group('title')
         webpage = self._download_webpage(url, page_title)
-        self.to_screen('%s: Extracting %s url' % (page_title, TechTVMITIE.IE_NAME))
-        embed_url = self._search_regex(r'<iframe .*?src="(.+?)"', webpage,
-            u'embed url')
+        embed_url = self._search_regex(
+            r'<iframe .*?src="(.+?)"', webpage, 'embed url')
         return self.url_result(embed_url, ie='TechTVMIT')
+
+
+class OCWMITIE(InfoExtractor):
+    IE_NAME = 'ocw.mit.edu'
+    _VALID_URL = r'^http://ocw\.mit\.edu/courses/(?P<topic>[a-z0-9\-]+)'
+    _BASE_URL = 'http://ocw.mit.edu/'
+
+    _TESTS = [
+        {
+            'url': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/',
+            'info_dict': {
+                'id': 'EObHWIEKGjA',
+                'ext': 'mp4',
+                'title': 'Lecture 7: Multiple Discrete Random Variables: Expectations, Conditioning, Independence',
+                'description': 'In this lecture, the professor discussed multiple random variables, expectations, and binomial distribution.',
+                #'subtitles': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/MIT6_041F11_lec07_300k.mp4.srt'
+            }
+        },
+        {
+            'url': 'http://ocw.mit.edu/courses/mathematics/18-01sc-single-variable-calculus-fall-2010/1.-differentiation/part-a-definition-and-basic-rules/session-1-introduction-to-derivatives/',
+            'info_dict': {
+                'id': '7K1sB05pE0A',
+                'ext': 'mp4',
+                'title': 'Session 1: Introduction to Derivatives',
+                'description': 'This section contains lecture video excerpts, lecture notes, an interactive mathlet with supporting documents, and problem solving videos.',
+                #'subtitles': 'http://ocw.mit.edu//courses/mathematics/18-01sc-single-variable-calculus-fall-2010/ocw-18.01-f07-lec01_300k.SRT'
+            }
+        }
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        topic = mobj.group('topic')
+
+        webpage = self._download_webpage(url, topic)
+        title = self._html_search_meta('WT.cg_s', webpage)
+        description = self._html_search_meta('Description', webpage)
+
+        # search for call to ocw_embed_chapter_media(container_id, media_url, provider, page_url, image_url, start, stop, captions_file)
+        embed_chapter_media = re.search(r'ocw_embed_chapter_media\((.+?)\)', webpage)
+        if embed_chapter_media:
+            metadata = re.sub(r'[\'"]', '', embed_chapter_media.group(1))
+            metadata = re.split(r', ?', metadata)
+            yt = metadata[1]
+            subs = compat_urlparse.urljoin(self._BASE_URL, metadata[7])
+        else:
+            # search for call to ocw_embed_media(container_id, media_url, provider, page_url, image_url, captions_file)
+            embed_media = re.search(r'ocw_embed_media\((.+?)\)', webpage)
+            if embed_media:
+                metadata = re.sub(r'[\'"]', '', embed_media.group(1))
+                metadata = re.split(r', ?', metadata)
+                yt = metadata[1]
+                subs = compat_urlparse.urljoin(self._BASE_URL, metadata[5])
+            else:
+                raise ExtractorError('Unable to find embedded YouTube video.')
+        video_id = YoutubeIE.extract_id(yt)
+
+        return {
+            '_type': 'url_transparent',
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'url': yt,
+            'url_transparent'
+            'subtitles': subs,
+            'ie_key': 'Youtube',
+        }
index f3356db50ebf8941ac58e9a229778ba864c57be0..c4bd53fe7e8f061fee38d9a9a8c371a5901419ea 100644 (file)
@@ -5,18 +5,20 @@ import re
 from .common import InfoExtractor
 from ..utils import (
     unified_strdate,
+    compat_urllib_parse,
     ExtractorError,
 )
 
 
 class MixcloudIE(InfoExtractor):
-    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
+    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([^/]+)/([^/]+)'
     IE_NAME = 'mixcloud'
 
     _TEST = {
         'url': 'http://www.mixcloud.com/dholbach/cryptkeeper/',
-        'file': 'dholbach-cryptkeeper.mp3',
         'info_dict': {
+            'id': 'dholbach-cryptkeeper',
+            'ext': 'mp3',
             'title': 'Cryptkeeper',
             'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.',
             'uploader': 'Daniel Holbach',
@@ -45,7 +47,7 @@ class MixcloudIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         uploader = mobj.group(1)
         cloudcast_name = mobj.group(2)
-        track_id = '-'.join((uploader, cloudcast_name))
+        track_id = compat_urllib_parse.unquote('-'.join((uploader, cloudcast_name)))
 
         webpage = self._download_webpage(url, track_id)
 
index e8bbfff7bd59eeaca0e04c83ee1599baff8af088..1a63ab56a9b02e9f5aa3d28dde6a57fb8155fc35 100644 (file)
@@ -1,19 +1,46 @@
+from __future__ import unicode_literals
+
 import re
 
 from .common import InfoExtractor
 from ..utils import find_xpath_attr, compat_str
 
 
+class NBCIE(InfoExtractor):
+    _VALID_URL = r'http://www\.nbc\.com/[^/]+/video/[^/]+/(?P<id>n?\d+)'
+
+    _TEST = {
+        'url': 'http://www.nbc.com/chicago-fire/video/i-am-a-firefighter/2734188',
+        'md5': '54d0fbc33e0b853a65d7b4de5c06d64e',
+        'info_dict': {
+            'id': 'u1RInQZRN7QJ',
+            'ext': 'flv',
+            'title': 'I Am a Firefighter',
+            'description': 'An emergency puts Dawson\'sf irefighter skills to the ultimate test in this four-part digital series.',
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        webpage = self._download_webpage(url, video_id)
+        theplatform_url = self._search_regex('class="video-player video-player-full" data-mpx-url="(.*?)"', webpage, 'theplatform url')
+        if theplatform_url.startswith('//'):
+            theplatform_url = 'http:' + theplatform_url
+        return self.url_result(theplatform_url)
+
+
 class NBCNewsIE(InfoExtractor):
     _VALID_URL = r'https?://www\.nbcnews\.com/video/.+?/(?P<id>\d+)'
 
     _TEST = {
-        u'url': u'http://www.nbcnews.com/video/nbc-news/52753292',
-        u'file': u'52753292.flv',
-        u'md5': u'47abaac93c6eaf9ad37ee6c4463a5179',
-        u'info_dict': {
-            u'title': u'Crew emerges after four-month Mars food study',
-            u'description': u'md5:24e632ffac72b35f8b67a12d1b6ddfc1',
+        'url': 'http://www.nbcnews.com/video/nbc-news/52753292',
+        'md5': '47abaac93c6eaf9ad37ee6c4463a5179',
+        'info_dict': {
+            'id': '52753292',
+            'ext': 'flv',
+            'title': 'Crew emerges after four-month Mars food study',
+            'description': 'md5:24e632ffac72b35f8b67a12d1b6ddfc1',
         },
     }
 
@@ -23,10 +50,11 @@ class NBCNewsIE(InfoExtractor):
         all_info = self._download_xml('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id)
         info = all_info.find('video')
 
-        return {'id': video_id,
-                'title': info.find('headline').text,
-                'ext': 'flv',
-                'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text,
-                'description': compat_str(info.find('caption').text),
-                'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text,
-                }
+        return {
+            'id': video_id,
+            'title': info.find('headline').text,
+            'ext': 'flv',
+            'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text,
+            'description': compat_str(info.find('caption').text),
+            'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text,
+        }
index 81b7855b0b23680398909eb74e57eeb3d21dc4bc..25e71a56e196d9cf7f9d2423c47293b01e46cd24 100644 (file)
@@ -1,61 +1,51 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
 import re
 
 from .common import InfoExtractor
 
 from ..utils import (
-    ExtractorError,
     unified_strdate,
 )
 
+
 class NormalbootsIE(InfoExtractor):
-    _VALID_URL = r'(?:http://)?(?:www\.)?normalboots\.com/video/(?P<videoid>[0-9a-z-]*)/?$'
+    _VALID_URL = r'http://(?:www\.)?normalboots\.com/video/(?P<videoid>[0-9a-z-]*)/?$'
     _TEST = {
-        u'url': u'http://normalboots.com/video/home-alone-games-jontron/',
-        u'file': u'home-alone-games-jontron.mp4',
-        u'md5': u'8bf6de238915dd501105b44ef5f1e0f6',
-        u'info_dict': {
-            u'title': u'Home Alone Games - JonTron - NormalBoots',
-            u'description': u'Jon is late for Christmas. Typical. Thanks to: Paul Ritchey for Co-Writing/Filming: http://www.youtube.com/user/ContinueShow Michael Azzi for Christmas Intro Animation: http://michafrar.tumblr.com/ Jerrod Waters for Christmas Intro Music: http://www.youtube.com/user/xXJerryTerryXx Casey Ormond for \u2018Tense Battle Theme\u2019:\xa0http://www.youtube.com/Kiamet/',
-            u'uploader': u'JonTron',
-            u'upload_date': u'20140125',
+        'url': 'http://normalboots.com/video/home-alone-games-jontron/',
+        'md5': '8bf6de238915dd501105b44ef5f1e0f6',
+        'info_dict': {
+            'id': 'home-alone-games-jontron',
+            'ext': 'mp4',
+            'title': 'Home Alone Games - JonTron - NormalBoots',
+            'description': 'Jon is late for Christmas. Typical. Thanks to: Paul Ritchey for Co-Writing/Filming: http://www.youtube.com/user/ContinueShow Michael Azzi for Christmas Intro Animation: http://michafrar.tumblr.com/ Jerrod Waters for Christmas Intro Music: http://www.youtube.com/user/xXJerryTerryXx Casey Ormond for ‘Tense Battle Theme’:\xa0http://www.youtube.com/Kiamet/',
+            'uploader': 'JonTron',
+            'upload_date': '20140125',
         }
     }
-    
+
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
         video_id = mobj.group('videoid')
-        
-        info = {
-            'id': video_id,
-            'uploader': None,
-            'upload_date': None,
-        }
-        
-        if url[:4] != 'http':
-            url = 'http://' + url
-        
+
         webpage = self._download_webpage(url, video_id)
-        video_title = self._og_search_title(webpage)
-        video_description = self._og_search_description(webpage)
-        video_thumbnail = self._og_search_thumbnail(webpage)
         video_uploader = self._html_search_regex(r'Posted\sby\s<a\shref="[A-Za-z0-9/]*">(?P<uploader>[A-Za-z]*)\s</a>',
             webpage, 'uploader')
-        raw_upload_date = self._html_search_regex('<span style="text-transform:uppercase; font-size:inherit;">[A-Za-z]+, (?P<date>.*)</span>', 
+        raw_upload_date = self._html_search_regex('<span style="text-transform:uppercase; font-size:inherit;">[A-Za-z]+, (?P<date>.*)</span>',
             webpage, 'date')
         video_upload_date = unified_strdate(raw_upload_date)
-        video_upload_date = unified_strdate(raw_upload_date)
-            
+
         player_url = self._html_search_regex(r'<iframe\swidth="[0-9]+"\sheight="[0-9]+"\ssrc="(?P<url>[\S]+)"', webpage, 'url')
         player_page = self._download_webpage(player_url, video_id)
-        video_url = u'http://player.screenwavemedia.com/' + self._html_search_regex(r"'file':\s'(?P<file>[0-9A-Za-z-_\.]+)'", player_page, 'file')
-        
-        info['url'] = video_url
-        info['title'] = video_title
-        info['description'] = video_description
-        info['thumbnail'] = video_thumbnail
-        info['uploader'] = video_uploader
-        info['upload_date'] = video_upload_date
-        
-        return info
+        video_url = self._html_search_regex(r"file:\s'(?P<file>[^']+\.mp4)'", player_page, 'file')
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': self._og_search_title(webpage),
+            'description': self._og_search_description(webpage),
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'uploader': video_uploader,
+            'upload_date': video_upload_date,
+        }
index 6af8d934c8baa18b457fe1bdc3e96f42b5a82588..fd310e219c1eff92eea30ae7800afb8122df78af 100644 (file)
@@ -9,14 +9,25 @@ from ..utils import (
 )
 
 
-class NovamovIE(InfoExtractor):
-    _VALID_URL = r'http://(?:(?:www\.)?novamov\.com/video/|(?:(?:embed|www)\.)novamov\.com/embed\.php\?v=)(?P<videoid>[a-z\d]{13})'
+class NovaMovIE(InfoExtractor):
+    IE_NAME = 'novamov'
+    IE_DESC = 'NovaMov'
+
+    _VALID_URL = r'http://(?:(?:www\.)?%(host)s/video/|(?:(?:embed|www)\.)%(host)s/embed\.php\?(?:.*?&)?v=)(?P<videoid>[a-z\d]{13})' % {'host': 'novamov\.com'}
+
+    _HOST = 'www.novamov.com'
+
+    _FILE_DELETED_REGEX = r'This file no longer exists on our servers!</h2>'
+    _FILEKEY_REGEX = r'flashvars\.filekey="(?P<filekey>[^"]+)";'
+    _TITLE_REGEX = r'(?s)<div class="v_tab blockborder rounded5" id="v_tab1">\s*<h3>([^<]+)</h3>'
+    _DESCRIPTION_REGEX = r'(?s)<div class="v_tab blockborder rounded5" id="v_tab1">\s*<h3>[^<]+</h3><p>([^<]+)</p>'
 
     _TEST = {
         'url': 'http://www.novamov.com/video/4rurhn9x446jj',
-        'file': '4rurhn9x446jj.flv',
         'md5': '7205f346a52bbeba427603ba10d4b935',
         'info_dict': {
+            'id': '4rurhn9x446jj',
+            'ext': 'flv',
             'title': 'search engine optimization',
             'description': 'search engine optimization is used to rank the web page in the google search engine'
         },
@@ -27,31 +38,26 @@ class NovamovIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('videoid')
 
-        page = self._download_webpage('http://www.novamov.com/video/%s' % video_id,
-                                      video_id, 'Downloading video page')
+        page = self._download_webpage(
+            'http://%s/video/%s' % (self._HOST, video_id), video_id, 'Downloading video page')
 
-        if re.search(r'This file no longer exists on our servers!</h2>', page) is not None:
+        if re.search(self._FILE_DELETED_REGEX, page) is not None:
             raise ExtractorError(u'Video %s does not exist' % video_id, expected=True)
 
-        filekey = self._search_regex(
-            r'flashvars\.filekey="(?P<filekey>[^"]+)";', page, 'filekey')
+        filekey = self._search_regex(self._FILEKEY_REGEX, page, 'filekey')
 
-        title = self._html_search_regex(
-            r'(?s)<div class="v_tab blockborder rounded5" id="v_tab1">\s*<h3>([^<]+)</h3>',
-            page, 'title', fatal=False)
+        title = self._html_search_regex(self._TITLE_REGEX, page, 'title', fatal=False)
 
-        description = self._html_search_regex(
-            r'(?s)<div class="v_tab blockborder rounded5" id="v_tab1">\s*<h3>[^<]+</h3><p>([^<]+)</p>',
-            page, 'description', fatal=False)
+        description = self._html_search_regex(self._DESCRIPTION_REGEX, page, 'description', default='', fatal=False)
 
         api_response = self._download_webpage(
-            'http://www.novamov.com/api/player.api.php?key=%s&file=%s' % (filekey, video_id),
-            video_id, 'Downloading video api response')
+            'http://%s/api/player.api.php?key=%s&file=%s' % (self._HOST, filekey, video_id), video_id,
+            'Downloading video api response')
 
         response = compat_urlparse.parse_qs(api_response)
 
         if 'error_msg' in response:
-            raise ExtractorError('novamov returned error: %s' % response['error_msg'][0], expected=True)
+            raise ExtractorError('%s returned error: %s' % (self.IE_NAME, response['error_msg'][0]), expected=True)
 
         video_url = response['url'][0]
 
@@ -60,4 +66,4 @@ class NovamovIE(InfoExtractor):
             'url': video_url,
             'title': title,
             'description': description
-        }
+        }
\ No newline at end of file
index 168ca8b9fa961f350fd3d3292bda648589be5197..dd665874d5e01cd7e6c6e6d7dc69e06748952727 100644 (file)
@@ -1,46 +1,28 @@
-import re
+from __future__ import unicode_literals
 
-from .common import InfoExtractor
-from ..utils import compat_urlparse
+from .novamov import NovaMovIE
 
 
-class NowVideoIE(InfoExtractor):
-    _VALID_URL = r'(?:https?://)?(?:www\.)?nowvideo\.(?:ch|sx)/video/(?P<id>\w+)'
-    _TEST = {
-        u'url': u'http://www.nowvideo.ch/video/0mw0yow7b6dxa',
-        u'file': u'0mw0yow7b6dxa.flv',
-        u'md5': u'f8fbbc8add72bd95b7850c6a02fc8817',
-        u'info_dict': {
-            u"title": u"youtubedl test video _BaW_jenozKc.mp4"
-        }
-    }
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-
-        video_id = mobj.group('id')
-        webpage_url = 'http://www.nowvideo.ch/video/' + video_id
-        embed_url = 'http://embed.nowvideo.ch/embed.php?v=' + video_id
-        webpage = self._download_webpage(webpage_url, video_id)
-        embed_page = self._download_webpage(embed_url, video_id,
-            u'Downloading embed page')
+class NowVideoIE(NovaMovIE):
+    IE_NAME = 'nowvideo'
+    IE_DESC = 'NowVideo'
 
-        self.report_extraction(video_id)
+    _VALID_URL = r'http://(?:(?:www\.)?%(host)s/video/|(?:(?:embed|www)\.)%(host)s/embed\.php\?(?:.*?&)?v=)(?P<videoid>[a-z\d]{13})' % {'host': 'nowvideo\.(?:ch|sx|eu)'}
 
-        video_title = self._html_search_regex(r'<h4>(.*)</h4>',
-            webpage, u'video title')
+    _HOST = 'www.nowvideo.ch'
 
-        video_key = self._search_regex(r'var fkzd="(.*)";',
-            embed_page, u'video key')
+    _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<'
+    _FILEKEY_REGEX = r'var fkzd="([^"]+)";'
+    _TITLE_REGEX = r'<h4>([^<]+)</h4>'
+    _DESCRIPTION_REGEX = r'</h4>\s*<p>([^<]+)</p>'
 
-        api_call = "http://www.nowvideo.ch/api/player.api.php?file={0}&numOfErrors=0&cid=1&key={1}".format(video_id, video_key)
-        api_response = self._download_webpage(api_call, video_id,
-            u'Downloading API page')
-        video_url = compat_urlparse.parse_qs(api_response)[u'url'][0]
-
-        return [{
-            'id':        video_id,
-            'url':       video_url,
-            'ext':       'flv',
-            'title':     video_title,
-        }]
+    _TEST = {
+        'url': 'http://www.nowvideo.ch/video/0mw0yow7b6dxa',
+        'md5': 'f8fbbc8add72bd95b7850c6a02fc8817',
+        'info_dict': {
+            'id': '0mw0yow7b6dxa',
+            'ext': 'flv',
+            'title': 'youtubedl test video _BaW_jenozKc.mp4',
+            'description': 'Description',
+        }
+    }
\ No newline at end of file
index 5f5694393765104b45b573c53155d447a45b1e50..03421d1d5c78f2acd712e560ae17fb96d4a323be 100644 (file)
@@ -8,6 +8,7 @@ from .common import InfoExtractor
 from ..utils import (
     HEADRequest,
     unified_strdate,
+    ExtractorError,
 )
 
 
@@ -35,7 +36,15 @@ class ORFIE(InfoExtractor):
         data_json = self._search_regex(
             r'initializeAdworx\((.+?)\);\n', webpage, 'video info')
         all_data = json.loads(data_json)
-        sdata = all_data[0]['values']['segments']
+
+        def get_segments(all_data):
+            for data in all_data:
+                if data['name'] == 'Tracker::EPISODE_DETAIL_PAGE_OVER_PROGRAM':
+                    return data['values']['segments']
+
+        sdata = get_segments(all_data)
+        if not sdata:
+            raise ExtractorError('Unable to extract segments')
 
         def quality_to_int(s):
             m = re.search('([0-9]+)', s)
index 58200971bece7664e18b94eccb52b368ef5a999b..19ad45c9898f2764bde056c9285078cfd6d25b9f 100644 (file)
@@ -1,7 +1,10 @@
+from __future__ import unicode_literals
+
 import json
 import re
 
 from .common import InfoExtractor
+from ..utils import int_or_none
 
 
 class PodomaticIE(InfoExtractor):
@@ -9,14 +12,14 @@ class PodomaticIE(InfoExtractor):
     _VALID_URL = r'^(?P<proto>https?)://(?P<channel>[^.]+)\.podomatic\.com/entry/(?P<id>[^?]+)'
 
     _TEST = {
-        u"url": u"http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00",
-        u"file": u"2009-01-02T16_03_35-08_00.mp3",
-        u"md5": u"84bb855fcf3429e6bf72460e1eed782d",
-        u"info_dict": {
-            u"uploader": u"Science Teaching Tips",
-            u"uploader_id": u"scienceteachingtips",
-            u"title": u"64.  When the Moon Hits Your Eye",
-            u"duration": 446,
+        "url": "http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00",
+        "file": "2009-01-02T16_03_35-08_00.mp3",
+        "md5": "84bb855fcf3429e6bf72460e1eed782d",
+        "info_dict": {
+            "uploader": "Science Teaching Tips",
+            "uploader_id": "scienceteachingtips",
+            "title": "64.  When the Moon Hits Your Eye",
+            "duration": 446,
         }
     }
 
@@ -36,7 +39,7 @@ class PodomaticIE(InfoExtractor):
         uploader = data['podcast']
         title = data['title']
         thumbnail = data['imageLocation']
-        duration = int(data['length'] / 1000.0)
+        duration = int_or_none(data.get('length'), 1000)
 
         return {
             'id': video_id,
diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py
new file mode 100644 (file)
index 0000000..3f585be
--- /dev/null
@@ -0,0 +1,297 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from hashlib import sha1
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_parse,
+    unified_strdate,
+    clean_html,
+    RegexNotFoundError,
+)
+
+
+class ProSiebenSat1IE(InfoExtractor):
+    IE_NAME = 'prosiebensat1'
+    IE_DESC = 'ProSiebenSat.1 Digital'
+    _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|ran|the-voice-of-germany)\.de|fem\.com)/(?P<id>.+)'
+
+    _TESTS = [
+        {
+            'url': 'http://www.prosieben.de/tv/circus-halligalli/videos/218-staffel-2-episode-18-jahresrueckblick-ganze-folge',
+            'info_dict': {
+                'id': '2104602',
+                'ext': 'mp4',
+                'title': 'Staffel 2, Episode 18 - Jahresrückblick',
+                'description': 'md5:8733c81b702ea472e069bc48bb658fc1',
+                'upload_date': '20131231',
+                'duration': 5845.04,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.prosieben.de/videokatalog/Gesellschaft/Leben/Trends/video-Lady-Umstyling-f%C3%BCr-Audrina-Rebekka-Audrina-Fergen-billig-aussehen-Battal-Modica-700544.html',
+            'info_dict': {
+                'id': '2570327',
+                'ext': 'mp4',
+                'title': 'Lady-Umstyling für Audrina',
+                'description': 'md5:4c16d0c17a3461a0d43ea4084e96319d',
+                'upload_date': '20131014',
+                'duration': 606.76,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+            'skip': 'Seems to be broken',
+        },
+        {
+            'url': 'http://www.prosiebenmaxx.de/tv/experience/video/144-countdown-fuer-die-autowerkstatt-ganze-folge',
+            'info_dict': {
+                'id': '2429369',
+                'ext': 'mp4',
+                'title': 'Countdown für die Autowerkstatt',
+                'description': 'md5:809fc051a457b5d8666013bc40698817',
+                'upload_date': '20140223',
+                'duration': 2595.04,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.sixx.de/stars-style/video/sexy-laufen-in-ugg-boots-clip',
+            'info_dict': {
+                'id': '2904997',
+                'ext': 'mp4',
+                'title': 'Sexy laufen in Ugg Boots',
+                'description': 'md5:edf42b8bd5bc4e5da4db4222c5acb7d6',
+                'upload_date': '20140122',
+                'duration': 245.32,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.sat1.de/film/der-ruecktritt/video/im-interview-kai-wiesinger-clip',
+            'info_dict': {
+                'id': '2906572',
+                'ext': 'mp4',
+                'title': 'Im Interview: Kai Wiesinger',
+                'description': 'md5:e4e5370652ec63b95023e914190b4eb9',
+                'upload_date': '20140225',
+                'duration': 522.56,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.kabeleins.de/tv/rosins-restaurants/videos/jagd-auf-fertigkost-im-elsthal-teil-2-ganze-folge',
+            'info_dict': {
+                'id': '2992323',
+                'ext': 'mp4',
+                'title': 'Jagd auf Fertigkost im Elsthal - Teil 2',
+                'description': 'md5:2669cde3febe9bce13904f701e774eb6',
+                'upload_date': '20140225',
+                'duration': 2410.44,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.ran.de/fussball/bundesliga/video/schalke-toennies-moechte-raul-zurueck-ganze-folge',
+            'info_dict': {
+                'id': '3004256',
+                'ext': 'mp4',
+                'title': 'Schalke: Tönnies möchte Raul zurück',
+                'description': 'md5:4b5b271d9bcde223b54390754c8ece3f',
+                'upload_date': '20140226',
+                'duration': 228.96,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.the-voice-of-germany.de/video/31-andreas-kuemmert-rocket-man-clip',
+            'info_dict': {
+                'id': '2572814',
+                'ext': 'mp4',
+                'title': 'Andreas Kümmert: Rocket Man',
+                'description': 'md5:6ddb02b0781c6adf778afea606652e38',
+                'upload_date': '20131017',
+                'duration': 469.88,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.fem.com/wellness/videos/wellness-video-clip-kurztripps-zum-valentinstag.html',
+            'info_dict': {
+                'id': '2156342',
+                'ext': 'mp4',
+                'title': 'Kurztrips zum Valentinstag',
+                'description': 'md5:8ba6301e70351ae0bedf8da00f7ba528',
+                'upload_date': '20130206',
+                'duration': 307.24,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+        },
+    ]
+
+    _CLIPID_REGEXES = [
+        r'"clip_id"\s*:\s+"(\d+)"',
+        r'clipid: "(\d+)"',
+    ]
+    _TITLE_REGEXES = [
+        r'<h2 class="subtitle" itemprop="name">\s*(.+?)</h2>',
+        r'<header class="clearfix">\s*<h3>(.+?)</h3>',
+        r'<!-- start video -->\s*<h1>(.+?)</h1>',
+        r'<div class="ep-femvideos-pi4-video-txt">\s*<h2>(.+?)</h2>',
+    ]
+    _DESCRIPTION_REGEXES = [
+        r'<p itemprop="description">\s*(.+?)</p>',
+        r'<div class="videoDecription">\s*<p><strong>Beschreibung</strong>: (.+?)</p>',
+        r'<div class="g-plusone" data-size="medium"></div>\s*</div>\s*</header>\s*(.+?)\s*<footer>',
+        r'<p>(.+?)</p>\s*<div class="ep-femvideos-pi4-video-footer">',
+    ]
+    _UPLOAD_DATE_REGEXES = [
+        r'<meta property="og:published_time" content="(.+?)">',
+        r'<span>\s*(\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}) \|\s*<span itemprop="duration"',
+        r'<footer>\s*(\d{2}\.\d{2}\.\d{4}) \d{2}:\d{2} Uhr',
+        r'<span style="padding-left: 4px;line-height:20px; color:#404040">(\d{2}\.\d{2}\.\d{4})</span>',
+        r'(\d{2}\.\d{2}\.\d{4}) \| \d{2}:\d{2} Min<br/>',
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        page = self._download_webpage(url, video_id, 'Downloading page')
+
+        def extract(patterns, name, page, fatal=False):
+            for pattern in patterns:
+                mobj = re.search(pattern, page)
+                if mobj:
+                    return clean_html(mobj.group(1))
+            if fatal:
+                raise RegexNotFoundError(u'Unable to extract %s' % name)
+            return None
+
+        clip_id = extract(self._CLIPID_REGEXES, 'clip id', page, fatal=True)
+
+        access_token = 'testclient'
+        client_name = 'kolibri-1.2.5'
+        client_location = url
+
+        videos_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos?%s' % compat_urllib_parse.urlencode({
+            'access_token': access_token,
+            'client_location': client_location,
+            'client_name': client_name,
+            'ids': clip_id,
+        })
+
+        videos = self._download_json(videos_api_url, clip_id, 'Downloading videos JSON')
+
+        duration = float(videos[0]['duration'])
+        source_ids = [source['id'] for source in videos[0]['sources']]
+        source_ids_str = ','.join(map(str, source_ids))
+
+        g = '01!8d8F_)r9]4s[qeuXfP%'
+
+        client_id = g[:2] + sha1(''.join([clip_id, g, access_token, client_location, g, client_name])
+                                 .encode('utf-8')).hexdigest()
+
+        sources_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources?%s' % (clip_id, compat_urllib_parse.urlencode({
+            'access_token': access_token,
+            'client_id': client_id,
+            'client_location': client_location,
+            'client_name': client_name,
+        }))
+
+        sources = self._download_json(sources_api_url, clip_id, 'Downloading sources JSON')
+        server_id = sources['server_id']
+
+        client_id = g[:2] + sha1(''.join([g, clip_id, access_token, server_id,
+                                          client_location, source_ids_str, g, client_name])
+                                 .encode('utf-8')).hexdigest()
+
+        url_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url?%s' % (clip_id, compat_urllib_parse.urlencode({
+            'access_token': access_token,
+            'client_id': client_id,
+            'client_location': client_location,
+            'client_name': client_name,
+            'server_id': server_id,
+            'source_ids': source_ids_str,
+        }))
+
+        urls = self._download_json(url_api_url, clip_id, 'Downloading urls JSON')
+
+        title = extract(self._TITLE_REGEXES, 'title', page, fatal=True)
+        description = extract(self._DESCRIPTION_REGEXES, 'description', page)
+        thumbnail = self._og_search_thumbnail(page)
+
+        upload_date = extract(self._UPLOAD_DATE_REGEXES, 'upload date', page)
+        if upload_date:
+            upload_date = unified_strdate(upload_date)
+
+        formats = []
+
+        urls_sources = urls['sources']
+        if isinstance(urls_sources, dict):
+            urls_sources = urls_sources.values()
+
+        def fix_bitrate(bitrate):
+            return bitrate / 1000 if bitrate % 1000 == 0 else bitrate
+
+        for source in urls_sources:
+            protocol = source['protocol']
+            if protocol == 'rtmp' or protocol == 'rtmpe':
+                mobj = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', source['url'])
+                if not mobj:
+                    continue
+                formats.append({
+                    'url': mobj.group('url'),
+                    'app': mobj.group('app'),
+                    'play_path': mobj.group('playpath'),
+                    'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf',
+                    'page_url': 'http://www.prosieben.de',
+                    'vbr': fix_bitrate(source['bitrate']),
+                    'ext': 'mp4',
+                    'format_id': '%s_%s' % (source['cdn'], source['bitrate']),
+                })
+            else:
+                formats.append({
+                    'url': source['url'],
+                    'vbr': fix_bitrate(source['bitrate']),
+                })
+
+        self._sort_formats(formats)
+
+        return {
+            'id': clip_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'upload_date': upload_date,
+            'duration': duration,
+            'formats': formats,
+        }
\ No newline at end of file
index cd50f708d202cc29e1a9a24765f2f91396a0074e..4835ec5ecada755a12d7003fed6355adfd6936a6 100644 (file)
 # encoding: utf-8
-
 from __future__ import unicode_literals
 
 import re
 
 from .common import InfoExtractor
 from ..utils import (
-    clean_html,
     ExtractorError,
+    clean_html,
+    unified_strdate,
+    int_or_none,
 )
 
 
 class RTLnowIE(InfoExtractor):
     """Information Extractor for RTL NOW, RTL2 NOW, RTL NITRO, SUPER RTL NOW, VOX NOW and n-tv NOW"""
-    _VALID_URL = r'(?:http://)?(?P<url>(?P<domain>rtl-now\.rtl\.de|rtl2now\.rtl2\.de|(?:www\.)?voxnow\.de|(?:www\.)?rtlnitronow\.de|(?:www\.)?superrtlnow\.de|(?:www\.)?n-tvnow\.de)/+[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P<video_id>[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)'
-    _TESTS = [{
-        'url': 'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1',
-        'file': '90419.flv',
-        'info_dict': {
-            'upload_date': '20070416',
-            'title': 'Ahornallee - Folge 1 - Der Einzug',
-            'description': 'Folge 1 - Der Einzug',
-        },
-        'params': {
-            'skip_download': True,
-        },
-        'skip': 'Only works from Germany',
-    },
-    {
-        'url': 'http://rtl2now.rtl2.de/aerger-im-revier/episode-15-teil-1.php?film_id=69756&player=1&season=2&index=5',
-        'file': '69756.flv',
-        'info_dict': {
-            'upload_date': '20120519',
-            'title': 'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit...',
-            'description': 'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit u.a.',
-            'thumbnail': 'http://autoimg.static-fra.de/rtl2now/219850/1500x1500/image2.jpg',
-        },
-        'params': {
-            'skip_download': True,
-        },
-        'skip': 'Only works from Germany',
-    },
-    {
-        'url': 'http://www.voxnow.de/voxtours/suedafrika-reporter-ii.php?film_id=13883&player=1&season=17',
-        'file': '13883.flv',
-        'info_dict': {
-            'upload_date': '20090627',
-            'title': 'Voxtours - Südafrika-Reporter II',
-            'description': 'Südafrika-Reporter II',
-        },
-        'params': {
-            'skip_download': True,
+    _VALID_URL = r'''(?x)
+                        (?:https?://)?
+                        (?P<url>
+                            (?P<domain>
+                                rtl-now\.rtl\.de|
+                                rtl2now\.rtl2\.de|
+                                (?:www\.)?voxnow\.de|
+                                (?:www\.)?rtlnitronow\.de|
+                                (?:www\.)?superrtlnow\.de|
+                                (?:www\.)?n-tvnow\.de)
+                            /+[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?
+                            (?:container_id|film_id)=(?P<video_id>[0-9]+)&
+                            player=1(?:&season=[0-9]+)?(?:&.*)?
+                        )'''
+
+    _TESTS = [
+        {
+            'url': 'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1',
+            'info_dict': {
+                'id': '90419',
+                'ext': 'flv',
+                'title': 'Ahornallee - Folge 1 - Der Einzug',
+                'description': 'md5:ce843b6b5901d9a7f7d04d1bbcdb12de',
+                'upload_date': '20070416',
+                'duration': 1685,
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'skip': 'Only works from Germany',
         },
-    },
-    {
-        'url': 'http://superrtlnow.de/medicopter-117/angst.php?film_id=99205&player=1',
-        'file': '99205.flv',
-        'info_dict': {
-            'upload_date': '20080928', 
-            'title': 'Medicopter 117 - Angst!',
-            'description': 'Angst!',
-            'thumbnail': 'http://autoimg.static-fra.de/superrtlnow/287529/1500x1500/image2.jpg'
+        {
+            'url': 'http://rtl2now.rtl2.de/aerger-im-revier/episode-15-teil-1.php?film_id=69756&player=1&season=2&index=5',
+            'info_dict': {
+                'id': '69756',
+                'ext': 'flv',
+                'title': 'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit u.a.',
+                'description': 'md5:3fb247005ed21a935ffc82b7dfa70cf0',
+                'thumbnail': 'http://autoimg.static-fra.de/rtl2now/219850/1500x1500/image2.jpg',
+                'upload_date': '20120519',
+                'duration': 1245,
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'skip': 'Only works from Germany',
         },
-        'params': {
-            'skip_download': True,
+        {
+            'url': 'http://www.voxnow.de/voxtours/suedafrika-reporter-ii.php?film_id=13883&player=1&season=17',
+            'info_dict': {
+                'id': '13883',
+                'ext': 'flv',
+                'title': 'Voxtours - Südafrika-Reporter II',
+                'description': 'md5:de7f8d56be6fd4fed10f10f57786db00',
+                'upload_date': '20090627',
+                'duration': 1800,
+            },
+            'params': {
+                'skip_download': True,
+            },
         },
-    },
-    {
-        'url': 'http://www.n-tvnow.de/top-gear/episode-1-2013-01-01-00-00-00.php?film_id=124903&player=1&season=10',
-        'file': '124903.flv',
-        'info_dict': {
-            'upload_date': '20130101',
-            'title': 'Top Gear vom 01.01.2013',
-            'description': 'Episode 1',
+        {
+            'url': 'http://superrtlnow.de/medicopter-117/angst.php?film_id=99205&player=1',
+            'info_dict': {
+                'id': '99205',
+                'ext': 'flv',
+                'title': 'Medicopter 117 - Angst!',
+                'description': 'md5:895b1df01639b5f61a04fc305a5cb94d',
+                'thumbnail': 'http://autoimg.static-fra.de/superrtlnow/287529/1500x1500/image2.jpg',
+                'upload_date': '20080928',
+                'duration': 2691,
+            },
+            'params': {
+                'skip_download': True,
+            },
         },
-        'params': {
-            'skip_download': True,
+        {
+            'url': 'http://www.n-tvnow.de/deluxe-alles-was-spass-macht/thema-ua-luxushotel-fuer-vierbeiner.php?container_id=153819&player=1&season=0',
+            'info_dict': {
+                'id': '153819',
+                'ext': 'flv',
+                'title': 'Deluxe - Alles was Spaß macht - Thema u.a.: Luxushotel für Vierbeiner',
+                'description': 'md5:c3705e1bb32e1a5b2bcd634fc065c631',
+                'thumbnail': 'http://autoimg.static-fra.de/ntvnow/383157/1500x1500/image2.jpg',
+                'upload_date': '20140221',
+                'duration': 2429,
+            },
+            'skip': 'Only works from Germany',
         },
-        'skip': 'Only works from Germany',
-    }]
+    ]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-
-        webpage_url = 'http://' + mobj.group('url')
-        video_page_url = 'http://' + mobj.group('domain') + '/'
+        video_page_url = 'http://%s/' % mobj.group('domain')
         video_id = mobj.group('video_id')
 
-        webpage = self._download_webpage(webpage_url, video_id)
+        webpage = self._download_webpage('http://' + mobj.group('url'), video_id)
 
-        note_m = re.search(r'''(?sx)
-            <div[ ]style="margin-left:[ ]20px;[ ]font-size:[ ]13px;">(.*?)
-            <div[ ]id="playerteaser">''', webpage)
-        if note_m:
-            msg = clean_html(note_m.group(1))
-            raise ExtractorError(msg)
+        mobj = re.search(r'(?s)<div style="margin-left: 20px; font-size: 13px;">(.*?)<div id="playerteaser">', webpage)
+        if mobj:
+            raise ExtractorError(clean_html(mobj.group(1)), expected=True)
 
-        video_title = self._html_search_regex(
-            r'<title>(?P<title>[^<]+?)( \| [^<]*)?</title>',
-            webpage, 'title')
-        playerdata_url = self._html_search_regex(
-            r'\'playerdata\': \'(?P<playerdata_url>[^\']+)\'',
-            webpage, 'playerdata_url')
+        title = self._og_search_title(webpage)
+        description = self._og_search_description(webpage)
+        thumbnail = self._og_search_thumbnail(webpage, default=None)
 
-        playerdata = self._download_webpage(playerdata_url, video_id)
-        mobj = re.search(r'<title><!\[CDATA\[(?P<description>.+?)(?:\s+- (?:Sendung )?vom (?P<upload_date_d>[0-9]{2})\.(?P<upload_date_m>[0-9]{2})\.(?:(?P<upload_date_Y>[0-9]{4})|(?P<upload_date_y>[0-9]{2})) [0-9]{2}:[0-9]{2} Uhr)?\]\]></title>', playerdata)
-        if mobj:
-            video_description = mobj.group('description')
-            if mobj.group('upload_date_Y'):
-                video_upload_date = mobj.group('upload_date_Y')
-            elif mobj.group('upload_date_y'):
-                video_upload_date = '20' + mobj.group('upload_date_y')
-            else:
-                video_upload_date = None
-            if video_upload_date:
-                video_upload_date += mobj.group('upload_date_m') + mobj.group('upload_date_d')
-        else:
-            video_description = None
-            video_upload_date = None
-            self._downloader.report_warning('Unable to extract description and upload date')
+        upload_date = unified_strdate(self._html_search_meta('uploadDate', webpage, 'upload date'))
 
-        # Thumbnail: not every video has an thumbnail
-        mobj = re.search(r'<meta property="og:image" content="(?P<thumbnail>[^"]+)">', webpage)
-        if mobj:
-            video_thumbnail = mobj.group('thumbnail')
-        else:
-            video_thumbnail = None
+        mobj = re.search(r'<meta itemprop="duration" content="PT(?P<seconds>\d+)S" />', webpage)
+        duration = int(mobj.group('seconds')) if mobj else None
 
-        mobj = re.search(r'<filename [^>]+><!\[CDATA\[(?P<url>rtmpe://(?:[^/]+/){2})(?P<play_path>[^\]]+)\]\]></filename>', playerdata)
-        if mobj is None:
-            raise ExtractorError('Unable to extract media URL')
-        video_url = mobj.group('url')
-        video_play_path = 'mp4:' + mobj.group('play_path')
-        video_player_url = video_page_url + 'includes/vodplayer.swf'
+        playerdata_url = self._html_search_regex(
+            r"'playerdata': '(?P<playerdata_url>[^']+)'", webpage, 'playerdata_url')
+
+        playerdata = self._download_xml(playerdata_url, video_id, 'Downloading player data XML')
+
+        videoinfo = playerdata.find('./playlist/videoinfo')
+        
+        formats = []
+        for filename in videoinfo.findall('filename'):
+            mobj = re.search(r'(?P<url>rtmpe://(?:[^/]+/){2})(?P<play_path>.+)', filename.text)
+            if mobj:
+                fmt = {
+                    'url': mobj.group('url'),
+                    'play_path': 'mp4:' + mobj.group('play_path'),
+                    'page_url': video_page_url,
+                    'player_url': video_page_url + 'includes/vodplayer.swf',
+                }
+            else:
+                fmt = {
+                    'url': filename.text,
+                }
+            fmt.update({
+                'width': int_or_none(filename.get('width')),
+                'height': int_or_none(filename.get('height')),
+                'vbr': int_or_none(filename.get('bitrate')),
+                'ext': 'flv',
+            })
+            formats.append(fmt)
 
         return {
             'id': video_id,
-            'url': video_url,
-            'play_path': video_play_path,
-            'page_url': video_page_url,
-            'player_url': video_player_url,
-            'ext': 'flv',
-            'title': video_title,
-            'description': video_description,
-            'upload_date': video_upload_date,
-            'thumbnail': video_thumbnail,
-        }
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'upload_date': upload_date,
+            'duration': duration,
+            'formats': formats,
+        }
\ No newline at end of file
index 393b5f17c53d5ed216b53ac0d1ff2941cb1d24f7..1cc0dcb158907831e010c3d243ec5f871984885b 100644 (file)
@@ -217,7 +217,7 @@ class SoundcloudIE(InfoExtractor):
         return self._extract_info_dict(info, full_title, secret_token=token)
 
 class SoundcloudSetIE(SoundcloudIE):
-    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)(?:[?].*)?$'
+    _VALID_URL = r'https?://(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
     IE_NAME = 'soundcloud:set'
     # it's in tests/test_playlists.py
     _TESTS = []
index 8b31caa92c1e44473aa42953427b3cc2d71762f7..b9e65447f1f69c63fc61eab1469644070e5dc5da 100644 (file)
@@ -6,115 +6,111 @@ import re
 from .subtitles import SubtitlesInfoExtractor
 
 from ..utils import (
+    compat_str,
     RegexNotFoundError,
 )
 
 
 class TEDIE(SubtitlesInfoExtractor):
-    _VALID_URL=r'''http://www\.ted\.com/
-                   (
-                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
-                        |
-                        ((?P<type_talk>talks)) # We have a simple talk
-                   )
-                   (/lang/(.*?))? # The url may contain the language
-                   /(?P<name>\w+) # Here goes the name and then ".html"
-                   '''
+    _VALID_URL = r'''(?x)http://www\.ted\.com/
+        (
+            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
+            |
+            ((?P<type_talk>talks)) # We have a simple talk
+        )
+        (/lang/(.*?))? # The url may contain the language
+        /(?P<name>\w+) # Here goes the name and then ".html"
+        '''
     _TEST = {
         'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
         'file': '102.mp4',
         'md5': '4ea1dada91e4174b53dac2bb8ace429d',
         'info_dict': {
-            "description": "md5:c6fa72e6eedbd938c9caf6b2702f5922",
-            "title": "Dan Dennett: The illusion of consciousness"
+            'title': 'The illusion of consciousness',
+            'description': ('Philosopher Dan Dennett makes a compelling '
+                'argument that not only don\'t we understand our own '
+                'consciousness, but that half the time our brains are '
+                'actively fooling us.'),
+            'uploader': 'Dan Dennett',
         }
     }
 
-    @classmethod
-    def suitable(cls, url):
-        """Receives a URL and returns True if suitable for this IE."""
-        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
+    _FORMATS_PREFERENCE = {
+        'low': 1,
+        'medium': 2,
+        'high': 3,
+    }
+
+    def _extract_info(self, webpage):
+        info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',
+            webpage, 'info json')
+        return json.loads(info_json)
 
     def _real_extract(self, url):
-        m=re.match(self._VALID_URL, url, re.VERBOSE)
+        m = re.match(self._VALID_URL, url, re.VERBOSE)
+        name = m.group('name')
         if m.group('type_talk'):
-            return self._talk_info(url)
-        else :
-            playlist_id=m.group('playlist_id')
-            name=m.group('name')
-            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
-            return [self._playlist_videos_info(url,name,playlist_id)]
+            return self._talk_info(url, name)
+        else:
+            return self._playlist_videos_info(url, name)
 
-
-    def _playlist_videos_info(self, url, name, playlist_id):
+    def _playlist_videos_info(self, url, name):
         '''Returns the videos of the playlist'''
 
-        webpage = self._download_webpage(
-            url, playlist_id, 'Downloading playlist webpage')
-        matches = re.finditer(
-            r'<p\s+class="talk-title[^"]*"><a\s+href="(?P<talk_url>/talks/[^"]+\.html)">[^<]*</a></p>',
-            webpage)
-
-        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
-                                                 webpage, 'playlist title')
+        webpage = self._download_webpage(url, name,
+            'Downloading playlist webpage')
+        info = self._extract_info(webpage)
+        playlist_info = info['playlist']
 
         playlist_entries = [
-            self.url_result(u'http://www.ted.com' + m.group('talk_url'), 'TED')
-            for m in matches
+            self.url_result(u'http://www.ted.com/talks/' + talk['slug'], self.ie_key())
+            for talk in info['talks']
         ]
         return self.playlist_result(
-            playlist_entries, playlist_id=playlist_id, playlist_title=playlist_title)
+            playlist_entries,
+            playlist_id=compat_str(playlist_info['id']),
+            playlist_title=playlist_info['title'])
 
-    def _talk_info(self, url, video_id=0):
-        """Return the video for the talk in the url"""
-        m = re.match(self._VALID_URL, url,re.VERBOSE)
-        video_name = m.group('name')
-        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
+    def _talk_info(self, url, video_name):
+        webpage = self._download_webpage(url, video_name)
         self.report_extraction(video_name)
-        # If the url includes the language we get the title translated
-        title = self._html_search_regex(r'<span .*?id="altHeadline".+?>(?P<title>.*)</span>',
-                                        webpage, 'title')
-        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
-                                    webpage, 'json data')
-        info = json.loads(json_data)
-        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
-                                       webpage, 'description', flags = re.DOTALL)
-        
-        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
-                                       webpage, 'thumbnail')
-        formats = [{
-            'ext': 'mp4',
-            'url': stream['file'],
-            'format': stream['id']
-        } for stream in info['htmlStreams']]
 
-        video_id = info['id']
+        talk_info = self._extract_info(webpage)['talks'][0]
 
+        formats = [{
+            'ext': 'mp4',
+            'url': format_url,
+            'format_id': format_id,
+            'format': format_id,
+            'preference': self._FORMATS_PREFERENCE.get(format_id, -1),
+        } for (format_id, format_url) in talk_info['nativeDownloads'].items()]
+        self._sort_formats(formats)
+
+        video_id = talk_info['id']
         # subtitles
-        video_subtitles = self.extract_subtitles(video_id, webpage)
+        video_subtitles = self.extract_subtitles(video_id, talk_info)
         if self._downloader.params.get('listsubtitles', False):
-            self._list_available_subtitles(video_id, webpage)
+            self._list_available_subtitles(video_id, talk_info)
             return
 
         return {
             'id': video_id,
-            'title': title,
-            'thumbnail': thumbnail,
-            'description': desc,
+            'title': talk_info['title'],
+            'uploader': talk_info['speaker'],
+            'thumbnail': talk_info['thumb'],
+            'description': self._og_search_description(webpage),
             'subtitles': video_subtitles,
             'formats': formats,
         }
 
-    def _get_available_subtitles(self, video_id, webpage):
-        try:
-            options = self._search_regex(r'(?:<select name="subtitles_language_select" id="subtitles_language_select">)(.*?)(?:</select>)', webpage, 'subtitles_language_select', flags=re.DOTALL)
-            languages = re.findall(r'(?:<option value=")(\S+)"', options)
-            if languages:
-                sub_lang_list = {}
-                for l in languages:
-                    url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
-                    sub_lang_list[l] = url
-                return sub_lang_list
-        except RegexNotFoundError:
+    def _get_available_subtitles(self, video_id, talk_info):
+        languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
+        if languages:
+            sub_lang_list = {}
+            for l in languages:
+                url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
+                sub_lang_list[l] = url
+            return sub_lang_list
+        else:
             self._downloader.report_warning(u'video doesn\'t have subtitles')
-        return {}
+            return {}
index bdc6e206427a655fd50b7132a1c69e3e2e4a51f7..c7d559315be7d2ceed094fab691e4141de5534e7 100644 (file)
@@ -39,6 +39,8 @@ class TestURLIE(InfoExtractor):
                     ('Found multiple matching extractors: %s' %
                         ' '.join(ie.IE_NAME for ie in matching_extractors)),
                     expected=True)
+        else:
+            extractor = matching_extractors[0]
 
         num_str = mobj.group('num')
         num = int(num_str) if num_str else 0
index d60702325d7d4ffebafcb45cdc9054c8fa4d2af7..91f2453ebf237db086f18f1ca2b30f2ce73a3728 100644 (file)
@@ -13,7 +13,7 @@ _x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language
 class ThePlatformIE(InfoExtractor):
     _VALID_URL = r'''(?x)
         (?:https?://(?:link|player)\.theplatform\.com/[sp]/[^/]+/
-           (?P<config>[^/\?]+/(?:swf|config)/select/)?
+           (?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/)?
          |theplatform:)(?P<id>[^/\?&]+)'''
 
     _TEST = {
@@ -54,10 +54,15 @@ class ThePlatformIE(InfoExtractor):
 
         f4m_node = body.find(_x('smil:seq/smil:video'))
         if f4m_node is not None:
+            f4m_url = f4m_node.attrib['src']
+            if 'manifest.f4m?' not in f4m_url:
+                f4m_url += '?'
+            # the parameters are from syfy.com, other sites may use others,
+            # they also work for nbc.com
+            f4m_url += '&g=UXWGVKRWHFSP&hdcore=3.0.3'
             formats = [{
                 'ext': 'flv',
-                # the parameters are from syfy.com, other sites may use others
-                'url': f4m_node.attrib['src'] + '?g=UXWGVKRWHFSP&hdcore=3.0.3',
+                'url': f4m_url,
             }]
         else:
             base_url = head.find(_x('smil:meta')).attrib['base']
@@ -95,9 +100,10 @@ class ThePlatformIE(InfoExtractor):
         if mobj.group('config'):
             config_url = url+ '&form=json'
             config_url = config_url.replace('swf/', 'config/')
+            config_url = config_url.replace('onsite/', 'onsite/config/')
             config_json = self._download_webpage(config_url, video_id, u'Downloading config')
             config = json.loads(config_json)
-            smil_url = config['releaseUrl'] + '&format=SMIL&formats=MPEG4'
+            smil_url = config['releaseUrl'] + '&format=SMIL&formats=MPEG4&manifest=f4m'
         else:
             smil_url = ('http://link.theplatform.com/s/dJ5BDC/{0}/meta.smil?'
                 'format=smil&mbr=true'.format(video_id))
index 2246d27b2f741047eb6b53ad0e4f54e79293294b..a4aa25f661223301b9d16c7ac87b6c502aa0e0ff 100644 (file)
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from youtube_dl.utils import ExtractorError
+from ..utils import ExtractorError
 
 
 class TinyPicIE(InfoExtractor):
diff --git a/youtube_dl/extractor/trutube.py b/youtube_dl/extractor/trutube.py
new file mode 100644 (file)
index 0000000..57f9566
--- /dev/null
@@ -0,0 +1,44 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class TruTubeIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?trutube\.tv/video/(?P<id>[0-9]+)/.*'
+    _TEST = {
+        'url': 'http://trutube.tv/video/14880/Ramses-II-Proven-To-Be-A-Red-Headed-Caucasoid-',
+        'md5': 'c5b6e301b0a2040b074746cbeaa26ca1',
+        'info_dict': {
+            'id': '14880',
+            'ext': 'flv',
+            'title': 'Ramses II - Proven To Be A Red Headed Caucasoid',
+            'thumbnail': 're:^http:.*\.jpg$',
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+        video_title = self._og_search_title(webpage).strip()
+        thumbnail = self._search_regex(
+            r"var splash_img = '([^']+)';", webpage, 'thumbnail', fatal=False)
+
+        all_formats = re.finditer(
+            r"var (?P<key>[a-z]+)_video_file\s*=\s*'(?P<url>[^']+)';", webpage)
+        formats = [{
+            'format_id': m.group('key'),
+            'quality': -i,
+            'url': m.group('url'),
+        } for i, m in enumerate(all_formats)]
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': video_title,
+            'formats': formats,
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/tvigle.py b/youtube_dl/extractor/tvigle.py
new file mode 100644 (file)
index 0000000..0921cc5
--- /dev/null
@@ -0,0 +1,84 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    unified_strdate,
+    clean_html,
+    int_or_none,
+)
+
+
+class TvigleIE(InfoExtractor):
+    IE_NAME = 'tvigle'
+    IE_DESC = 'Интернет-телевидение Tvigle.ru'
+    _VALID_URL = r'http://(?:www\.)?tvigle\.ru/category/.+?[\?&]v(?:ideo)?=(?P<id>\d+)'
+
+    _TESTS = [
+        {
+            'url': 'http://www.tvigle.ru/category/cinema/1608/?video=503081',
+            'md5': '09afba4616666249f087efc6dcf83cb3',
+            'info_dict': {
+                'id': '503081',
+                'ext': 'flv',
+                'title': 'Брат 2 ',
+                'description': 'md5:f5a42970f50648cee3d7ad740f3ae769',
+                'upload_date': '20110919',
+            },
+        },
+        {
+            'url': 'http://www.tvigle.ru/category/men/vysotskiy_vospominaniya02/?flt=196&v=676433',
+            'md5': 'e7efe5350dd5011d0de6550b53c3ba7b',
+            'info_dict': {
+                'id': '676433',
+                'ext': 'flv',
+                'title': 'Ведущий телепрограммы «60 минут» (США) о Владимире Высоцком',
+                'description': 'md5:027f7dc872948f14c96d19b4178428a4',
+                'upload_date': '20121218',
+            },
+        },
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        video_data = self._download_xml(
+            'http://www.tvigle.ru/xml/single.php?obj=%s' % video_id, video_id, 'Downloading video XML')
+
+        video = video_data.find('./video')
+
+        title = video.get('name')
+        description = video.get('anons')
+        if description:
+            description = clean_html(description)
+        thumbnail = video_data.get('img')
+        upload_date = unified_strdate(video.get('date'))
+        like_count = int_or_none(video.get('vtp'))
+
+        formats = []
+        for num, (format_id, format_note) in enumerate([['low_file', 'SQ'], ['file', 'HQ'], ['hd', 'HD 720']]):
+            video_url = video.get(format_id)
+            if not video_url:
+                continue
+            formats.append({
+                'url': video_url,
+                'format_id': format_id,
+                'format_note': format_note,
+                'quality': num,
+            })
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'upload_date': upload_date,
+            'like_count': like_count,
+            'age_limit': 18,
+            'formats': formats,
+        }
\ No newline at end of file
index baa57f3438603e6a5d0d2b406389df4110353ebe..c90feefd2ae03389877044ce2f706a64fdaf4a0b 100644 (file)
@@ -4,6 +4,7 @@ import re
 import json
 
 from .common import InfoExtractor
+from ..utils import compat_urllib_request
 
 
 class VeohIE(InfoExtractor):
@@ -24,6 +25,13 @@ class VeohIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
         webpage = self._download_webpage(url, video_id)
+        age_limit = 0
+        if 'class="adultwarning-container"' in webpage:
+            self.report_age_confirmation()
+            age_limit = 18
+            request = compat_urllib_request.Request(url)
+            request.add_header('Cookie', 'confirmedAdult=true')
+            webpage = self._download_webpage(request, video_id)
 
         m_youtube = re.search(r'http://www\.youtube\.com/v/(.*?)(\&|")', webpage)
         if m_youtube is not None:
@@ -44,4 +52,5 @@ class VeohIE(InfoExtractor):
             'thumbnail': info.get('highResImage') or info.get('medResImage'),
             'description': info['description'],
             'view_count': info['views'],
+            'age_limit': age_limit,
         }
index f51d4dcfa6c0cbda5fb8c53d0421ac099cea8295..4172821290d8ca14ca04f30400e48b37df6a243b 100644 (file)
@@ -113,8 +113,8 @@ class VestiIE(InfoExtractor):
         priority_transport = playlist['priority_transport']
 
         thumbnail = media['picture']
-        width = media['width']
-        height = media['height']
+        width = int_or_none(media['width'])
+        height = int_or_none(media['height'])
         description = media['anons']
         title = media['title']
         duration = int_or_none(media.get('duration'))
index e458ac961f701391c2fb651b1557ac7070dc6e2a..fa147a5757df04ca69bac2e3dc6054fe6d2a7797 100644 (file)
@@ -24,9 +24,10 @@ class VevoIE(InfoExtractor):
         (?P<id>[^&?#]+)'''
     _TESTS = [{
         'url': 'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280',
-        'file': 'GB1101300280.mp4',
         "md5": "06bea460acb744eab74a9d7dcb4bfd61",
         'info_dict': {
+            'id': 'GB1101300280',
+            'ext': 'mp4',
             "upload_date": "20130624",
             "uploader": "Hurts",
             "title": "Somebody to Die For",
@@ -34,6 +35,33 @@ class VevoIE(InfoExtractor):
             "width": 1920,
             "height": 1080,
         }
+    }, {
+        'note': 'v3 SMIL format',
+        'url': 'http://www.vevo.com/watch/cassadee-pope/i-wish-i-could-break-your-heart/USUV71302923',
+        'md5': '893ec0e0d4426a1d96c01de8f2bdff58',
+        'info_dict': {
+            'id': 'USUV71302923',
+            'ext': 'mp4',
+            'upload_date': '20140219',
+            'uploader': 'Cassadee Pope',
+            'title': 'I Wish I Could Break Your Heart',
+            'duration': 226.101,
+            'age_limit': 0,
+        }
+    }, {
+        'note': 'Age-limited video',
+        'url': 'https://www.vevo.com/watch/justin-timberlake/tunnel-vision-explicit/USRV81300282',
+        'info_dict': {
+            'id': 'USRV81300282',
+            'ext': 'mp4',
+            'age_limit': 18,
+            'title': 'Tunnel Vision (Explicit)',
+            'uploader': 'Justin Timberlake',
+            'upload_date': '20130704',
+        },
+        'params': {
+            'skip_download': 'true',
+        }
     }]
     _SMIL_BASE_URL = 'http://smil.lvl3.vevo.com/'
 
@@ -105,9 +133,31 @@ class VevoIE(InfoExtractor):
         video_info = self._download_json(json_url, video_id)['video']
 
         formats = self._formats_from_json(video_info)
+
+        is_explicit = video_info.get('isExplicit')
+        if is_explicit is True:
+            age_limit = 18
+        elif is_explicit is False:
+            age_limit = 0
+        else:
+            age_limit = None
+
+        # Download SMIL
+        smil_blocks = sorted((
+            f for f in video_info['videoVersions']
+            if f['sourceType'] == 13),
+            key=lambda f: f['version'])
+
+        smil_url = '%s/Video/V2/VFILE/%s/%sr.smil' % (
+            self._SMIL_BASE_URL, video_id, video_id.lower())
+        if smil_blocks:
+            smil_url_m = self._search_regex(
+                r'url="([^"]+)"', smil_blocks[-1]['data'], 'SMIL URL',
+                fatal=False)
+            if smil_url_m is not None:
+                smil_url = smil_url_m
+
         try:
-            smil_url = '%s/Video/V2/VFILE/%s/%sr.smil' % (
-                self._SMIL_BASE_URL, video_id, video_id.lower())
             smil_xml = self._download_webpage(smil_url, video_id,
                                               'Downloading SMIL info')
             formats.extend(self._formats_from_smil(smil_xml))
@@ -128,4 +178,5 @@ class VevoIE(InfoExtractor):
             'upload_date': upload_date.strftime('%Y%m%d'),
             'uploader': video_info['mainArtists'][0]['artistName'],
             'duration': video_info['duration'],
+            'age_limit': age_limit,
         }
diff --git a/youtube_dl/extractor/videobam.py b/youtube_dl/extractor/videobam.py
new file mode 100644 (file)
index 0000000..cdfff05
--- /dev/null
@@ -0,0 +1,80 @@
+from __future__ import unicode_literals
+
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class VideoBamIE(InfoExtractor):
+    _VALID_URL = r'http://(?:www\.)?videobam\.com/(?:videos/download/)?(?P<id>[a-zA-Z]+)'
+
+    _TESTS = [
+        {
+            'url': 'http://videobam.com/OiJQM',
+            'md5': 'db471f27763a531f10416a0c58b5a1e0',
+            'info_dict': {
+                'id': 'OiJQM',
+                'ext': 'mp4',
+                'title': 'Is Alcohol Worse Than Ecstasy?',
+                'description': 'md5:d25b96151515c91debc42bfbb3eb2683',
+                'uploader': 'frihetsvinge',
+            },
+        },
+        {
+            'url': 'http://videobam.com/pqLvq',
+            'md5': 'd9a565b5379a99126ef94e1d7f9a383e',
+            'note': 'HD video',
+            'info_dict': {
+                'id': 'pqLvq',
+                'ext': 'mp4',
+            }
+        },
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        page = self._download_webpage('http://videobam.com/%s' % video_id, video_id, 'Downloading page')
+
+        formats = []
+
+        for preference, format_id in enumerate(['low', 'high']):
+            mobj = re.search(r"%s: '(?P<url>[^']+)'" % format_id, page)
+            if not mobj:
+                continue
+            formats.append({
+                'url': mobj.group('url'),
+                'ext': 'mp4',
+                'format_id': format_id,
+                'preference': preference,
+            })
+
+        if not formats:
+            player_config = json.loads(self._html_search_regex(r'var player_config = ({.+?});', page, 'player config'))
+            formats = [{
+                'url': item['url'],
+                'ext': 'mp4',
+            } for item in player_config['playlist'] if 'autoPlay' in item]
+
+        self._sort_formats(formats)
+
+        title = self._og_search_title(page, default='VideoBam', fatal=False)
+        description = self._og_search_description(page, default=None)
+        thumbnail = self._og_search_thumbnail(page)
+        uploader = self._html_search_regex(r'Upload by ([^<]+)</a>', page, 'uploader', fatal=False, default=None)
+        view_count = int_or_none(
+            self._html_search_regex(r'<strong>Views:</strong> (\d+) ', page, 'view count', fatal=False))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'view_count': view_count,
+            'formats': formats,
+            'age_limit': 18,
+        }
\ No newline at end of file
index c5ee84807db0b6c66b773c3a6a7be09c4f2eab70..10c3d992d5716c689fe142a99581a46629700524 100644 (file)
@@ -221,7 +221,9 @@ class VimeoIE(SubtitlesInfoExtractor):
         # Extract video thumbnail
         video_thumbnail = config["video"].get("thumbnail")
         if video_thumbnail is None:
-            _, video_thumbnail = sorted((int(width), t_url) for (width, t_url) in config["video"]["thumbs"].items())[-1]
+            video_thumbs = config["video"].get("thumbs")
+            if video_thumbs and isinstance(video_thumbs, dict):
+                _, video_thumbnail = sorted((int(width), t_url) for (width, t_url) in video_thumbs.items())[-1]
 
         # Extract video description
         video_description = None
index e14ff91d44b80b6d6278faece8453b856c39485e..5bbc8ba88bbdb6a0d069324a3fd191844ccf7773 100644 (file)
@@ -1,8 +1,10 @@
 from __future__ import unicode_literals
 
 import re
+import json
 
 from .common import InfoExtractor
+from ..utils import unified_strdate
 
 
 class VineIE(InfoExtractor):
@@ -13,31 +15,46 @@ class VineIE(InfoExtractor):
         'info_dict': {
             'id': 'b9KOOWX7HUx',
             'ext': 'mp4',
-            'uploader': 'Jack Dorsey',
             'title': 'Chicken.',
+            'description': 'Chicken.',
+            'upload_date': '20130519',
+            'uploader': 'Jack Dorsey',
+            'uploader_id': '76',
         },
     }
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-
         video_id = mobj.group('id')
-        webpage_url = 'https://vine.co/v/' + video_id
-        webpage = self._download_webpage(webpage_url, video_id)
 
-        self.report_extraction(video_id)
+        webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)
 
-        video_url = self._html_search_meta('twitter:player:stream', webpage,
-            'video URL')
+        data = json.loads(self._html_search_regex(
+            r'window\.POST_DATA = { %s: ({.+?}) }' % video_id, webpage, 'vine data'))
 
-        uploader = self._html_search_regex(r'<p class="username">(.*?)</p>',
-            webpage, 'uploader', fatal=False, flags=re.DOTALL)
+        formats = [
+            {
+                'url': data['videoLowURL'],
+                'ext': 'mp4',
+                'format_id': 'low',
+            },
+            {
+                'url': data['videoUrl'],
+                'ext': 'mp4',
+                'format_id': 'standard',
+            }
+        ]
 
         return {
             'id': video_id,
-            'url': video_url,
-            'ext': 'mp4',
             'title': self._og_search_title(webpage),
-            'thumbnail': self._og_search_thumbnail(webpage),
-            'uploader': uploader,
-        }
+            'description': data['description'],
+            'thumbnail': data['thumbnailUrl'],
+            'upload_date': unified_strdate(data['created']),
+            'uploader': data['username'],
+            'uploader_id': data['userIdStr'],
+            'like_count': data['likes']['count'],
+            'comment_count': data['comments']['count'],
+            'repost_count': data['reposts']['count'],
+            'formats': formats,
+        }
\ No newline at end of file
index a293b8875138d6fca999cc1240f47d15df433290..3b3bec92fedd06fe4f61cea73d3e1e97c425e290 100644 (file)
@@ -16,7 +16,7 @@ from ..utils import (
 
 class VKIE(InfoExtractor):
     IE_NAME = 'vk.com'
-    _VALID_URL = r'https?://vk\.com/(?:videos.*?\?.*?z=)?video(?P<id>.*?)(?:\?|%2F|$)'
+    _VALID_URL = r'https?://vk\.com/(?:video_ext\.php\?.*?\boid=(?P<oid>\d+).*?\bid=(?P<id>\d+)|(?:videos.*?\?.*?z=)?video(?P<videoid>.*?)(?:\?|%2F|$))'
     _NETRC_MACHINE = 'vk'
 
     _TESTS = [
@@ -42,6 +42,18 @@ class VKIE(InfoExtractor):
                 'duration': 558,
             }
         },
+        {
+            'note': 'Embedded video',
+            'url': 'http://vk.com/video_ext.php?oid=32194266&id=162925554&hash=7d8c2e0d5e05aeaa&hd=1',
+            'md5': 'c7ce8f1f87bec05b3de07fdeafe21a0a',
+            'info_dict': {
+                'id': '162925554',
+                'ext': 'mp4',
+                'uploader': 'Vladimir Gavrin',
+                'title': 'Lin Dan',
+                'duration': 101,
+            }
+        },
         {
             'url': 'http://vk.com/video-8871596_164049491',
             'md5': 'a590bcaf3d543576c9bd162812387666',
@@ -54,7 +66,7 @@ class VKIE(InfoExtractor):
                 'duration': 8352,
             },
             'skip': 'Requires vk account credentials',
-        }
+        },
     ]
 
     def _login(self):
@@ -82,7 +94,10 @@ class VKIE(InfoExtractor):
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = mobj.group('videoid')
+
+        if not video_id:
+            video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id'))
 
         info_url = 'http://vk.com/al_video.php?act=show&al=1&video=%s' % video_id
         info_page = self._download_webpage(info_url, video_id)
index 3237596a3ace9796001f8ab78921ca9b6c84d2d1..fc9237a3f2eb2b456302659b1954c2715e6feb35 100644 (file)
@@ -22,8 +22,8 @@ class WorldStarHipHopIE(InfoExtractor):
         webpage_src = self._download_webpage(url, video_id)
 
         m_vevo_id = re.search(r'videoId=(.*?)&amp?',
-            webpage_src)
-        
+                              webpage_src)
+
         if m_vevo_id is not None:
             self.to_screen(u'Vevo video detected:')
             return self.url_result('vevo:%s' % m_vevo_id.group(1), ie='Vevo')
index a75e1380d414568a4b8455ba7789da863bf24e87..5374495f9b08f4d13fd7552fd612c19339b99e54 100644 (file)
@@ -103,6 +103,7 @@ class XHamsterIE(InfoExtractor):
         }]
 
         if not hd:
+            mrss_url = self._search_regex(r'<link rel="canonical" href="([^"]+)', webpage, 'mrss_url')
             webpage = self._download_webpage(mrss_url + '?hd', video_id, note='Downloading HD webpage')
             if is_hd(webpage):
                 video_url = extract_video_url(webpage)
index 982619922d8ef5fdd0f260902b0253f5dce024dd..d3eefd086720a82aa5f78c52d941fc6aee35c655 100644 (file)
@@ -7,19 +7,24 @@ from .common import InfoExtractor
 from ..utils import (
     compat_urllib_parse_urlparse,
     compat_urllib_request,
+    parse_duration,
+    str_to_int,
 )
 
+
 class XTubeIE(InfoExtractor):
-    _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>xtube\.com/watch\.php\?v=(?P<videoid>[^/?&]+))'
+    _VALID_URL = r'https?://(?:www\.)?(?P<url>xtube\.com/watch\.php\?v=(?P<videoid>[^/?&]+))'
     _TEST = {
         'url': 'http://www.xtube.com/watch.php?v=kVTUy_G222_',
-        'file': 'kVTUy_G222_.mp4',
         'md5': '092fbdd3cbe292c920ef6fc6a8a9cdab',
         'info_dict': {
-            "title": "strange erotica",
-            "description": "surreal gay themed erotica...almost an ET kind of thing",
-            "uploader": "greenshowers",
-            "age_limit": 18,
+            'id': 'kVTUy_G222_',
+            'ext': 'mp4',
+            'title': 'strange erotica',
+            'description': 'surreal gay themed erotica...almost an ET kind of thing',
+            'uploader': 'greenshowers',
+            'duration': 450,
+            'age_limit': 18,
         }
     }
 
@@ -32,10 +37,23 @@ class XTubeIE(InfoExtractor):
         req.add_header('Cookie', 'age_verified=1')
         webpage = self._download_webpage(req, video_id)
 
-        video_title = self._html_search_regex(r'<div class="p_5px[^>]*>([^<]+)', webpage, 'title')
-        video_uploader = self._html_search_regex(r'so_s\.addVariable\("owner_u", "([^"]+)', webpage, 'uploader', fatal=False)
-        video_description = self._html_search_regex(r'<p class="video_description">([^<]+)', webpage, 'description', fatal=False)
-        video_url= self._html_search_regex(r'var videoMp4 = "([^"]+)', webpage, 'video_url').replace('\\/', '/')
+        video_title = self._html_search_regex(r'<p class="title">([^<]+)', webpage, 'title')
+        video_uploader = self._html_search_regex(
+            r'so_s\.addVariable\("owner_u", "([^"]+)', webpage, 'uploader', fatal=False)
+        video_description = self._html_search_regex(
+            r'<p class="fieldsDesc">([^<]+)', webpage, 'description', fatal=False)
+        video_url = self._html_search_regex(r'var videoMp4 = "([^"]+)', webpage, 'video_url').replace('\\/', '/')
+        duration = parse_duration(self._html_search_regex(
+            r'<span class="bold">Runtime:</span> ([^<]+)</p>', webpage, 'duration', fatal=False))
+        view_count = self._html_search_regex(
+            r'<span class="bold">Views:</span> ([\d,\.]+)</p>', webpage, 'view count', fatal=False)
+        if view_count:
+            view_count = str_to_int(view_count)
+        comment_count = self._html_search_regex(
+            r'<div id="commentBar">([\d,\.]+) Comments</div>', webpage, 'comment count', fatal=False)
+        if comment_count:
+            comment_count = str_to_int(comment_count)
+
         path = compat_urllib_parse_urlparse(video_url).path
         extension = os.path.splitext(path)[1][1:]
         format = path.split('/')[5].split('_')[:2]
@@ -48,6 +66,9 @@ class XTubeIE(InfoExtractor):
             'title': video_title,
             'uploader': video_uploader,
             'description': video_description,
+            'duration': duration,
+            'view_count': view_count,
+            'comment_count': comment_count,
             'url': video_url,
             'ext': extension,
             'format': format,
index e1ef90e384c437f635f0727014505bfde7b7b34e..166a0cf70caf5a52b5891763438cbc892b1a9533 100644 (file)
@@ -29,7 +29,6 @@ from ..utils import (
     ExtractorError,
     int_or_none,
     PagedList,
-    RegexNotFoundError,
     unescapeHTML,
     unified_strdate,
     orderedSet,
@@ -200,9 +199,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
         '135': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'DASH video', 'preference': -40},
         '136': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'DASH video', 'preference': -40},
         '137': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
-        '138': {'ext': 'mp4', 'height': 1081, 'resolution': '>1080p', 'format_note': 'DASH video', 'preference': -40},
+        '138': {'ext': 'mp4', 'height': 2160, 'resolution': '2160p', 'format_note': 'DASH video', 'preference': -40},
         '160': {'ext': 'mp4', 'height': 192, 'resolution': '192p', 'format_note': 'DASH video', 'preference': -40},
-        '264': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
+        '264': {'ext': 'mp4', 'height': 1440, 'resolution': '1440p', 'format_note': 'DASH video', 'preference': -40},
 
         # Dash mp4 audio
         '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
@@ -1489,11 +1488,15 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
         # the id of the playlist is just 'RD' + video_id
         url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
         webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
-        title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
-            get_element_by_attribute('class', 'title ', webpage))
+        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
+        title_span = (search_title('playlist-title') or
+            search_title('title long-title') or search_title('title'))
         title = clean_html(title_span)
-        video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s' % re.escape(playlist_id)
-        ids = orderedSet(re.findall(video_re, webpage))
+        video_re = r'''(?x)data-video-username="(.*?)".*?
+                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
+        matches = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))
+        # Some of the videos may have been deleted, their username field is empty
+        ids = [video_id for (username, video_id) in matches if username]
         url_results = self._ids_to_results(ids)
 
         return self.playlist_result(url_results, playlist_id, title)
@@ -1642,7 +1645,7 @@ class YoutubeChannelIE(InfoExtractor):
 
 class YoutubeUserIE(InfoExtractor):
     IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
-    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
+    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
     _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
     _GDATA_PAGE_SIZE = 50
     _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
@@ -1741,12 +1744,50 @@ class YoutubeSearchIE(SearchInfoExtractor):
                   for video_id in video_ids]
         return self.playlist_result(videos, query)
 
+
 class YoutubeSearchDateIE(YoutubeSearchIE):
     IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
     _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
     _SEARCH_KEY = 'ytsearchdate'
     IE_DESC = u'YouTube.com searches, newest videos first'
 
+
+class YoutubeSearchURLIE(InfoExtractor):
+    IE_DESC = u'YouTube.com search URLs'
+    IE_NAME = u'youtube:search_url'
+    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        query = compat_urllib_parse.unquote_plus(mobj.group('query'))
+
+        webpage = self._download_webpage(url, query)
+        result_code = self._search_regex(
+            r'(?s)<ol id="search-results"(.*?)</ol>', webpage, u'result HTML')
+
+        part_codes = re.findall(
+            r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)
+        entries = []
+        for part_code in part_codes:
+            part_title = self._html_search_regex(
+                r'(?s)title="([^"]+)"', part_code, 'item title', fatal=False)
+            part_url_snippet = self._html_search_regex(
+                r'(?s)href="([^"]+)"', part_code, 'item URL')
+            part_url = compat_urlparse.urljoin(
+                'https://www.youtube.com/', part_url_snippet)
+            entries.append({
+                '_type': 'url',
+                'url': part_url,
+                'title': part_title,
+            })
+
+        return {
+            '_type': 'playlist',
+            'entries': entries,
+            'title': query,
+        }
+
+
 class YoutubeShowIE(InfoExtractor):
     IE_DESC = u'YouTube.com (multi-season) shows'
     _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
index 829f002cf02f9c908a5057ab3c6b20f520e2f2ea..3b1ac4e9f5246e268e0c0b49d64249196270e9d4 100644 (file)
@@ -1,4 +1,5 @@
 # coding: utf-8
+from __future__ import unicode_literals
 
 import re
 
@@ -13,52 +14,42 @@ class ZDFIE(InfoExtractor):
     _VALID_URL = r'^https?://www\.zdf\.de/ZDFmediathek(?P<hash>#)?/(.*beitrag/(?:video/)?)(?P<video_id>[0-9]+)(?:/[^/?]+)?(?:\?.*)?'
 
     _TEST = {
-        u"url": u"http://www.zdf.de/ZDFmediathek/beitrag/video/2037704/ZDFspezial---Ende-des-Machtpokers--?bc=sts;stt",
-        u"file": u"2037704.webm",
-        u"info_dict": {
-            u"upload_date": u"20131127",
-            u"description": u"Union und SPD haben sich auf einen Koalitionsvertrag geeinigt. Aber was bedeutet das für die Bürger? Sehen Sie hierzu das ZDFspezial \"Ende des Machtpokers - Große Koalition für Deutschland\".",
-            u"uploader": u"spezial",
-            u"title": u"ZDFspezial - Ende des Machtpokers"
+        'url': 'http://www.zdf.de/ZDFmediathek/beitrag/video/2037704/ZDFspezial---Ende-des-Machtpokers--?bc=sts;stt',
+        'info_dict': {
+            'id': '2037704',
+            'ext': 'webm',
+            'title': 'ZDFspezial - Ende des Machtpokers',
+            'description': 'Union und SPD haben sich auf einen Koalitionsvertrag geeinigt. Aber was bedeutet das für die Bürger? Sehen Sie hierzu das ZDFspezial "Ende des Machtpokers - Große Koalition für Deutschland".',
+            'duration': 1022,
+            'uploader': 'spezial',
+            'uploader_id': '225948',
+            'upload_date': '20131127',
         },
-        u"skip": u"Videos on ZDF.de are depublicised in short order",
+        'skip': 'Videos on ZDF.de are depublicised in short order',
     }
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('video_id')
 
-        xml_url = u'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id
+        xml_url = 'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id
         doc = self._download_xml(
             xml_url, video_id,
-            note=u'Downloading video info',
-            errnote=u'Failed to download video info')
+            note='Downloading video info',
+            errnote='Failed to download video info')
 
         title = doc.find('.//information/title').text
         description = doc.find('.//information/detail').text
+        duration = int(doc.find('.//details/lengthSec').text)
         uploader_node = doc.find('.//details/originChannelTitle')
         uploader = None if uploader_node is None else uploader_node.text
-        duration_str = doc.find('.//details/length').text
-        duration_m = re.match(r'''(?x)^
-            (?P<hours>[0-9]{2})
-            :(?P<minutes>[0-9]{2})
-            :(?P<seconds>[0-9]{2})
-            (?:\.(?P<ms>[0-9]+)?)
-            ''', duration_str)
-        duration = (
-            (
-                (int(duration_m.group('hours')) * 60 * 60) +
-                (int(duration_m.group('minutes')) * 60) +
-                int(duration_m.group('seconds'))
-            )
-            if duration_m
-            else None
-        )
+        uploader_id_node = doc.find('.//details/originChannelId')
+        uploader_id = None if uploader_id_node is None else uploader_id_node.text
         upload_date = unified_strdate(doc.find('.//details/airtime').text)
 
         def xml_to_format(fnode):
             video_url = fnode.find('url').text
-            is_available = u'http://www.metafilegenerator' not in video_url
+            is_available = 'http://www.metafilegenerator' not in video_url
 
             format_id = fnode.attrib['basetype']
             format_m = re.match(r'''(?x)
@@ -71,22 +62,28 @@ class ZDFIE(InfoExtractor):
 
             quality = fnode.find('./quality').text
             abr = int(fnode.find('./audioBitrate').text) // 1000
-            vbr = int(fnode.find('./videoBitrate').text) // 1000
+            vbr_node = fnode.find('./videoBitrate')
+            vbr = None if vbr_node is None else int(vbr_node.text) // 1000
 
-            format_note = u''
+            width_node = fnode.find('./width')
+            width = None if width_node is None else int_or_none(width_node.text)
+            height_node = fnode.find('./height')
+            height = None if height_node is None else int_or_none(height_node.text)
+
+            format_note = ''
             if not format_note:
                 format_note = None
 
             return {
-                'format_id': format_id + u'-' + quality,
+                'format_id': format_id + '-' + quality,
                 'url': video_url,
                 'ext': ext,
                 'acodec': format_m.group('acodec'),
                 'vcodec': format_m.group('vcodec'),
                 'abr': abr,
                 'vbr': vbr,
-                'width': int_or_none(fnode.find('./width').text),
-                'height': int_or_none(fnode.find('./height').text),
+                'width': width,
+                'height': height,
                 'filesize': int_or_none(fnode.find('./filesize').text),
                 'format_note': format_note,
                 'protocol': proto,
@@ -103,9 +100,10 @@ class ZDFIE(InfoExtractor):
         return {
             'id': video_id,
             'title': title,
-            'formats': formats,
             'description': description,
-            'uploader': uploader,
             'duration': duration,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
             'upload_date': upload_date,
-        }
+            'formats': formats,
+        }
\ No newline at end of file
index 471516b8f6764d26c01818246b95e2e8b20c93fb..02b8f7c45b6519f86430051ba627ae0eb127a635 100644 (file)
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
+import contextlib
 import ctypes
 import datetime
 import email.utils
@@ -771,6 +772,7 @@ def unified_strdate(date_str):
         '%B %d %Y',
         '%b %d %Y',
         '%Y-%m-%d',
+        '%d.%m.%Y',
         '%d/%m/%Y',
         '%Y/%m/%d %H:%M:%S',
         '%Y-%m-%d %H:%M:%S',
@@ -779,6 +781,7 @@ def unified_strdate(date_str):
         '%Y-%m-%dT%H:%M:%S.%fZ',
         '%Y-%m-%dT%H:%M:%S.%f0Z',
         '%Y-%m-%dT%H:%M:%S',
+        '%Y-%m-%dT%H:%M:%S.%f',
         '%Y-%m-%dT%H:%M',
     ]
     for expression in format_expressions:
@@ -1244,3 +1247,19 @@ except TypeError:
 else:
     struct_pack = struct.pack
     struct_unpack = struct.unpack
+
+
+def read_batch_urls(batch_fd):
+    def fixup(url):
+        if not isinstance(url, compat_str):
+            url = url.decode('utf-8', 'replace')
+        BOM_UTF8 = u'\xef\xbb\xbf'
+        if url.startswith(BOM_UTF8):
+            url = url[len(BOM_UTF8):]
+        url = url.strip()
+        if url.startswith(('#', ';', ']')):
+            return False
+        return url
+
+    with contextlib.closing(batch_fd) as fd:
+        return [url for url in map(fixup, fd) if url]
index a92faa5a7625e6e9d4aa454de092795b5d583ae6..30cf4c188dc40fd89d8ecad148171b05f39bfaa7 100644 (file)
@@ -1,2 +1,2 @@
 
-__version__ = '2014.02.21.1'
+__version__ = '2014.03.04.2'