Merge remote-tracking branch 'jaimeMF/yt-toplists'
author: Philipp Hagemeister <phihag@phihag.de>
Mon, 9 Dec 2013 03:49:32 +0000 (04:49 +0100)
committer: Philipp Hagemeister <phihag@phihag.de>
Mon, 9 Dec 2013 03:49:32 +0000 (04:49 +0100)
79 files changed:
README.md
test/parameters.json
test/test_all_urls.py
test/test_playlists.py
test/test_utils.py
test/test_write_info_json.py
youtube_dl/FileDownloader.py
youtube_dl/YoutubeDL.py
youtube_dl/__init__.py
youtube_dl/extractor/__init__.py
youtube_dl/extractor/addanime.py
youtube_dl/extractor/appletrailers.py
youtube_dl/extractor/archiveorg.py
youtube_dl/extractor/arte.py
youtube_dl/extractor/auengine.py
youtube_dl/extractor/bambuser.py
youtube_dl/extractor/bliptv.py
youtube_dl/extractor/bloomberg.py
youtube_dl/extractor/clipfish.py
youtube_dl/extractor/comedycentral.py
youtube_dl/extractor/common.py
youtube_dl/extractor/cspan.py
youtube_dl/extractor/dailymotion.py
youtube_dl/extractor/daum.py
youtube_dl/extractor/dreisat.py
youtube_dl/extractor/eighttracks.py
youtube_dl/extractor/exfm.py
youtube_dl/extractor/faz.py
youtube_dl/extractor/fktv.py
youtube_dl/extractor/francetv.py
youtube_dl/extractor/gamekings.py
youtube_dl/extractor/gamespot.py
youtube_dl/extractor/gametrailers.py
youtube_dl/extractor/generic.py
youtube_dl/extractor/hotnewhiphop.py
youtube_dl/extractor/ign.py
youtube_dl/extractor/imdb.py
youtube_dl/extractor/instagram.py
youtube_dl/extractor/jukebox.py
youtube_dl/extractor/liveleak.py
youtube_dl/extractor/livestream.py
youtube_dl/extractor/metacafe.py
youtube_dl/extractor/metacritic.py
youtube_dl/extractor/mixcloud.py
youtube_dl/extractor/mtv.py
youtube_dl/extractor/muzu.py
youtube_dl/extractor/myspass.py
youtube_dl/extractor/naver.py
youtube_dl/extractor/ninegag.py [new file with mode: 0644]
youtube_dl/extractor/orf.py
youtube_dl/extractor/pbs.py
youtube_dl/extractor/pyvideo.py [new file with mode: 0644]
youtube_dl/extractor/redtube.py
youtube_dl/extractor/rutube.py
youtube_dl/extractor/slashdot.py
youtube_dl/extractor/smotri.py [new file with mode: 0644]
youtube_dl/extractor/soundcloud.py
youtube_dl/extractor/southparkstudios.py
youtube_dl/extractor/space.py
youtube_dl/extractor/stanfordoc.py
youtube_dl/extractor/tf1.py
youtube_dl/extractor/theplatform.py [new file with mode: 0644]
youtube_dl/extractor/trilulilu.py
youtube_dl/extractor/unistra.py
youtube_dl/extractor/veehd.py
youtube_dl/extractor/vevo.py
youtube_dl/extractor/vice.py
youtube_dl/extractor/viddler.py
youtube_dl/extractor/videofyme.py
youtube_dl/extractor/vimeo.py
youtube_dl/extractor/wat.py
youtube_dl/extractor/wimp.py
youtube_dl/extractor/wistia.py [new file with mode: 0644]
youtube_dl/extractor/xhamster.py
youtube_dl/extractor/yahoo.py
youtube_dl/extractor/youjizz.py
youtube_dl/extractor/youtube.py
youtube_dl/utils.py
youtube_dl/version.py

index af4d969d6be7ee80a0e3215269e3102369a883af..9d4835053752cada7e5cd5b7e41426bd4cdde9c3 100644 (file)
--- a/README.md
+++ b/README.md
@@ -30,13 +30,16 @@ which means you can modify it, redistribute it or use it however you like.
     --list-extractors          List all supported extractors and the URLs they
                                would handle
     --extractor-descriptions   Output descriptions of all supported extractors
-    --proxy URL                Use the specified HTTP/HTTPS proxy
+    --proxy URL                Use the specified HTTP/HTTPS proxy. Pass in an
+                               empty string (--proxy "") for direct connection
     --no-check-certificate     Suppress HTTPS certificate validation.
     --cache-dir DIR            Location in the filesystem where youtube-dl can
                                store downloaded information permanently. By
                                default $XDG_CACHE_HOME/youtube-dl or ~/.cache
                                /youtube-dl .
     --no-cache-dir             Disable filesystem caching
+    --bidi-workaround          Work around terminals that lack bidirectional
+                               text support. Requires fribidi executable in PATH
 
 ## Video Selection:
     --playlist-start NUMBER    playlist video to start at (default is 1)
@@ -55,8 +58,9 @@ which means you can modify it, redistribute it or use it however you like.
     --dateafter DATE           download only videos uploaded after this date
     --no-playlist              download only the currently playing video
     --age-limit YEARS          download only videos suitable for the given age
-    --download-archive FILE    Download only videos not present in the archive
-                               file. Record all downloaded videos in it.
+    --download-archive FILE    Download only videos not listed in the archive
+                               file. Record the IDs of all downloaded videos in
+                               it.
 
 ## Download Options:
     -r, --rate-limit LIMIT     maximum download rate in bytes per second (e.g.
@@ -130,8 +134,8 @@ which means you can modify it, redistribute it or use it however you like.
     -v, --verbose              print various debugging information
     --dump-intermediate-pages  print downloaded pages to debug problems(very
                                verbose)
-    --write-pages              Write downloaded pages to files in the current
-                               directory
+    --write-pages              Write downloaded intermediary pages to files in
+                               the current directory to debug problems
 
 ## Video Format Options:
     -f, --format FORMAT        video format code, specify the order of
@@ -182,7 +186,7 @@ which means you can modify it, redistribute it or use it however you like.
 
 # CONFIGURATION
 
-You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl.conf`.
+You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl.conf`. On Windows, the configuration file locations are `%APPDATA%\youtube-dl\config.txt` and `C:\Users\<Yourname>\youtube-dl.conf`.
 
 # OUTPUT TEMPLATE
 
@@ -272,14 +276,54 @@ This README file was originally written by Daniel Bolton (<https://github.com/db
 
 # BUGS
 
-Bugs and suggestions should be reported at: <https://github.com/rg3/youtube-dl/issues>
+Bugs and suggestions should be reported at: <https://github.com/rg3/youtube-dl/issues> . Unless you were prompted to do so or there is another pertinent reason (e.g. GitHub fails to accept the bug report), please do not send bug reports via personal email.
 
-Please include:
-
-* Your exact command line, like `youtube-dl -t "http://www.youtube.com/watch?v=uHlDtZ6Oc3s&feature=channel_video_title"`. A common mistake is not to escape the `&`. Putting URLs in quotes should solve this problem.
-* If possible re-run the command with `--verbose`, and include the full output, it is really helpful to us.
-* The output of `youtube-dl --version`
-* The output of `python --version`
-* The name and version of your Operating System ("Ubuntu 11.04 x64" or "Windows 7 x64" is usually enough).
+Please include the full output of the command when run with `--verbose`. The output (including the first lines) contains important debugging information. Issues without the full output are often not reproducible and therefore do not get solved in short order, if ever.
 
 For discussions, join us in the irc channel #youtube-dl on freenode.
+
+When you submit a request, please re-read it once to avoid a couple of mistakes (you can and should use this as a checklist):
+
+### Is the description of the issue itself sufficient?
+
+We often get issue reports that we cannot really decipher. While in most cases we eventually get the required information after asking back multiple times, this poses an unnecessary drain on our resources. Many contributors, including myself, are also not native speakers, so we may misread some parts.
+
+So please elaborate on what feature you are requesting, or what bug you want to be fixed. Make sure that it's obvious
+
+- What the problem is
+- How it could be fixed
+- What your proposed solution would look like
+
+If your report is shorter than two lines, it is almost certainly missing some of these, which makes it hard for us to respond to it. We're often too polite to close the issue outright, but the missing info makes misinterpretation likely. As a committer myself, I often get frustrated by these issues, since the only possible way for me to move forward on them is to ask for clarification over and over.
+
+For bug reports, this means that your report should contain the *complete* output of youtube-dl when called with the -v flag. The error message you get for (most) bugs even says so, but you would not believe how many of our bug reports do not contain this information.
+
+Site support requests must contain an example URL. An example URL is a URL you might want to download, like http://www.youtube.com/watch?v=BaW_jenozKc . There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. http://www.youtube.com/ ) is *not* an example URL.
+
+###  Are you using the latest version?
+
+Before reporting any issue, type youtube-dl -U. This should report that you're up-to-date. About 20% of the reports we receive are already fixed, but people are using outdated versions. This goes for feature requests as well.
+
+###  Is the issue already documented?
+
+Make sure that someone has not already opened the issue you're trying to open. Search at the top of the window or at https://github.com/rg3/youtube-dl/search?type=Issues . If there is an issue, feel free to write something along the lines of "This affects me as well, with version 2015.01.01. Here is some more information on the issue: ...". While some issues may be old, a new post into them often spurs rapid activity.
+
+###  Why are existing options not enough?
+
+Before requesting a new feature, please have a quick peek at [the list of supported options](https://github.com/rg3/youtube-dl/blob/master/README.md#synopsis). Many feature requests are for features that actually exist already! Please, absolutely do show off your work in the issue report and detail how the existing similar options do *not* solve your problem.
+
+###  Is there enough context in your bug report?
+
+People want to solve problems, and often think they do us a favor by breaking down their larger problems (e.g. wanting to skip already downloaded files) to a specific request (e.g. requesting us to look whether the file exists before downloading the info page). However, what often happens is that they break down the problem into two steps: One simple, and one impossible (or extremely complicated one).
+
+We are then presented with a very complicated request when the original problem could be solved far easier, e.g. by recording the downloaded video IDs in a separate file. To avoid this, you must include the greater context where it is non-obvious. In particular, every feature request that does not consist of adding support for a new site should contain a use case scenario that explains in what situation the missing feature would be useful.
+
+###  Does the issue involve one problem, and one problem only?
+
+Some of our users seem to think there is a limit on the number of issues they can or should open. There is no such limit. While it may seem appealing to be able to dump all your issues into one ticket, that means that someone who solves one of your issues cannot mark the issue as closed. Typically, reporting a bunch of issues leads to the ticket lingering since nobody wants to attack that behemoth, until someone mercifully splits the issue into multiple ones.
+
+In particular, every site support request issue should only pertain to services at one site (generally under a common domain, but always using the same backend technology). Do not request support for vimeo user videos, Whitehouse podcasts, and Google Plus pages in the same issue. Also, make sure that you don't post bug reports alongside feature requests. As a rule of thumb, a feature request does not include outputs of youtube-dl that are not immediately related to the feature at hand. Do not post reports of a network error alongside the request for a new video service.
+
+###  Is anyone going to need the feature?
+
+Only post features that you (or an incapacitated friend you can personally talk to) require. Do not post features because they seem like a good idea. If they are really useful, they will be requested by someone who requires them.
index f042880edbf0bcab661900b86809ade8ec7697af..487a46d56670c1ded91cd71ed35055e54232187b 100644 (file)
@@ -39,5 +39,6 @@
     "writeinfojson": true, 
     "writesubtitles": false,
     "allsubtitles": false,
-    "listssubtitles": false
+    "listssubtitles": false,
+    "socket_timeout": 20
 }
index 1f1adb6b46e0fa2e8a683e6593f699476397a0cd..6b9764c67e98ba47a63b227d824bb97b82757b53 100644 (file)
@@ -106,6 +106,10 @@ class TestAllURLsMatching(unittest.TestCase):
         self.assertMatch(':colbertreport', ['ComedyCentralShows'])
         self.assertMatch(':cr', ['ComedyCentralShows'])
 
+    def test_vimeo_matching(self):
+        self.assertMatch('http://vimeo.com/channels/tributes', ['vimeo:channel'])
+        self.assertMatch('http://vimeo.com/user7108434', ['vimeo:user'])
+
 
 if __name__ == '__main__':
     unittest.main()
index 167801ae246087aae4c7068cb11b84245e560649..87ca401e5be52eb24fd7dc6653691437ef060570 100644 (file)
@@ -15,13 +15,18 @@ from youtube_dl.extractor import (
     DailymotionPlaylistIE,
     DailymotionUserIE,
     VimeoChannelIE,
+    VimeoUserIE,
+    VimeoAlbumIE,
+    VimeoGroupsIE,
     UstreamChannelIE,
     SoundcloudSetIE,
     SoundcloudUserIE,
     LivestreamIE,
     NHLVideocenterIE,
     BambuserChannelIE,
-    BandcampAlbumIE
+    BandcampAlbumIE,
+    SmotriCommunityIE,
+    SmotriUserIE
 )
 
 
@@ -54,6 +59,30 @@ class TestPlaylists(unittest.TestCase):
         self.assertEqual(result['title'], u'Vimeo Tributes')
         self.assertTrue(len(result['entries']) > 24)
 
+    def test_vimeo_user(self):
+        dl = FakeYDL()
+        ie = VimeoUserIE(dl)
+        result = ie.extract('http://vimeo.com/nkistudio/videos')
+        self.assertIsPlaylist(result)
+        self.assertEqual(result['title'], u'Nki')
+        self.assertTrue(len(result['entries']) > 65)
+
+    def test_vimeo_album(self):
+        dl = FakeYDL()
+        ie = VimeoAlbumIE(dl)
+        result = ie.extract('http://vimeo.com/album/2632481')
+        self.assertIsPlaylist(result)
+        self.assertEqual(result['title'], u'Staff Favorites: November 2013')
+        self.assertTrue(len(result['entries']) > 12)
+
+    def test_vimeo_groups(self):
+        dl = FakeYDL()
+        ie = VimeoGroupsIE(dl)
+        result = ie.extract('http://vimeo.com/groups/rolexawards')
+        self.assertIsPlaylist(result)
+        self.assertEqual(result['title'], u'Rolex Awards for Enterprise')
+        self.assertTrue(len(result['entries']) > 72)
+
     def test_ustream_channel(self):
         dl = FakeYDL()
         ie = UstreamChannelIE(dl)
@@ -110,6 +139,24 @@ class TestPlaylists(unittest.TestCase):
         self.assertIsPlaylist(result)
         self.assertEqual(result['title'], u'Nightmare Night EP')
         self.assertTrue(len(result['entries']) >= 4)
+        
+    def test_smotri_community(self):
+        dl = FakeYDL()
+        ie = SmotriCommunityIE(dl)
+        result = ie.extract('http://smotri.com/community/video/kommuna')
+        self.assertIsPlaylist(result)
+        self.assertEqual(result['id'], u'kommuna')
+        self.assertEqual(result['title'], u'КПРФ')
+        self.assertTrue(len(result['entries']) >= 4)
+        
+    def test_smotri_user(self):
+        dl = FakeYDL()
+        ie = SmotriUserIE(dl)
+        result = ie.extract('http://smotri.com/user/inspector')
+        self.assertIsPlaylist(result)
+        self.assertEqual(result['id'], u'inspector')
+        self.assertEqual(result['title'], u'Inspector')
+        self.assertTrue(len(result['entries']) >= 9)
 
 if __name__ == '__main__':
     unittest.main()
index e9e590e749f131a0950c79bcf4fee1e9fb9004c2..0fa66beecd3f8f82b599704af260ca3d0aa0298d 100644 (file)
@@ -26,6 +26,7 @@ from youtube_dl.utils import (
     unsmuggle_url,
     shell_quote,
     encodeFilename,
+    str_to_int,
 )
 
 if sys.version_info < (3, 0):
@@ -176,6 +177,10 @@ class TestUtil(unittest.TestCase):
         args = ['ffmpeg', '-i', encodeFilename(u'ñ€ß\'.mp4')]
         self.assertEqual(shell_quote(args), u"""ffmpeg -i 'ñ€ß'"'"'.mp4'""")
 
+    def test_str_to_int(self):
+        self.assertEqual(str_to_int('123,456'), 123456)
+        self.assertEqual(str_to_int('123.456'), 123456)
+
 
 if __name__ == '__main__':
     unittest.main()
index d7177611b5e1a90aa3bdf612ae873336ff44d686..90426a559551571a7769c6137a76270a4ad47b1c 100644 (file)
@@ -33,6 +33,7 @@ TEST_ID = 'BaW_jenozKc'
 INFO_JSON_FILE = TEST_ID + '.info.json'
 DESCRIPTION_FILE = TEST_ID + '.mp4.description'
 EXPECTED_DESCRIPTION = u'''test chars:  "'/\ä↭𝕐
+test URL: https://github.com/rg3/youtube-dl/issues/1892
 
 This is a test video for youtube-dl.
 
index 3ff9716b33b22e39a0a6d925bfa33aba8fa092f9..47124932fc7e9ff3c40ec29d003757cdb20cf967 100644 (file)
@@ -204,11 +204,27 @@ class FileDownloader(object):
         """Report destination filename."""
         self.to_screen(u'[download] Destination: ' + filename)
 
+    def _report_progress_status(self, msg, is_last_line=False):
+        fullmsg = u'[download] ' + msg
+        if self.params.get('progress_with_newline', False):
+            self.to_screen(fullmsg)
+        else:
+            if os.name == 'nt':
+                prev_len = getattr(self, '_report_progress_prev_line_length',
+                                   0)
+                if prev_len > len(fullmsg):
+                    fullmsg += u' ' * (prev_len - len(fullmsg))
+                self._report_progress_prev_line_length = len(fullmsg)
+                clear_line = u'\r'
+            else:
+                clear_line = (u'\r\x1b[K' if sys.stderr.isatty() else u'\r')
+            self.to_screen(clear_line + fullmsg, skip_eol=not is_last_line)
+        self.to_console_title(u'youtube-dl ' + msg)
+
     def report_progress(self, percent, data_len_str, speed, eta):
         """Report download progress."""
         if self.params.get('noprogress', False):
             return
-        clear_line = (u'\x1b[K' if sys.stderr.isatty() and os.name != 'nt' else u'')
         if eta is not None:
             eta_str = self.format_eta(eta)
         else:
@@ -218,14 +234,29 @@ class FileDownloader(object):
         else:
             percent_str = 'Unknown %'
         speed_str = self.format_speed(speed)
-        if self.params.get('progress_with_newline', False):
-            self.to_screen(u'[download] %s of %s at %s ETA %s' %
-                (percent_str, data_len_str, speed_str, eta_str))
+
+        msg = (u'%s of %s at %s ETA %s' %
+               (percent_str, data_len_str, speed_str, eta_str))
+        self._report_progress_status(msg)
+
+    def report_progress_live_stream(self, downloaded_data_len, speed, elapsed):
+        if self.params.get('noprogress', False):
+            return
+        downloaded_str = format_bytes(downloaded_data_len)
+        speed_str = self.format_speed(speed)
+        elapsed_str = FileDownloader.format_seconds(elapsed)
+        msg = u'%s at %s (%s)' % (downloaded_str, speed_str, elapsed_str)
+        self._report_progress_status(msg)
+
+    def report_finish(self, data_len_str, tot_time):
+        """Report download finished."""
+        if self.params.get('noprogress', False):
+            self.to_screen(u'[download] Download completed')
         else:
-            self.to_screen(u'\r%s[download] %s of %s at %s ETA %s' %
-                (clear_line, percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
-        self.to_console_title(u'youtube-dl - %s of %s at %s ETA %s' %
-                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
+            self._report_progress_status(
+                (u'100%% of %s in %s' %
+                 (data_len_str, self.format_seconds(tot_time))),
+                is_last_line=True)
 
     def report_resuming_byte(self, resume_len):
         """Report attempt to resume at given byte."""
@@ -246,16 +277,7 @@ class FileDownloader(object):
         """Report it was impossible to resume download."""
         self.to_screen(u'[download] Unable to resume')
 
-    def report_finish(self, data_len_str, tot_time):
-        """Report download finished."""
-        if self.params.get('noprogress', False):
-            self.to_screen(u'[download] Download completed')
-        else:
-            clear_line = (u'\x1b[K' if sys.stderr.isatty() and os.name != 'nt' else u'')
-            self.to_screen(u'\r%s[download] 100%% of %s in %s' %
-                (clear_line, data_len_str, self.format_seconds(tot_time)))
-
-    def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url, live):
+    def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url, live, conn):
         def run_rtmpdump(args):
             start = time.time()
             resume_percent = None
@@ -301,11 +323,27 @@ class FileDownloader(object):
                         'eta': eta,
                         'speed': speed,
                     })
-                elif self.params.get('verbose', False):
-                    if not cursor_in_new_line:
-                        self.to_screen(u'')
-                    cursor_in_new_line = True
-                    self.to_screen(u'[rtmpdump] '+line)
+                else:
+                    # no percent for live streams
+                    mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec', line)
+                    if mobj:
+                        downloaded_data_len = int(float(mobj.group(1))*1024)
+                        time_now = time.time()
+                        speed = self.calc_speed(start, time_now, downloaded_data_len)
+                        self.report_progress_live_stream(downloaded_data_len, speed, time_now - start)
+                        cursor_in_new_line = False
+                        self._hook_progress({
+                            'downloaded_bytes': downloaded_data_len,
+                            'tmpfilename': tmpfilename,
+                            'filename': filename,
+                            'status': 'downloading',
+                            'speed': speed,
+                        })
+                    elif self.params.get('verbose', False):
+                        if not cursor_in_new_line:
+                            self.to_screen(u'')
+                        cursor_in_new_line = True
+                        self.to_screen(u'[rtmpdump] '+line)
             proc.wait()
             if not cursor_in_new_line:
                 self.to_screen(u'')
@@ -338,6 +376,8 @@ class FileDownloader(object):
             basic_args += ['--stop', '1']
         if live:
             basic_args += ['--live']
+        if conn:
+            basic_args += ['--conn', conn]
         args = basic_args + [[], ['--resume', '--skip', '1']][self.params.get('continuedl', False)]
 
         if sys.platform == 'win32' and sys.version_info < (3, 0):
@@ -479,7 +519,8 @@ class FileDownloader(object):
                                                 info_dict.get('page_url', None),
                                                 info_dict.get('play_path', None),
                                                 info_dict.get('tc_url', None),
-                                                info_dict.get('rtmp_live', False))
+                                                info_dict.get('rtmp_live', False),
+                                                info_dict.get('rtmp_conn', None))
 
         # Attempt to download using mplayer
         if url.startswith('mms') or url.startswith('rtsp'):
index b822930cbe358226c70b514287509d391adc116c..3fbbf3ba09f957c96b8178d01e7bfa7ac2c9df26 100644 (file)
@@ -132,6 +132,9 @@ class YoutubeDL(object):
     cookiefile:        File name where cookies should be read from and dumped to.
     nocheckcertificate:Do not verify SSL certificates
     proxy:             URL of the proxy server to use
+    socket_timeout:    Time to wait for unresponsive hosts, in seconds
+    bidi_workaround:   Work around buggy terminals without bidirectional text
+                       support, using fribidi
 
     The following parameters are not used by YoutubeDL itself, they are used by
     the FileDownloader:
@@ -155,8 +158,45 @@ class YoutubeDL(object):
         self._download_retcode = 0
         self._num_downloads = 0
         self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
+        self._err_file = sys.stderr
         self.params = {} if params is None else params
 
+        # Pipe messages through fribidi
+        if params.get('bidi_workaround', False):
+            # fribidi does not support ungetting, so force newlines
+            params['progress_with_newline'] = True
+
+            for fid in ['_screen_file', '_err_file']:
+                class FribidiOut(object):
+                    def __init__(self, outfile, errfile):
+                        self.outfile = outfile
+                        self.process = subprocess.Popen(
+                            ['fribidi'],
+                            stdin=subprocess.PIPE,
+                            stdout=outfile,
+                            stderr=errfile)
+
+                    def write(self, s):
+                        res = self.process.stdin.write(s)
+                        self.flush()
+                        return res
+
+                    def flush(self):
+                        return self.process.stdin.flush()
+
+                    def isatty(self):
+                        return self.outfile.isatty()
+
+                try:
+                    vout = FribidiOut(getattr(self, fid), self._err_file)
+                    setattr(self, fid, vout)
+                except OSError as ose:
+                    if ose.errno == 2:
+                        self.report_warning(u'Could not find fribidi executable, ignoring --bidi-workaround . Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')
+                        break
+                    else:
+                        raise
+
         if (sys.version_info >= (3,) and sys.platform != 'win32' and
                 sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
                 and not params['restrictfilenames']):
@@ -205,10 +245,14 @@ class YoutubeDL(object):
         pp.set_downloader(self)
 
     def to_screen(self, message, skip_eol=False):
+        """Print message to stdout if not in quiet mode."""
+        return self.to_stdout(message, skip_eol, check_quiet=True)
+
+    def to_stdout(self, message, skip_eol=False, check_quiet=False):
         """Print message to stdout if not in quiet mode."""
         if self.params.get('logger'):
             self.params['logger'].debug(message)
-        elif not self.params.get('quiet', False):
+        elif not check_quiet or not self.params.get('quiet', False):
             terminator = [u'\n', u''][skip_eol]
             output = message + terminator
             write_string(output, self._screen_file)
@@ -220,9 +264,7 @@ class YoutubeDL(object):
             self.params['logger'].error(message)
         else:
             output = message + u'\n'
-            if 'b' in getattr(self._screen_file, 'mode', '') or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
-                output = output.encode(preferredencoding())
-            sys.stderr.write(output)
+            write_string(output, self._err_file)
 
     def to_console_title(self, message):
         if not self.params.get('consoletitle', False):
@@ -293,7 +335,7 @@ class YoutubeDL(object):
         Print the message to stderr, it will be prefixed with 'WARNING:'
         If stderr is a tty file the 'WARNING:' will be colored
         '''
-        if sys.stderr.isatty() and os.name != 'nt':
+        if self._err_file.isatty() and os.name != 'nt':
             _msg_header = u'\033[0;33mWARNING:\033[0m'
         else:
             _msg_header = u'WARNING:'
@@ -305,7 +347,7 @@ class YoutubeDL(object):
         Do the same as trouble, but prefixes the message with 'ERROR:', colored
         in red if stderr is a tty file.
         '''
-        if sys.stderr.isatty() and os.name != 'nt':
+        if self._err_file.isatty() and os.name != 'nt':
             _msg_header = u'\033[0;31mERROR:\033[0m'
         else:
             _msg_header = u'ERROR:'
@@ -404,7 +446,8 @@ class YoutubeDL(object):
         for key, value in extra_info.items():
             info_dict.setdefault(key, value)
 
-    def extract_info(self, url, download=True, ie_key=None, extra_info={}):
+    def extract_info(self, url, download=True, ie_key=None, extra_info={},
+                     process=True):
         '''
         Returns a list with a dictionary for each video we find.
         If 'download', also downloads the videos.
@@ -440,7 +483,10 @@ class YoutubeDL(object):
                         'webpage_url': url,
                         'extractor_key': ie.ie_key(),
                     })
-                return self.process_ie_result(ie_result, download, extra_info)
+                if process:
+                    return self.process_ie_result(ie_result, download, extra_info)
+                else:
+                    return ie_result
             except ExtractorError as de: # An error we somewhat expected
                 self.report_error(compat_str(de), de.format_traceback())
                 break
@@ -473,8 +519,33 @@ class YoutubeDL(object):
                                      download,
                                      ie_key=ie_result.get('ie_key'),
                                      extra_info=extra_info)
+        elif result_type == 'url_transparent':
+            # Use the information from the embedding page
+            info = self.extract_info(
+                ie_result['url'], ie_key=ie_result.get('ie_key'),
+                extra_info=extra_info, download=False, process=False)
+
+            def make_result(embedded_info):
+                new_result = ie_result.copy()
+                for f in ('_type', 'url', 'ext', 'player_url', 'formats',
+                          'entries', 'urlhandle', 'ie_key', 'duration',
+                          'subtitles', 'annotations', 'format',
+                          'thumbnail', 'thumbnails'):
+                    if f in new_result:
+                        del new_result[f]
+                    if f in embedded_info:
+                        new_result[f] = embedded_info[f]
+                return new_result
+            new_result = make_result(info)
+
+            assert new_result.get('_type') != 'url_transparent'
+            if new_result.get('_type') == 'compat_list':
+                new_result['entries'] = [
+                    make_result(e) for e in new_result['entries']]
+
+            return self.process_ie_result(
+                new_result, download=download, extra_info=extra_info)
         elif result_type == 'playlist':
-
             # We process each entry in the playlist
             playlist = ie_result.get('title', None) or ie_result.get('id', None)
             self.to_screen(u'[download] Downloading playlist: %s' % playlist)
@@ -665,22 +736,23 @@ class YoutubeDL(object):
 
         # Forced printings
         if self.params.get('forcetitle', False):
-            compat_print(info_dict['fulltitle'])
+            self.to_stdout(info_dict['fulltitle'])
         if self.params.get('forceid', False):
-            compat_print(info_dict['id'])
+            self.to_stdout(info_dict['id'])
         if self.params.get('forceurl', False):
             # For RTMP URLs, also include the playpath
-            compat_print(info_dict['url'] + info_dict.get('play_path', u''))
+            self.to_stdout(info_dict['url'] + info_dict.get('play_path', u''))
         if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
-            compat_print(info_dict['thumbnail'])
+            self.to_stdout(info_dict['thumbnail'])
         if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
-            compat_print(info_dict['description'])
+            self.to_stdout(info_dict['description'])
         if self.params.get('forcefilename', False) and filename is not None:
-            compat_print(filename)
+            self.to_stdout(filename)
         if self.params.get('forceformat', False):
-            compat_print(info_dict['format'])
+            self.to_stdout(info_dict['format'])
         if self.params.get('forcejson', False):
-            compat_print(json.dumps(info_dict))
+            info_dict['_filename'] = filename
+            self.to_stdout(json.dumps(info_dict))
 
         # Do nothing else if in simulate mode
         if self.params.get('simulate', False):
@@ -969,7 +1041,10 @@ class YoutubeDL(object):
                 proxy_map.update(handler.proxies)
         write_string(u'[debug] Proxy map: ' + compat_str(proxy_map) + u'\n')
 
-    def _setup_opener(self, timeout=20):
+    def _setup_opener(self):
+        timeout_val = self.params.get('socket_timeout')
+        timeout = 600 if timeout_val is None else float(timeout_val)
+
         opts_cookiefile = self.params.get('cookiefile')
         opts_proxy = self.params.get('proxy')
 
index 92e583744df557b02c76dfa1299c73fdbdd4d53c..2e3f969193eb9a777cd3e847eb550a80ad79e18a 100644 (file)
@@ -36,6 +36,7 @@ __authors__  = (
     'Marcin Cieślak',
     'Anton Larionov',
     'Takuya Tsuchida',
+    'Sergey M.',
 )
 
 __license__ = 'Public Domain'
@@ -80,11 +81,11 @@ from .PostProcessor import (
 
 
 def parseOpts(overrideArguments=None):
-    def _readOptions(filename_bytes):
+    def _readOptions(filename_bytes, default=[]):
         try:
             optionf = open(filename_bytes)
         except IOError:
-            return [] # silently skip if file is not present
+            return default  # silently skip if file is not present
         try:
             res = []
             for l in optionf:
@@ -190,7 +191,9 @@ def parseOpts(overrideArguments=None):
     general.add_option('--extractor-descriptions',
             action='store_true', dest='list_extractor_descriptions',
             help='Output descriptions of all supported extractors', default=False)
-    general.add_option('--proxy', dest='proxy', default=None, help='Use the specified HTTP/HTTPS proxy', metavar='URL')
+    general.add_option(
+        '--proxy', dest='proxy', default=None, metavar='URL',
+        help='Use the specified HTTP/HTTPS proxy. Pass in an empty string (--proxy "") for direct connection')
     general.add_option('--no-check-certificate', action='store_true', dest='no_check_certificate', default=False, help='Suppress HTTPS certificate validation.')
     general.add_option(
         '--cache-dir', dest='cachedir', default=get_cachedir(), metavar='DIR',
@@ -198,6 +201,12 @@ def parseOpts(overrideArguments=None):
     general.add_option(
         '--no-cache-dir', action='store_const', const=None, dest='cachedir',
         help='Disable filesystem caching')
+    general.add_option(
+        '--socket-timeout', dest='socket_timeout',
+        type=float, default=None, help=optparse.SUPPRESS_HELP)
+    general.add_option(
+        '--bidi-workaround', dest='bidi_workaround', action='store_true',
+        help=u'Work around terminals that lack bidirectional text support. Requires fribidi executable in PATH')
 
 
     selection.add_option('--playlist-start',
@@ -220,7 +229,7 @@ def parseOpts(overrideArguments=None):
                          default=None, type=int)
     selection.add_option('--download-archive', metavar='FILE',
                          dest='download_archive',
-                         help='Download only videos not present in the archive file. Record the IDs of all downloaded videos in it.')
+                         help='Download only videos not listed in the archive file. Record the IDs of all downloaded videos in it.')
 
 
     authentication.add_option('-u', '--username',
@@ -415,6 +424,8 @@ def parseOpts(overrideArguments=None):
         if opts.verbose:
             write_string(u'[debug] Override config: ' + repr(overrideArguments) + '\n')
     else:
+        systemConf = _readOptions('/etc/youtube-dl.conf')
+
         xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
         if xdg_config_home:
             userConfFile = os.path.join(xdg_config_home, 'youtube-dl', 'config')
@@ -424,8 +435,31 @@ def parseOpts(overrideArguments=None):
             userConfFile = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl', 'config')
             if not os.path.isfile(userConfFile):
                 userConfFile = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
-        systemConf = _readOptions('/etc/youtube-dl.conf')
-        userConf = _readOptions(userConfFile)
+        userConf = _readOptions(userConfFile, None)
+
+        if userConf is None:
+            appdata_dir = os.environ.get('appdata')
+            if appdata_dir:
+                userConf = _readOptions(
+                    os.path.join(appdata_dir, 'youtube-dl', 'config'),
+                    default=None)
+                if userConf is None:
+                    userConf = _readOptions(
+                        os.path.join(appdata_dir, 'youtube-dl', 'config.txt'),
+                        default=None)
+
+        if userConf is None:
+            userConf = _readOptions(
+                os.path.join(os.path.expanduser('~'), 'youtube-dl.conf'),
+                default=None)
+        if userConf is None:
+            userConf = _readOptions(
+                os.path.join(os.path.expanduser('~'), 'youtube-dl.conf.txt'),
+                default=None)
+
+        if userConf is None:
+            userConf = []
+
         commandLineConf = sys.argv[1:]
         argv = systemConf + userConf + commandLineConf
         opts, args = parser.parse_args(argv)
@@ -652,6 +686,8 @@ def _real_main(argv=None):
         'cookiefile': opts.cookiefile,
         'nocheckcertificate': opts.no_check_certificate,
         'proxy': opts.proxy,
+        'socket_timeout': opts.socket_timeout,
+        'bidi_workaround': opts.bidi_workaround,
     }
 
     with YoutubeDL(ydl_opts) as ydl:
index 0abf86e44ca2590eeb0899eb7bec540a5b5320c0..3f740baa13ff8c2c5f6891cc32042ed14b10188c 100644 (file)
@@ -8,6 +8,7 @@ from .arte import (
     ArteTVPlus7IE,
     ArteTVCreativeIE,
     ArteTVFutureIE,
+    ArteTVDDCIE,
 )
 from .auengine import AUEngineIE
 from .bambuser import BambuserIE, BambuserChannelIE
@@ -56,7 +57,7 @@ from .flickr import FlickrIE
 from .francetv import (
     PluzzIE,
     FranceTvInfoIE,
-    France2IE,
+    FranceTVIE,
     GenerationQuoiIE
 )
 from .freesound import FreesoundIE
@@ -102,6 +103,7 @@ from .nbc import NBCNewsIE
 from .newgrounds import NewgroundsIE
 from .nhl import NHLIE, NHLVideocenterIE
 from .niconico import NiconicoIE
+from .ninegag import NineGagIE
 from .nowvideo import NowVideoIE
 from .ooyala import OoyalaIE
 from .orf import ORFIE
@@ -110,6 +112,7 @@ from .photobucket import PhotobucketIE
 from .podomatic import PodomaticIE
 from .pornhub import PornHubIE
 from .pornotube import PornotubeIE
+from .pyvideo import PyvideoIE
 from .rbmaradio import RBMARadioIE
 from .redtube import RedTubeIE
 from .ringtv import RingTVIE
@@ -121,6 +124,12 @@ from .rutube import RutubeIE
 from .sina import SinaIE
 from .slashdot import SlashdotIE
 from .slideshare import SlideshareIE
+from .smotri import (
+    SmotriIE,
+    SmotriCommunityIE,
+    SmotriUserIE,
+    SmotriBroadcastIE,
+)
 from .sohu import SohuIE
 from .soundcloud import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE
 from .southparkstudios import (
@@ -139,6 +148,7 @@ from .teamcoco import TeamcocoIE
 from .techtalks import TechTalksIE
 from .ted import TEDIE
 from .tf1 import TF1IE
+from .theplatform import ThePlatformIE
 from .thisav import ThisAVIE
 from .toutv import TouTvIE
 from .traileraddict import TrailerAddictIE
@@ -159,7 +169,13 @@ from .viddler import ViddlerIE
 from .videodetective import VideoDetectiveIE
 from .videofyme import VideofyMeIE
 from .videopremium import VideoPremiumIE
-from .vimeo import VimeoIE, VimeoChannelIE
+from .vimeo import (
+    VimeoIE,
+    VimeoChannelIE,
+    VimeoUserIE,
+    VimeoAlbumIE,
+    VimeoGroupsIE,
+)
 from .vine import VineIE
 from .viki import VikiIE
 from .vk import VKIE
@@ -167,6 +183,7 @@ from .wat import WatIE
 from .websurg import WeBSurgIE
 from .weibo import WeiboIE
 from .wimp import WimpIE
+from .wistia import WistiaIE
 from .worldstarhiphop import WorldStarHipHopIE
 from .xhamster import XHamsterIE
 from .xnxx import XNXXIE
index b99d4b96689c23a13379d4392484c3763ce0e36f..a3a1b999df25da791617c46a793843b2fd6ddc99 100644 (file)
@@ -13,7 +13,7 @@ from ..utils import (
 
 class AddAnimeIE(InfoExtractor):
 
-    _VALID_URL = r'^http://(?:\w+\.)?add-anime\.net/watch_video.php\?(?:.*?)v=(?P<video_id>[\w_]+)(?:.*)'
+    _VALID_URL = r'^http://(?:\w+\.)?add-anime\.net/watch_video\.php\?(?:.*?)v=(?P<video_id>[\w_]+)(?:.*)'
     IE_NAME = u'AddAnime'
     _TEST = {
         u'url': u'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9',
index 6d6237f8af79c02048da0e1b1624f33086a120b6..a527f10de250596e42f19f0957433e2a72fe5bbf 100644 (file)
@@ -10,7 +10,7 @@ from ..utils import (
 
 
 class AppleTrailersIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?trailers.apple.com/trailers/(?P<company>[^/]+)/(?P<movie>[^/]+)'
+    _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/trailers/(?P<company>[^/]+)/(?P<movie>[^/]+)'
     _TEST = {
         u"url": u"http://trailers.apple.com/trailers/wb/manofsteel/",
         u"playlist": [
@@ -113,7 +113,7 @@ class AppleTrailersIE(InfoExtractor):
                 })
             formats = sorted(formats, key=lambda f: (f['height'], f['width']))
 
-            info = {
+            playlist.append({
                 '_type': 'video',
                 'id': video_id,
                 'title': title,
@@ -124,12 +124,7 @@ class AppleTrailersIE(InfoExtractor):
                 'upload_date': upload_date,
                 'uploader_id': uploader_id,
                 'user_agent': 'QuickTime compatible (youtube-dl)',
-            }
-            # TODO: Remove when #980 has been merged
-            info['url'] = formats[-1]['url']
-            info['ext'] = formats[-1]['ext']
-
-            playlist.append(info)
+            })
 
         return {
             '_type': 'playlist',
index 61ce4469a05dd3cdf9bddbecf8c82119c40b5c3f..8bb546410f7a7486bdaa964bc724cf2c501e8851 100644 (file)
@@ -11,7 +11,7 @@ from ..utils import (
 class ArchiveOrgIE(InfoExtractor):
     IE_NAME = 'archive.org'
     IE_DESC = 'archive.org videos'
-    _VALID_URL = r'(?:https?://)?(?:www\.)?archive.org/details/(?P<id>[^?/]+)(?:[?].*)?$'
+    _VALID_URL = r'(?:https?://)?(?:www\.)?archive\.org/details/(?P<id>[^?/]+)(?:[?].*)?$'
     _TEST = {
         u"url": u"http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect",
         u'file': u'XD300-23_68HighlightsAResearchCntAugHumanIntellect.ogv',
@@ -49,7 +49,7 @@ class ArchiveOrgIE(InfoExtractor):
         for f in formats:
             f['ext'] = determine_ext(f['url'])
 
-        info = {
+        return {
             '_type': 'video',
             'id': video_id,
             'title': title,
@@ -57,12 +57,5 @@ class ArchiveOrgIE(InfoExtractor):
             'description': description,
             'uploader': uploader,
             'upload_date': upload_date,
+            'thumbnail': data.get('misc', {}).get('image'),
         }
-        thumbnail = data.get('misc', {}).get('image')
-        if thumbnail:
-            info['thumbnail'] = thumbnail
-
-        # TODO: Remove when #980 has been merged
-        info.update(formats[-1])
-
-        return info
index 8b62ee774cc021d4b77e97aced8034f72d4d64e4..4b7bef775ee1e029a1093785ffc32c9f220f3fa6 100644 (file)
@@ -10,6 +10,7 @@ from ..utils import (
     determine_ext,
     get_element_by_id,
     compat_str,
+    get_element_by_attribute,
 )
 
 # There are different sources of video in arte.tv, the extraction process 
@@ -17,8 +18,8 @@ from ..utils import (
 # add tests.
 
 class ArteTvIE(InfoExtractor):
-    _VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?P<lang>fr|de)/.*-(?P<id>.*?).html'
-    _LIVEWEB_URL = r'(?:http://)?liveweb.arte.tv/(?P<lang>fr|de)/(?P<subpage>.+?)/(?P<name>.+)'
+    _VIDEOS_URL = r'(?:http://)?videos\.arte\.tv/(?P<lang>fr|de)/.*-(?P<id>.*?)\.html'
+    _LIVEWEB_URL = r'(?:http://)?liveweb\.arte\.tv/(?P<lang>fr|de)/(?P<subpage>.+?)/(?P<name>.+)'
     _LIVE_URL = r'index-[0-9]+\.html$'
 
     IE_NAME = u'arte.tv'
@@ -142,7 +143,9 @@ class ArteTVPlus7IE(InfoExtractor):
 
     def _extract_from_webpage(self, webpage, video_id, lang):
         json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')
+        return self._extract_from_json_url(json_url, video_id, lang)
 
+    def _extract_from_json_url(self, json_url, video_id, lang):
         json_info = self._download_webpage(json_url, video_id, 'Downloading info json')
         self.report_extraction(video_id)
         info = json.loads(json_info)
@@ -257,3 +260,40 @@ class ArteTVFutureIE(ArteTVPlus7IE):
         webpage = self._download_webpage(url, anchor_id)
         row = get_element_by_id(anchor_id, webpage)
         return self._extract_from_webpage(row, anchor_id, lang)
+
+
+class ArteTVDDCIE(ArteTVPlus7IE):
+    """Extractor for ddc.arte.tv pages (``emission`` and ``folge`` URLs)."""
+    IE_NAME = u'arte.tv:ddc'
+    # 'https?' makes the 's' optional; 'http?' would accept 'htt://' and reject https
+    _VALID_URL = r'https?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>.+)'
+
+    _TEST = {
+        u'url': u'http://ddc.arte.tv/folge/neues-aus-mauretanien',
+        u'file': u'049881-009_PLUS7-D.flv',
+        u'info_dict': {
+            u'title': u'Mit offenen Karten',
+            u'description': u'md5:57929b0eaeddeb8a0c983f58e9ebd3b6',
+            u'upload_date': u'20131207',
+        },
+        u'params': {
+            # rtmp download
+            u'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        """Find the player JSON URL via the page's script and extract from it."""
+        video_id, lang = self._extract_url_info(url)
+        # The URL path keyword stands in for the language code handed to the JSON extractor
+        if lang == 'folge':
+            lang = 'de'
+        elif lang == 'emission':
+            lang = 'fr'
+        webpage = self._download_webpage(url, video_id)
+        # The JSON URL is read from a script referenced by the 'visu_video_block' element
+        scriptElement = get_element_by_attribute('class', 'visu_video_block', webpage)
+        script_url = self._html_search_regex(r'src="(.*?)"', scriptElement, 'script url')
+        javascriptPlayerGenerator = self._download_webpage(script_url, video_id, 'Download javascript player generator')
+        json_url = self._search_regex(r"json_url=(.*)&rendering_place.*", javascriptPlayerGenerator, 'json url')
+        return self._extract_from_json_url(json_url, video_id, lang)
index 95c038003b431dc48ac3bb89dcc03f8aa39ea07f..bcccc0b7a54c8b03b84a3e1303672509577faa66 100644 (file)
@@ -16,7 +16,7 @@ class AUEngineIE(InfoExtractor):
             u"title": u"[Commie]The Legend of the Legendary Heroes - 03 - Replication Eye (Alpha Stigma)[F9410F5A]"
         }
     }
-    _VALID_URL = r'(?:http://)?(?:www\.)?auengine\.com/embed.php\?.*?file=([^&]+).*?'
+    _VALID_URL = r'(?:http://)?(?:www\.)?auengine\.com/embed\.php\?.*?file=([^&]+).*?'
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
index b80508efed09a7ccece8e6980706e7083d3b96e9..d48c0c38d0ecfc787ce364e015d5a53260b922d4 100644 (file)
@@ -54,7 +54,7 @@ class BambuserIE(InfoExtractor):
 
 class BambuserChannelIE(InfoExtractor):
     IE_NAME = u'bambuser:channel'
-    _VALID_URL = r'http://bambuser.com/channel/(?P<user>.*?)(?:/|#|\?|$)'
+    _VALID_URL = r'https?://bambuser\.com/channel/(?P<user>.*?)(?:/|#|\?|$)'
     # The maximum number we can get with each request
     _STEP = 50
 
index 493504f75082f7b7605121acbfd88dbb621e84fb..5e33a69df42fcbaa1b17f1737d66f5841ca50318 100644 (file)
@@ -51,8 +51,7 @@ class BlipTVIE(InfoExtractor):
             url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
         urlp = compat_urllib_parse_urlparse(url)
         if urlp.path.startswith('/play/'):
-            request = compat_urllib_request.Request(url)
-            response = compat_urllib_request.urlopen(request)
+            response = self._request_webpage(url, None, False)
             redirecturl = response.geturl()
             rurlp = compat_urllib_parse_urlparse(redirecturl)
             file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
@@ -69,25 +68,23 @@ class BlipTVIE(InfoExtractor):
         request.add_header('User-Agent', 'iTunes/10.6.1')
         self.report_extraction(mobj.group(1))
         info = None
-        try:
-            urlh = compat_urllib_request.urlopen(request)
-            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
-                basename = url.split('/')[-1]
-                title,ext = os.path.splitext(basename)
-                title = title.decode('UTF-8')
-                ext = ext.replace('.', '')
-                self.report_direct_download(title)
-                info = {
-                    'id': title,
-                    'url': url,
-                    'uploader': None,
-                    'upload_date': None,
-                    'title': title,
-                    'ext': ext,
-                    'urlhandle': urlh
-                }
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
+        urlh = self._request_webpage(request, None, False,
+            u'unable to download video info webpage')
+        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
+            basename = url.split('/')[-1]
+            title,ext = os.path.splitext(basename)
+            title = title.decode('UTF-8')
+            ext = ext.replace('.', '')
+            self.report_direct_download(title)
+            info = {
+                'id': title,
+                'url': url,
+                'uploader': None,
+                'upload_date': None,
+                'title': title,
+                'ext': ext,
+                'urlhandle': urlh
+            }
         if info is None: # Regular URL
             try:
                 json_code_bytes = urlh.read()
index 3666a780b9209da0125d319e3851f40a05bc4e4f..755d9c9ef2a093289df91409097320908ea06df7 100644 (file)
@@ -4,7 +4,7 @@ from .common import InfoExtractor
 
 
 class BloombergIE(InfoExtractor):
-    _VALID_URL = r'https?://www\.bloomberg\.com/video/(?P<name>.+?).html'
+    _VALID_URL = r'https?://www\.bloomberg\.com/video/(?P<name>.+?)\.html'
 
     _TEST = {
         u'url': u'http://www.bloomberg.com/video/shah-s-presentation-on-foreign-exchange-strategies-qurhIVlJSB6hzkVi229d8g.html',
index 0d18e9a7a39e98a44bb934a3c5f12b00273b91f3..43efb08bfc33accf5661bf0afa3d59aeb1bb0c0e 100644 (file)
@@ -17,7 +17,8 @@ class ClipfishIE(InfoExtractor):
         u'info_dict': {
             u'title': u'FIFA 14 - E3 2013 Trailer',
             u'duration': 82,
-        }
+        },
+        u'skip': 'Blocked in the US'
     }
 
     def _real_extract(self, url):
index 23647f99eec075af82c9b099d52323c38814bf4c..a54ce3ee7c44727a9e56b1ab8359bd099b48bb35 100644 (file)
@@ -1,7 +1,7 @@
 import re
 
 from .common import InfoExtractor
-from .mtv import MTVIE, _media_xml_tag
+from .mtv import MTVServicesInfoExtractor
 from ..utils import (
     compat_str,
     compat_urllib_parse,
@@ -11,8 +11,8 @@ from ..utils import (
 )
 
 
-class ComedyCentralIE(MTVIE):
-    _VALID_URL = r'http://www.comedycentral.com/(video-clips|episodes|cc-studios)/(?P<title>.*)'
+class ComedyCentralIE(MTVServicesInfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?comedycentral\.com/(video-clips|episodes|cc-studios)/(?P<title>.*)'
     _FEED_URL = u'http://comedycentral.com/feeds/mrss/'
 
     _TEST = {
@@ -25,12 +25,6 @@ class ComedyCentralIE(MTVIE):
             u'description': u'After a certain point, breastfeeding becomes c**kblocking.',
         },
     }
-    # Overwrite MTVIE properties we don't want
-    _TESTS = []
-
-    def _get_thumbnail_url(self, uri, itemdoc):
-        search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail'))
-        return itemdoc.find(search_path).attrib['url']
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
@@ -197,7 +191,7 @@ class ComedyCentralShowsIE(InfoExtractor):
                 })
 
             effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
-            info = {
+            results.append({
                 'id': shortMediaId,
                 'formats': formats,
                 'uploader': showId,
@@ -205,11 +199,6 @@ class ComedyCentralShowsIE(InfoExtractor):
                 'title': effTitle,
                 'thumbnail': None,
                 'description': compat_str(officialTitle),
-            }
-
-            # TODO: Remove when #980 has been merged
-            info.update(info['formats'][-1])
-
-            results.append(info)
+            })
 
         return results
index 4f1b50880f73a7715a886e18122e40ca2f79023f..534908a2b89af8db08f5d7ba0fc7f983c1199fa9 100644 (file)
@@ -55,6 +55,9 @@ class InfoExtractor(object):
     subtitles:      The subtitle file contents as a dictionary in the format
                     {language: subtitles}.
     view_count:     How many users have watched the video on the platform.
+    like_count:     Number of positive ratings of the video
+    dislike_count:  Number of negative ratings of the video
+    comment_count:  Number of comments on the video
     urlhandle:      [internal] The urlHandle to be used to download the file,
                     like returned by urllib.request.urlopen
     age_limit:      Age restriction for the video, as an integer (years)
@@ -151,27 +154,38 @@ class InfoExtractor(object):
     def IE_NAME(self):
         return type(self).__name__[:-2]
 
-    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
+    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
         """ Returns the response handle """
         if note is None:
             self.report_download_webpage(video_id)
         elif note is not False:
-            self.to_screen(u'%s: %s' % (video_id, note))
+            if video_id is None:
+                self.to_screen(u'%s' % (note,))
+            else:
+                self.to_screen(u'%s: %s' % (video_id, note))
         try:
             return self._downloader.urlopen(url_or_request)
         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
             if errnote is None:
                 errnote = u'Unable to download webpage'
-            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)
+            errmsg = u'%s: %s' % (errnote, compat_str(err))
+            if fatal:
+                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
+            else:
+                self._downloader.report_warning(errmsg)
+                return False
 
-    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
+    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
         """ Returns a tuple (page content as string, URL handle) """
 
         # Strip hashes from the URL (#1038)
         if isinstance(url_or_request, (compat_str, str)):
             url_or_request = url_or_request.partition('#')[0]
 
-        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
+        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
+        if urlh is False:
+            assert not fatal
+            return False
         content_type = urlh.headers.get('Content-Type', '')
         webpage_bytes = urlh.read()
         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
@@ -206,9 +220,14 @@ class InfoExtractor(object):
         content = webpage_bytes.decode(encoding, 'replace')
         return (content, urlh)
 
-    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
+    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
         """ Returns the data of the page as a string """
-        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
+        res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
+        if res is False:
+            return res
+        else:
+            content, _ = res
+            return content
 
     def _download_xml(self, url_or_request, video_id,
                       note=u'Downloading XML', errnote=u'Unable to download XML'):
@@ -364,7 +383,8 @@ class InfoExtractor(object):
         if display_name is None:
             display_name = name
         return self._html_search_regex(
-            r'''(?ix)<meta(?=[^>]+(?:name|property)=["\']%s["\'])
+            r'''(?ix)<meta
+                    (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
                     [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
             html, display_name, fatal=False)
 
index 7bf03c584c7388b162c9b3912a4aa0f410ed5b22..d5730684dc497b37d7ff57098f5a156ff620e40e 100644 (file)
@@ -6,7 +6,7 @@ from ..utils import (
 )
 
 class CSpanIE(InfoExtractor):
-    _VALID_URL = r'http://www.c-spanvideo.org/program/(.*)'
+    _VALID_URL = r'http://www\.c-spanvideo\.org/program/(.*)'
     _TEST = {
         u'url': u'http://www.c-spanvideo.org/program/HolderonV',
         u'file': u'315139.flv',
index 71f5e03eea393b7733bf3bfeb4f2eeea5b21eb85..3bd0b862c6551c8f40207f62db2daf964621db47 100644 (file)
@@ -11,6 +11,7 @@ from ..utils import (
     get_element_by_attribute,
     get_element_by_id,
     orderedSet,
+    str_to_int,
 
     ExtractorError,
 )
@@ -146,6 +147,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
             self._list_available_subtitles(video_id, webpage)
             return
 
+        view_count = str_to_int(self._search_regex(
+            r'video_views_value[^>]+>([\d\.,]+)<', webpage, u'view count'))
+
         return {
             'id':       video_id,
             'formats': formats,
@@ -155,6 +159,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
             'subtitles':    video_subtitles,
             'thumbnail': info['thumbnail_url'],
             'age_limit': age_limit,
+            'view_count': view_count,
         }
 
     def _get_available_subtitles(self, video_id, webpage):
index 3d1dcb793627cb2d642c974f689c130faffe9ff1..d418ce4a8a29c122e811c96aac76d388c790b560 100644 (file)
@@ -28,7 +28,8 @@ class DaumIE(InfoExtractor):
         video_id = mobj.group(1)
         canonical_url = 'http://tvpot.daum.net/v/%s' % video_id
         webpage = self._download_webpage(canonical_url, video_id)
-        full_id = self._search_regex(r'<link rel="video_src" href=".+?vid=(.+?)"',
+        full_id = self._search_regex(
+            r'<iframe src="http://videofarm\.daum\.net/controller/video/viewer/Video\.html\?.*?vid=(.+?)[&"]',
             webpage, u'full id')
         query = compat_urllib_parse.urlencode({'vid': full_id})
         info = self._download_xml(
@@ -56,7 +57,7 @@ class DaumIE(InfoExtractor):
                 'format_id': profile,
             })
 
-        info = {
+        return {
             'id': video_id,
             'title': info.find('TITLE').text,
             'formats': formats,
@@ -65,6 +66,3 @@ class DaumIE(InfoExtractor):
             'duration': int(info.find('DURATION').text),
             'upload_date': info.find('REGDTTM').text[:8],
         }
-        # TODO: Remove when #980 has been merged
-        info.update(formats[-1])
-        return info
index 3cb382e1258580d67039fb7ed32c2072bcb79c04..cb7226f82a6af167569286918a56cce64e796150 100644 (file)
@@ -11,7 +11,7 @@ from ..utils import (
 
 class DreiSatIE(InfoExtractor):
     IE_NAME = '3sat'
-    _VALID_URL = r'(?:http://)?(?:www\.)?3sat.de/mediathek/index.php\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$'
+    _VALID_URL = r'(?:http://)?(?:www\.)?3sat\.de/mediathek/index\.php\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$'
     _TEST = {
         u"url": u"http://www.3sat.de/mediathek/index.php?obj=36983",
         u'file': u'36983.webm',
@@ -65,7 +65,7 @@ class DreiSatIE(InfoExtractor):
             return (qidx, prefer_http, format['video_bitrate'])
         formats.sort(key=_sortkey)
 
-        info = {
+        return {
             '_type': 'video',
             'id': video_id,
             'title': video_title,
@@ -76,8 +76,3 @@ class DreiSatIE(InfoExtractor):
             'uploader': video_uploader,
             'upload_date': upload_date,
         }
-
-        # TODO: Remove when #980 has been merged
-        info.update(formats[-1])
-
-        return info
index f21ef88530d2f8913b4b35d9c03fc4fc14de7ddc..88f5526b8a59491cc6cd40b48fe9451b3fc2d12b 100644 (file)
@@ -10,7 +10,7 @@ from ..utils import (
 
 class EightTracksIE(InfoExtractor):
     IE_NAME = '8tracks'
-    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
+    _VALID_URL = r'https?://8tracks\.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
     _TEST = {
         u"name": u"EightTracks",
         u"url": u"http://8tracks.com/ytdl/youtube-dl-test-tracks-a",
index a51d79b08c656144c3f67d853fcae8fe52bc6e1f..682901d16227e088e203bd01656db21cc2f70dda 100644 (file)
@@ -8,7 +8,7 @@ class ExfmIE(InfoExtractor):
     IE_NAME = u'exfm'
     IE_DESC = u'ex.fm'
     _VALID_URL = r'(?:http://)?(?:www\.)?ex\.fm/song/([^/]+)'
-    _SOUNDCLOUD_URL = r'(?:http://)?(?:www\.)?api\.soundcloud.com/tracks/([^/]+)/stream'
+    _SOUNDCLOUD_URL = r'(?:http://)?(?:www\.)?api\.soundcloud\.com/tracks/([^/]+)/stream'
     _TESTS = [
         {
             u'url': u'http://ex.fm/song/eh359',
index c0169de048fce6910aaac3da53daabefa1af2969..c6ab6952e84dc9074816f28ebb7fe6d8ce02cb47 100644 (file)
@@ -9,7 +9,7 @@ from ..utils import (
 
 class FazIE(InfoExtractor):
     IE_NAME = u'faz.net'
-    _VALID_URL = r'https?://www\.faz\.net/multimedia/videos/.*?-(?P<id>\d+).html'
+    _VALID_URL = r'https?://www\.faz\.net/multimedia/videos/.*?-(?P<id>\d+)\.html'
 
     _TEST = {
         u'url': u'http://www.faz.net/multimedia/videos/stockholm-chemie-nobelpreis-fuer-drei-amerikanische-forscher-12610585.html',
@@ -44,13 +44,10 @@ class FazIE(InfoExtractor):
             })
 
         descr = self._html_search_regex(r'<p class="Content Copy">(.*?)</p>', webpage, u'description')
-        info = {
+        return {
             'id': video_id,
             'title': self._og_search_title(webpage),
             'formats': formats,
             'description': descr,
             'thumbnail': config.find('STILL/STILL_BIG').text,
         }
-        # TODO: Remove when #980 has been merged
-        info.update(formats[-1])
-        return info
index dba1a8dc262979b5afce987211bab2f14e502dba..d7048c8c1ae7e6ba149552a7b32ec2ab42c8a3f2 100644 (file)
@@ -12,7 +12,7 @@ from ..utils import (
 
 class FKTVIE(InfoExtractor):
     IE_NAME = u'fernsehkritik.tv'
-    _VALID_URL = r'(?:http://)?(?:www\.)?fernsehkritik.tv/folge-(?P<ep>[0-9]+)(?:/.*)?'
+    _VALID_URL = r'(?:http://)?(?:www\.)?fernsehkritik\.tv/folge-(?P<ep>[0-9]+)(?:/.*)?'
 
     _TEST = {
         u'url': u'http://fernsehkritik.tv/folge-1',
@@ -52,7 +52,7 @@ class FKTVIE(InfoExtractor):
 
 class FKTVPosteckeIE(InfoExtractor):
     IE_NAME = u'fernsehkritik.tv:postecke'
-    _VALID_URL = r'(?:http://)?(?:www\.)?fernsehkritik.tv/inline-video/postecke.php\?(.*&)?ep=(?P<ep>[0-9]+)(&|$)'
+    _VALID_URL = r'(?:http://)?(?:www\.)?fernsehkritik\.tv/inline-video/postecke\.php\?(.*&)?ep=(?P<ep>[0-9]+)(&|$)'
     _TEST = {
         u'url': u'http://fernsehkritik.tv/inline-video/postecke.php?iframe=true&width=625&height=440&ep=120',
         u'file': u'0120.flv',
index 6e1971043b3853b9fe54e682473a61621c9989e2..ad85bc16d7796cfcf42331a05bb0392e773f70c5 100644 (file)
@@ -21,7 +21,7 @@ class FranceTVBaseInfoExtractor(InfoExtractor):
         thumbnail_path = info.find('image').text
 
         return {'id': video_id,
-                'ext': 'mp4',
+                'ext': 'flv' if video_url.startswith('rtmp') else 'mp4',
                 'url': video_url,
                 'title': info.find('titre').text,
                 'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', thumbnail_path),
@@ -45,7 +45,7 @@ class PluzzIE(FranceTVBaseInfoExtractor):
 
 class FranceTvInfoIE(FranceTVBaseInfoExtractor):
     IE_NAME = u'francetvinfo.fr'
-    _VALID_URL = r'https?://www\.francetvinfo\.fr/replay.*/(?P<title>.+).html'
+    _VALID_URL = r'https?://www\.francetvinfo\.fr/replay.*/(?P<title>.+)\.html'
 
     _TEST = {
         u'url': u'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html',
@@ -66,35 +66,101 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):
         return self._extract_video(video_id)
 
 
-class France2IE(FranceTVBaseInfoExtractor):
-    IE_NAME = u'france2.fr'
-    _VALID_URL = r'''(?x)https?://www\.france2\.fr/
+class FranceTVIE(FranceTVBaseInfoExtractor):
+    IE_NAME = u'francetv'
+    IE_DESC = u'France 2, 3, 4, 5 and Ô'
+    _VALID_URL = r'''(?x)https?://www\.france[2345o]\.fr/
         (?:
-            emissions/.*?/videos/(?P<id>\d+)
-        |   emission/(?P<key>[^/?]+)
+            emissions/.*?/(videos|emissions)/(?P<id>[^/?]+)
+        |   (emissions?|jt)/(?P<key>[^/?]+)
         )'''
 
-    _TEST = {
-        u'url': u'http://www.france2.fr/emissions/13h15-le-samedi-le-dimanche/videos/75540104',
-        u'file': u'75540104.mp4',
-        u'info_dict': {
-            u'title': u'13h15, le samedi...',
-            u'description': u'md5:2e5b58ba7a2d3692b35c792be081a03d',
+    _TESTS = [
+        # france2
+        {
+            u'url': u'http://www.france2.fr/emissions/13h15-le-samedi-le-dimanche/videos/75540104',
+            u'file': u'75540104.mp4',
+            u'info_dict': {
+                u'title': u'13h15, le samedi...',
+                u'description': u'md5:2e5b58ba7a2d3692b35c792be081a03d',
+            },
+            u'params': {
+                # m3u8 download
+                u'skip_download': True,
+            },
         },
-        u'params': {
-            u'skip_download': True,
+        # france3
+        {
+            u'url': u'http://www.france3.fr/emissions/pieces-a-conviction/diffusions/13-11-2013_145575',
+            u'info_dict': {
+                u'id': u'000702326_CAPP_PicesconvictionExtrait313022013_120220131722_Au',
+                u'ext': u'flv',
+                u'title': u'Le scandale du prix des médicaments',
+                u'description': u'md5:1384089fbee2f04fc6c9de025ee2e9ce',
+            },
+            u'params': {
+                # rtmp download
+                u'skip_download': True,
+            },
         },
-    }
+        # france4
+        {
+            u'url': u'http://www.france4.fr/emissions/hero-corp/videos/rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4',
+            u'info_dict': {
+                u'id': u'rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4',
+                u'ext': u'flv',
+                u'title': u'Hero Corp Making of - Extrait 1',
+                u'description': u'md5:c87d54871b1790679aec1197e73d650a',
+            },
+            u'params': {
+                # rtmp download
+                u'skip_download': True,
+            },
+        },
+        # france5
+        {
+            u'url': u'http://www.france5.fr/emissions/c-a-dire/videos/92837968',
+            u'info_dict': {
+                u'id': u'92837968',
+                u'ext': u'mp4',
+                u'title': u'C à dire ?!',
+                u'description': u'md5:fb1db1cbad784dcce7c7a7bd177c8e2f',
+            },
+            u'params': {
+                # m3u8 download
+                u'skip_download': True,
+            },
+        },
+        # franceo
+        {
+            u'url': u'http://www.franceo.fr/jt/info-afrique/04-12-2013',
+            u'info_dict': {
+                u'id': u'92327925',
+                u'ext': u'mp4',
+                u'title': u'Infô-Afrique',
+                u'description': u'md5:ebf346da789428841bee0fd2a935ea55',
+            },
+            u'params': {
+                # m3u8 download
+                u'skip_download': True,
+            },
+            u'skip': u'The id changes frequently',
+        },
+    ]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         if mobj.group('key'):
             webpage = self._download_webpage(url, mobj.group('key'))
-            video_id = self._html_search_regex(
-                r'''(?x)<div\s+class="video-player">\s*
+            id_res = [
+                (r'''(?x)<div\s+class="video-player">\s*
                     <a\s+href="http://videos.francetv.fr/video/([0-9]+)"\s+
-                    class="francetv-video-player">''',
-                webpage, u'video ID')
+                    class="francetv-video-player">'''),
+                (r'<a id="player_direct" href="http://info\.francetelevisions'
+                 '\.fr/\?id-video=([^"/&]+)'),
+                (r'<a class="video" id="ftv_player_(.+?)"'),
+            ]
+            video_id = self._html_search_regex(id_res, webpage, u'video ID')
         else:
             video_id = mobj.group('id')
         return self._extract_video(video_id)
index c91669b0ebaeac6085ba010c242dc571b28e57e1..a3a5251fe5711173ccb3986c263994d560345bf8 100644 (file)
@@ -4,7 +4,7 @@ from .common import InfoExtractor
 
 
 class GamekingsIE(InfoExtractor):
-    _VALID_URL = r'http?://www\.gamekings\.tv/videos/(?P<name>[0-9a-z\-]+)'
+    _VALID_URL = r'http://www\.gamekings\.tv/videos/(?P<name>[0-9a-z\-]+)'
     _TEST = {
         u"url": u"http://www.gamekings.tv/videos/phoenix-wright-ace-attorney-dual-destinies-review/",
         u'file': u'20130811.mp4',
index 9645b00c3307a42ba48b66af599345ba80349a3d..26b7d2ae531f785bc3177af4029652c531d840da 100644 (file)
@@ -47,13 +47,10 @@ class GameSpotIE(InfoExtractor):
                 'format_id': q,
             })
 
-        info = {
+        return {
             'id': data_video['guid'],
             'title': compat_urllib_parse.unquote(data_video['title']),
             'formats': formats,
             'description': get_meta_content('description', webpage),
             'thumbnail': self._og_search_thumbnail(webpage),
         }
-        # TODO: Remove when #980 has been merged
-        info.update(formats[-1])
-        return info
index 3cc02d97e04aace34e0eb03cccab254f4927f77d..d82a5d4b2a30578298080f03a8bba5f502e48f20 100644 (file)
@@ -1,13 +1,10 @@
 import re
 
-from .mtv import MTVIE, _media_xml_tag
+from .mtv import MTVServicesInfoExtractor
 
-class GametrailersIE(MTVIE):
-    """
-    Gametrailers use the same videos system as MTVIE, it just changes the feed
-    url, where the uri is and the method to get the thumbnails.
-    """
-    _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
+
+class GametrailersIE(MTVServicesInfoExtractor):
+    _VALID_URL = r'http://www\.gametrailers\.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
     _TEST = {
         u'url': u'http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer',
         u'file': u'70e9a5d7-cf25-4a10-9104-6f3e7342ae0d.mp4',
@@ -17,15 +14,9 @@ class GametrailersIE(MTVIE):
             u'description': u'Faith is back!  Check out the World Premiere trailer for Mirror\'s Edge 2 straight from the EA Press Conference at E3 2013!',
         },
     }
-    # Overwrite MTVIE properties we don't want
-    _TESTS = []
 
     _FEED_URL = 'http://www.gametrailers.com/feeds/mrss'
 
-    def _get_thumbnail_url(self, uri, itemdoc):
-        search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail'))
-        return itemdoc.find(search_path).attrib['url']
-
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
index 37671430a99b66dea8339dfd986503f3cb57f59e..216e032186297b7b91a488fb1edd1421e3270b39 100644 (file)
@@ -169,8 +169,13 @@ class GenericIE(InfoExtractor):
         #   Site Name | Video Title
         #   Video Title - Tagline | Site Name
         # and so on and so forth; it's just not practical
-        video_title = self._html_search_regex(r'<title>(.*)</title>',
-            webpage, u'video title', default=u'video', flags=re.DOTALL)
+        video_title = self._html_search_regex(
+            r'(?s)<title>(.*?)</title>', webpage, u'video title',
+            default=u'video')
+
+        # video uploader is domain name
+        video_uploader = self._search_regex(
+            r'^(?:https?://)?([^/]*)/.*', url, u'video uploader')
 
         # Look for BrightCove:
         bc_url = BrightcoveIE._extract_brightcove_url(webpage)
@@ -188,13 +193,35 @@ class GenericIE(InfoExtractor):
 
         # Look for embedded YouTube player
         matches = re.findall(
-            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?youtube.com/embed/.+?)\1', webpage)
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?youtube\.com/embed/.+?)\1', webpage)
         if matches:
             urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Youtube')
                      for tuppl in matches]
             return self.playlist_result(
                 urlrs, playlist_id=video_id, playlist_title=video_title)
 
+        # Look for embedded Dailymotion player
+        matches = re.findall(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
+        if matches:
+            urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Dailymotion')
+                     for tuppl in matches]
+            return self.playlist_result(
+                urlrs, playlist_id=video_id, playlist_title=video_title)
+
+        # Look for embedded Wistia player
+        match = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
+        if match:
+            return {
+                '_type': 'url_transparent',
+                'url': unescapeHTML(match.group('url')),
+                'ie_key': 'Wistia',
+                'uploader': video_uploader,
+                'title': video_title,
+                'id': video_id,
+            }
+
         # Look for Bandcamp pages with custom domain
         mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
         if mobj is not None:
@@ -238,14 +265,9 @@ class GenericIE(InfoExtractor):
         # here's a fun little line of code for you:
         video_id = os.path.splitext(video_id)[0]
 
-        # video uploader is domain name
-        video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
-            url, u'video uploader')
-
         return {
             'id':       video_id,
             'url':      video_url,
             'uploader': video_uploader,
-            'upload_date':  None,
             'title':    video_title,
         }
index 3798118a7fc491f9b2437878cf9d99df1f05b5ec..0ee74fb38410a4acce1e15c7a9ce98d80409012e 100644 (file)
@@ -11,7 +11,7 @@ class HotNewHipHopIE(InfoExtractor):
         u'file': u'1435540.mp3',
         u'md5': u'2c2cd2f76ef11a9b3b581e8b232f3d96',
         u'info_dict': {
-            u"title": u"Freddie Gibbs - Lay It Down"
+            u"title": u'Freddie Gibbs "Lay It Down"'
         }
     }
 
index c52146f7d716dd02ba34230e9fbb7c4dfe5ac15d..57b79a3363484d2e1a663112d63c219e2895781f 100644 (file)
@@ -103,7 +103,7 @@ class IGNIE(InfoExtractor):
 class OneUPIE(IGNIE):
     """Extractor for 1up.com, it uses the ign videos system."""
 
-    _VALID_URL = r'https?://gamevideos.1up.com/(?P<type>video)/id/(?P<name_or_id>.+)'
+    _VALID_URL = r'https?://gamevideos\.1up\.com/(?P<type>video)/id/(?P<name_or_id>.+)'
     IE_NAME = '1up.com'
 
     _DESCRIPTION_RE = r'<div id="vid_summary">(.+?)</div>'
index d8e9712a7acd39db97c8a55b2551137ca0e56a41..6fb373db2ae1a1385efc267284f3bbbc5383836c 100644 (file)
@@ -21,7 +21,6 @@ class ImdbIE(InfoExtractor):
             u'ext': u'mp4',
             u'title': u'Ice Age: Continental Drift Trailer (No. 2) - IMDb',
             u'description': u'md5:9061c2219254e5d14e03c25c98e96a81',
-            u'duration': 151,
         }
     }
 
@@ -35,6 +34,7 @@ class ImdbIE(InfoExtractor):
             flags=re.MULTILINE)
         formats = []
         for f_id, f_path in available_formats:
+            f_path = f_path.strip()
             format_page = self._download_webpage(
                 compat_urlparse.urljoin(url, f_path),
                 u'Downloading info for %s format' % f_id)
@@ -46,7 +46,6 @@ class ImdbIE(InfoExtractor):
             formats.append({
                 'format_id': f_id,
                 'url': format_info['url'],
-                'height': int(info['titleObject']['encoding']['selected'][:-1]),
             })
 
         return {
@@ -55,5 +54,4 @@ class ImdbIE(InfoExtractor):
             'formats': formats,
             'description': descr,
             'thumbnail': format_info['slate'],
-            'duration': int(info['titleObject']['title']['duration_seconds']),
         }
index 213aac428451bfcb860585b26de0e1c43abc732d..660573d022d267b1dfbf0d7274083f5ae47e9953 100644 (file)
@@ -3,7 +3,7 @@ import re
 from .common import InfoExtractor
 
 class InstagramIE(InfoExtractor):
-    _VALID_URL = r'(?:http://)?instagram.com/p/(.*?)/'
+    _VALID_URL = r'(?:http://)?instagram\.com/p/(.*?)/'
     _TEST = {
         u'url': u'http://instagram.com/p/aye83DjauH/?foo=bar#abc',
         u'file': u'aye83DjauH.mp4',
index c7bb234fe9eec9bd848f2a19c4307722cc4bbca0..592c64e1de0a47299770ef838095abf1f0988bcc 100644 (file)
@@ -8,7 +8,7 @@ from ..utils import (
 )
 
 class JukeboxIE(InfoExtractor):
-    _VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P<video_id>[a-z0-9\-]+).html'
+    _VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P<video_id>[a-z0-9\-]+)\.html'
     _IFRAME = r'<iframe .*src="(?P<iframe>[^"]*)".*>'
     _VIDEO_URL = r'"config":{"file":"(?P<video_url>http:[^"]+[.](?P<video_ext>[^.?]+)[?]mdtk=[0-9]+)"'
     _TITLE = r'<h1 class="inline">(?P<title>[^<]+)</h1>.*<span id="infos_article_artist">(?P<artist>[^<]+)</span>'
index dd062a14e736ba84b3aacb9d3bf426bca4c8f86f..5ae57a77c65c84d559946a651a3612fab86c8535 100644 (file)
@@ -8,7 +8,7 @@ from ..utils import (
 
 class LiveLeakIE(InfoExtractor):
 
-    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
+    _VALID_URL = r'^(?:http://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
     IE_NAME = u'liveleak'
     _TEST = {
         u'url': u'http://www.liveleak.com/view?i=757_1364311680',
index 9bc35b115033ce641e4435ebb807c6e1c93c975e..1dcd1fb2de42894d80c494185caeb600540b02da 100644 (file)
@@ -11,7 +11,7 @@ from ..utils import (
 
 class LivestreamIE(InfoExtractor):
     IE_NAME = u'livestream'
-    _VALID_URL = r'http://new.livestream.com/.*?/(?P<event_name>.*?)(/videos/(?P<id>\d+))?/?$'
+    _VALID_URL = r'http://new\.livestream\.com/.*?/(?P<event_name>.*?)(/videos/(?P<id>\d+))?/?$'
     _TEST = {
         u'url': u'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370',
         u'file': u'4719370.mp4',
index 91480ba875d5fff781ce08a47c41a3824e94e910..bd044fb60220ed1664351172126ca499da806e1c 100644 (file)
@@ -1,11 +1,8 @@
 import re
-import socket
 
 from .common import InfoExtractor
 from ..utils import (
-    compat_http_client,
     compat_parse_qs,
-    compat_urllib_error,
     compat_urllib_parse,
     compat_urllib_request,
     compat_str,
@@ -69,6 +66,21 @@ class MetacafeIE(InfoExtractor):
             u'age_limit': 18,
         },
     },
+    # cbs video
+    {
+        u'url': u'http://www.metacafe.com/watch/cb-0rOxMBabDXN6/samsung_galaxy_note_2_samsungs_next_generation_phablet/',
+        u'info_dict': {
+            u'id': u'0rOxMBabDXN6',
+            u'ext': u'flv',
+            u'title': u'Samsung Galaxy Note 2: Samsung\'s next-generation phablet',
+            u'description': u'md5:54d49fac53d26d5a0aaeccd061ada09d',
+            u'duration': 129,
+        },
+        u'params': {
+            # rtmp download
+            u'skip_download': True,
+        },
+    },
     ]
 
 
@@ -78,12 +90,8 @@ class MetacafeIE(InfoExtractor):
 
     def _real_initialize(self):
         # Retrieve disclaimer
-        request = compat_urllib_request.Request(self._DISCLAIMER)
-        try:
-            self.report_disclaimer()
-            compat_urllib_request.urlopen(request).read()
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
+        self.report_disclaimer()
+        self._download_webpage(self._DISCLAIMER, None, False, u'Unable to retrieve disclaimer')
 
         # Confirm age
         disclaimer_form = {
@@ -92,11 +100,8 @@ class MetacafeIE(InfoExtractor):
             }
         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
         request.add_header('Content-Type', 'application/x-www-form-urlencoded')
-        try:
-            self.report_age_confirmation()
-            compat_urllib_request.urlopen(request).read()
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
+        self.report_age_confirmation()
+        self._download_webpage(request, None, False, u'Unable to confirm age')
 
     def _real_extract(self, url):
         # Extract id and simplified title from URL
@@ -106,10 +111,16 @@ class MetacafeIE(InfoExtractor):
 
         video_id = mobj.group(1)
 
-        # Check if video comes from YouTube
-        mobj2 = re.match(r'^yt-(.*)$', video_id)
-        if mobj2 is not None:
-            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
+        # the video may come from an external site
+        m_external = re.match('^(\w{2})-(.*)$', video_id)
+        if m_external is not None:
+            prefix, ext_id = m_external.groups()
+            # Check if video comes from YouTube
+            if prefix == 'yt':
+                return self.url_result('http://www.youtube.com/watch?v=%s' % ext_id, 'Youtube')
+            # CBS videos use theplatform.com
+            if prefix == 'cb':
+                return self.url_result('theplatform:%s' % ext_id, 'ThePlatform')
 
         # Retrieve video webpage to extract further information
         req = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
index 449138b569f80c97154ea79ac874617efc484a3f..6b95b4998852ac61d1061e0dcf6c3f442772fee2 100644 (file)
@@ -43,13 +43,10 @@ class MetacriticIE(InfoExtractor):
         description = self._html_search_regex(r'<b>Description:</b>(.*?)</p>',
             webpage, u'description', flags=re.DOTALL)
 
-        info = {
+        return {
             'id': video_id,
             'title': clip.find('title').text,
             'formats': formats,
             'description': description,
             'duration': int(clip.find('duration').text),
         }
-        # TODO: Remove when #980 has been merged
-        info.update(formats[-1])
-        return info
index e2baf44d7e15032022e6b304ace2bf8ef11a09b2..04fa3ac7ac7a3bc5ffa570d201ddfbb54762fc2b 100644 (file)
@@ -1,13 +1,10 @@
 import json
 import re
-import socket
 
 from .common import InfoExtractor
 from ..utils import (
-    compat_http_client,
-    compat_urllib_error,
-    compat_urllib_request,
     unified_strdate,
+    ExtractorError,
 )
 
 
@@ -31,9 +28,11 @@ class MixcloudIE(InfoExtractor):
         """Returns 1st active url from list"""
         for url in url_list:
             try:
-                compat_urllib_request.urlopen(url)
+                # We only want to know if the request succeed
+                # don't download the whole file
+                self._request_webpage(url, None, False)
                 return url
-            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
+            except ExtractorError:
                 url = None
 
         return None
index 42aee58befdbe9e0e73a72dfefddd30a7c7cbd81..6b3feb560768f96c4d5b3bb3adc0989ecf1c1d4f 100644 (file)
@@ -10,35 +10,8 @@ from ..utils import (
 def _media_xml_tag(tag):
     return '{http://search.yahoo.com/mrss/}%s' % tag
 
-class MTVIE(InfoExtractor):
-    _VALID_URL = r'^https?://(?:www\.)?mtv\.com/videos/.+?/(?P<videoid>[0-9]+)/[^/]+$'
-
-    _FEED_URL = 'http://www.mtv.com/player/embed/AS3/rss/'
-
-    _TESTS = [
-        {
-            u'url': u'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml',
-            u'file': u'853555.mp4',
-            u'md5': u'850f3f143316b1e71fa56a4edfd6e0f8',
-            u'info_dict': {
-                u'title': u'Taylor Swift - "Ours (VH1 Storytellers)"',
-                u'description': u'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.',
-            },
-        },
-        {
-            u'add_ie': ['Vevo'],
-            u'url': u'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml',
-            u'file': u'USCJY1331283.mp4',
-            u'md5': u'73b4e7fcadd88929292fe52c3ced8caf',
-            u'info_dict': {
-                u'title': u'Everything Has Changed',
-                u'upload_date': u'20130606',
-                u'uploader': u'Taylor Swift',
-            },
-            u'skip': u'VEVO is only available in some countries',
-        },
-    ]
 
+class MTVServicesInfoExtractor(InfoExtractor):
     @staticmethod
     def _id_from_uri(uri):
         return uri.split(':')[-1]
@@ -53,7 +26,12 @@ class MTVIE(InfoExtractor):
         return base + m.group('finalid')
 
     def _get_thumbnail_url(self, uri, itemdoc):
-        return 'http://mtv.mtvnimages.com/uri/' + uri
+        search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail'))
+        thumb_node = itemdoc.find(search_path)
+        if thumb_node is None:
+            return None
+        else:
+            return thumb_node.attrib['url']
 
     def _extract_video_formats(self, metadataXml):
         if '/error_country_block.swf' in metadataXml:
@@ -93,7 +71,7 @@ class MTVIE(InfoExtractor):
         else:
             description = None
 
-        info = {
+        return {
             'title': itemdoc.find('title').text,
             'formats': self._extract_video_formats(mediagen_page),
             'id': video_id,
@@ -101,11 +79,6 @@ class MTVIE(InfoExtractor):
             'description': description,
         }
 
-        # TODO: Remove when #980 has been merged
-        info.update(info['formats'][-1])
-
-        return info
-
     def _get_videos_info(self, uri):
         video_id = self._id_from_uri(uri)
         data = compat_urllib_parse.urlencode({'uri': uri})
@@ -113,6 +86,39 @@ class MTVIE(InfoExtractor):
                                          u'Downloading info')
         return [self._get_video_info(item) for item in idoc.findall('.//item')]
 
+
+class MTVIE(MTVServicesInfoExtractor):
+    _VALID_URL = r'^https?://(?:www\.)?mtv\.com/videos/.+?/(?P<videoid>[0-9]+)/[^/]+$'
+
+    _FEED_URL = 'http://www.mtv.com/player/embed/AS3/rss/'
+
+    _TESTS = [
+        {
+            u'url': u'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml',
+            u'file': u'853555.mp4',
+            u'md5': u'850f3f143316b1e71fa56a4edfd6e0f8',
+            u'info_dict': {
+                u'title': u'Taylor Swift - "Ours (VH1 Storytellers)"',
+                u'description': u'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.',
+            },
+        },
+        {
+            u'add_ie': ['Vevo'],
+            u'url': u'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml',
+            u'file': u'USCJY1331283.mp4',
+            u'md5': u'73b4e7fcadd88929292fe52c3ced8caf',
+            u'info_dict': {
+                u'title': u'Everything Has Changed',
+                u'upload_date': u'20130606',
+                u'uploader': u'Taylor Swift',
+            },
+            u'skip': u'VEVO is only available in some countries',
+        },
+    ]
+
+    def _get_thumbnail_url(self, uri, itemdoc):
+        return 'http://mtv.mtvnimages.com/uri/' + uri
+
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('videoid')
index 03e31ea1c9ed98fd59c72504dffb2fa37c80edb7..1772b7f9ae43c2eaef57a15a5b3df5d9e7244213 100644 (file)
@@ -9,7 +9,7 @@ from ..utils import (
 
 
 class MuzuTVIE(InfoExtractor):
-    _VALID_URL = r'https?://www.muzu.tv/(.+?)/(.+?)/(?P<id>\d+)'
+    _VALID_URL = r'https?://www\.muzu\.tv/(.+?)/(.+?)/(?P<id>\d+)'
     IE_NAME = u'muzu.tv'
 
     _TEST = {
index 0067bf134fb416596c5db6948060ede7881421fa..4becddee604b4ec60a7ffd44c0619a07d31c2514 100644 (file)
@@ -9,7 +9,7 @@ from ..utils import (
 
 
 class MySpassIE(InfoExtractor):
-    _VALID_URL = r'http://www.myspass.de/.*'
+    _VALID_URL = r'http://www\.myspass\.de/.*'
     _TEST = {
         u'url': u'http://www.myspass.de/myspass/shows/tvshows/absolute-mehrheit/Absolute-Mehrheit-vom-17022013-Die-Highlights-Teil-2--/11741/',
         u'file': u'11741.mp4',
index d290397c7874736948850038f0f7d0b0fc565128..c012ec0cfacb2afea6b395c5c87509f53ed58614 100644 (file)
@@ -56,7 +56,7 @@ class NaverIE(InfoExtractor):
                 'height': int(format_el.find('height').text),
             })
 
-        info = {
+        return {
             'id': video_id,
             'title': info.find('Subject').text,
             'formats': formats,
@@ -65,6 +65,3 @@ class NaverIE(InfoExtractor):
             'upload_date': info.find('WriteDate').text.replace('.', ''),
             'view_count': int(info.find('PlayCount').text),
         }
-        # TODO: Remove when #980 has been merged
-        info.update(formats[-1])
-        return info
diff --git a/youtube_dl/extractor/ninegag.py b/youtube_dl/extractor/ninegag.py
new file mode 100644 (file)
index 0000000..ea986c0
--- /dev/null
@@ -0,0 +1,43 @@
+import json
+import re
+
+from .common import InfoExtractor
+
+
+class NineGagIE(InfoExtractor):
+    IE_NAME = '9gag'
+    _VALID_URL = r'^https?://(?:www\.)?9gag\.tv/v/(?P<id>[0-9]+)'
+
+    _TEST = {
+        u"url": u"http://9gag.tv/v/1912",
+        u"file": u"1912.mp4",
+        u"info_dict": {
+            u"description": u"This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. (Thanks, Dino!)",
+            u"title": u"\"People Are Awesome 2013\" Is Absolutely Awesome"
+        },
+        u'add_ie': [u'Youtube']
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+        data_json = self._html_search_regex(r'''(?x)
+            <div\s*id="tv-video"\s*data-video-source="youtube"\s*
+                data-video-meta="([^"]+)"''', webpage, u'video metadata')
+
+        data = json.loads(data_json)
+
+        return {
+            '_type': 'url_transparent',
+            'url': data['youtubeVideoId'],
+            'ie_key': 'Youtube',
+            'id': video_id,
+            'title': data['title'],
+            'description': data['description'],
+            'view_count': int(data['view_count']),
+            'like_count': int(data['statistic']['like']),
+            'dislike_count': int(data['statistic']['dislike']),
+            'thumbnail': data['thumbnail_url'],
+        }
index cfca2a06352287038ff367e0f83fa67bd4cee782..b42eae89aca1bdc894e29a876d06e4c5d49564a0 100644 (file)
@@ -12,7 +12,7 @@ from ..utils import (
 )
 
 class ORFIE(InfoExtractor):
-    _VALID_URL = r'https?://tvthek.orf.at/(programs/.+?/episodes|topics/.+?)/(?P<id>\d+)'
+    _VALID_URL = r'https?://tvthek\.orf\.at/(programs/.+?/episodes|topics/.+?)/(?P<id>\d+)'
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
index 65462d867027b67f3cf8a26d6e6fa7a545471fe7..25f019231e8f98b49666f6d6a74400d494305b82 100644 (file)
@@ -5,7 +5,7 @@ from .common import InfoExtractor
 
 
 class PBSIE(InfoExtractor):
-    _VALID_URL = r'https?://video.pbs.org/video/(?P<id>\d+)/?'
+    _VALID_URL = r'https?://video\.pbs\.org/video/(?P<id>\d+)/?'
 
     _TEST = {
         u'url': u'http://video.pbs.org/video/2365006249/',
diff --git a/youtube_dl/extractor/pyvideo.py b/youtube_dl/extractor/pyvideo.py
new file mode 100644 (file)
index 0000000..3305459
--- /dev/null
@@ -0,0 +1,51 @@
+import re
+import os
+
+from .common import InfoExtractor
+
+
+class PyvideoIE(InfoExtractor):
+    _VALID_URL = r'(?:http://)?(?:www\.)?pyvideo\.org/video/(?P<id>\d+)/(.*)'
+    _TESTS = [{
+        u'url': u'http://pyvideo.org/video/1737/become-a-logging-expert-in-30-minutes',
+        u'file': u'24_4WWkSmNo.mp4',
+        u'md5': u'de317418c8bc76b1fd8633e4f32acbc6',
+        u'info_dict': {
+            u"title": u"Become a logging expert in 30 minutes",
+            u"description": u"md5:9665350d466c67fb5b1598de379021f7",
+            u"upload_date": u"20130320",
+            u"uploader": u"NextDayVideo",
+            u"uploader_id": u"NextDayVideo",
+        },
+        u'add_ie': ['Youtube'],
+    },
+    {
+        u'url': u'http://pyvideo.org/video/2542/gloriajw-spotifywitherikbernhardsson182m4v',
+        u'md5': u'5fe1c7e0a8aa5570330784c847ff6d12',
+        u'info_dict': {
+            u'id': u'2542',
+            u'ext': u'm4v',
+            u'title': u'Gloriajw-SpotifyWithErikBernhardsson182',
+        },
+    },
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        webpage = self._download_webpage(url, video_id)
+        m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', webpage)
+
+        if m_youtube is not None:
+            return self.url_result(m_youtube.group(1), 'Youtube')
+
+        title = self._html_search_regex(r'<div class="section">.*?<h3>([^>]+?)</h3>',
+            webpage, u'title', flags=re.DOTALL)
+        video_url = self._search_regex([r'<source src="(.*?)"',
+            r'<dt>Download</dt>.*?<a href="(.+?)"'],
+            webpage, u'video url', flags=re.DOTALL)
+        return {
+            'id': video_id,
+            'title': os.path.splitext(title)[0],
+            'url': video_url,
+        }
index 3bbda128e1a3881ffd0b7a81e6c45da128994db5..c2254ae8abdca2ab9dde2388fb2182b056ffd0e2 100644 (file)
@@ -30,7 +30,7 @@ class RedTubeIE(InfoExtractor):
             r'<source src="(.+?)" type="video/mp4">', webpage, u'video URL')
 
         video_title = self._html_search_regex(
-            r'<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
+            r'<h1 class="videoTitle[^"]*">(.+?)</h1>',
             webpage, u'title')
 
         # No self-labeling, but they describe themselves as
index a18034fe26411288bf13f49ea86ee617317fc780..e3e9bc07ffbf9cfbfb6a092f6f88583a31a012fb 100644 (file)
@@ -11,7 +11,7 @@ from ..utils import (
 
 
 class RutubeIE(InfoExtractor):
-    _VALID_URL = r'https?://rutube.ru/video/(?P<long_id>\w+)'
+    _VALID_URL = r'https?://rutube\.ru/video/(?P<long_id>\w+)'
 
     _TEST = {
         u'url': u'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/',
index f5003c7f91bc78d10a63d25604537e5e77f9fdb8..d68646d24bf80c31e31ec71c6d7a4fe872f8b033 100644 (file)
@@ -4,7 +4,7 @@ from .common import InfoExtractor
 
 
 class SlashdotIE(InfoExtractor):
-    _VALID_URL = r'https?://tv.slashdot.org/video/\?embed=(?P<id>.*?)(&|$)'
+    _VALID_URL = r'https?://tv\.slashdot\.org/video/\?embed=(?P<id>.*?)(&|$)'
 
     _TEST = {
         u'add_ie': ['Ooyala'],
diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py
new file mode 100644 (file)
index 0000000..4ea89bf
--- /dev/null
@@ -0,0 +1,356 @@
+# encoding: utf-8
+
+import re
+import json
+import hashlib
+import uuid
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_parse,
+    compat_urllib_request,
+    ExtractorError,
+)
+
+
+class SmotriIE(InfoExtractor):
+    IE_DESC = u'Smotri.com'
+    IE_NAME = u'smotri'
+    _VALID_URL = r'^https?://(?:www\.)?(?P<url>smotri\.com/video/view/\?id=(?P<videoid>v(?P<realvideoid>[0-9]+)[a-z0-9]{4}))'
+
+    _TESTS = [
+        # real video id 2610366
+        {
+            u'url': u'http://smotri.com/video/view/?id=v261036632ab',
+            u'file': u'v261036632ab.mp4',
+            u'md5': u'2a7b08249e6f5636557579c368040eb9',
+            u'info_dict': {
+                u'title': u'катастрофа с камер видеонаблюдения',
+                u'uploader': u'rbc2008',
+                u'uploader_id': u'rbc08',
+                u'upload_date': u'20131118',
+                u'description': u'катастрофа с камер видеонаблюдения, видео катастрофа с камер видеонаблюдения',
+                u'thumbnail': u'http://frame6.loadup.ru/8b/a9/2610366.3.3.jpg',
+            },
+        },
+        # real video id 57591
+        {
+            u'url': u'http://smotri.com/video/view/?id=v57591cb20',
+            u'file': u'v57591cb20.flv',
+            u'md5': u'830266dfc21f077eac5afd1883091bcd',
+            u'info_dict': {
+                u'title': u'test',
+                u'uploader': u'Support Photofile@photofile',
+                u'uploader_id': u'support-photofile',
+                u'upload_date': u'20070704',
+                u'description': u'test, видео test',
+                u'thumbnail': u'http://frame4.loadup.ru/03/ed/57591.2.3.jpg',
+            },
+        },
+        # video-password
+        {
+            u'url': u'http://smotri.com/video/view/?id=v1390466a13c',
+            u'file': u'v1390466a13c.mp4',
+            u'md5': u'f6331cef33cad65a0815ee482a54440b',
+            u'info_dict': {
+                u'title': u'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1',
+                u'uploader': u'timoxa40',
+                u'uploader_id': u'timoxa40',
+                u'upload_date': u'20100404',
+                u'thumbnail': u'http://frame7.loadup.ru/af/3f/1390466.3.3.jpg',
+                u'description': u'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1, видео TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1',
+            },
+            u'params': {
+                u'videopassword': u'qwerty',
+            },
+        },
+        # age limit + video-password
+        {
+            u'url': u'http://smotri.com/video/view/?id=v15408898bcf',
+            u'file': u'v15408898bcf.flv',
+            u'md5': u'91e909c9f0521adf5ee86fbe073aad70',
+            u'info_dict': {
+                u'title': u'этот ролик не покажут по ТВ',
+                u'uploader': u'zzxxx',
+                u'uploader_id': u'ueggb',
+                u'upload_date': u'20101001',
+                u'thumbnail': u'http://frame3.loadup.ru/75/75/1540889.1.3.jpg',
+                u'age_limit': 18,
+                u'description': u'этот ролик не покажут по ТВ, видео этот ролик не покажут по ТВ',
+            },
+            u'params': {
+                u'videopassword': u'333'
+            }
+        }
+    ]
+    
+    _SUCCESS = 0
+    _PASSWORD_NOT_VERIFIED = 1
+    _PASSWORD_DETECTED = 2
+    _VIDEO_NOT_FOUND = 3
+
+    def _search_meta(self, name, html, display_name=None):
+        """Look up the content of a <meta itemprop="NAME" content="..." /> tag.
+
+        name is matched literally (it is passed through re.escape);
+        display_name is the label handed to _html_search_regex and defaults
+        to name. The search is requested with fatal=False, and callers in
+        this class treat the result as possibly empty.
+        """
+        if display_name is None:
+            display_name = name
+        return self._html_search_regex(
+            r'<meta itemprop="%s" content="([^"]+)" />' % re.escape(name),
+            html, display_name, fatal=False)
+
+    def _real_extract(self, url):
+        """Extract a single smotri.com video.
+
+        The media URL comes from the vt.php JSON endpoint, which is queried
+        a second time with an md5-hashed password when it reports the video
+        as password protected. All other metadata is scraped from the video
+        web page, after confirming the age gate when one is shown.
+
+        Raises ExtractorError when the video does not exist, when a password
+        is required but missing or wrong, or on an unknown status value.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('videoid')
+        real_video_id = mobj.group('realvideoid')
+
+        # Download video JSON data
+        video_json_url = 'http://smotri.com/vt.php?id=%s' % real_video_id
+        video_json_page = self._download_webpage(video_json_url, video_id, u'Downloading video JSON')
+        video_json = json.loads(video_json_page)
+
+        status = video_json['status']
+        if status == self._VIDEO_NOT_FOUND:
+            raise ExtractorError(u'Video %s does not exist' % video_id, expected=True)
+        elif status == self._PASSWORD_DETECTED:
+            # The video is protected by a password, retry with video-password set
+            video_password = self._downloader.params.get('videopassword', None)
+            if not video_password:
+                raise ExtractorError(u'This video is protected by a password, use the --video-password option', expected=True)
+            video_json_url += '&md5pass=%s' % hashlib.md5(video_password.encode('utf-8')).hexdigest()
+            video_json_page = self._download_webpage(video_json_url, video_id, u'Downloading video JSON (video-password set)')
+            video_json = json.loads(video_json_page)
+            status = video_json['status']
+            if status == self._PASSWORD_NOT_VERIFIED:
+                raise ExtractorError(u'Video password is invalid', expected=True)
+
+        if status != self._SUCCESS:
+            raise ExtractorError(u'Unexpected status value %s' % status)
+
+        # Extract the URL of the video
+        video_url = video_json['file_data']
+
+        # Video JSON does not provide enough meta data
+        # We will extract some from the video web page instead
+        video_page_url = 'http://' + mobj.group('url')
+        video_page = self._download_webpage(video_page_url, video_id, u'Downloading video page')
+
+        # Adult content: the page shows a confirmation link that has to be
+        # followed before the real video page is served
+        adult_content = re.search(u'EroConfirmText">', video_page) is not None
+        if adult_content:
+            self.report_age_confirmation()
+            confirm_string = self._html_search_regex(
+                r'<a href="/video/view/\?id=%s&confirm=([^"]+)" title="[^"]+">' % video_id,
+                video_page, u'confirm string')
+            confirm_url = video_page_url + '&confirm=%s' % confirm_string
+            video_page = self._download_webpage(confirm_url, video_id, u'Downloading video page (age confirmed)')
+
+        # Extract the rest of meta data.
+        # _search_meta is a non-fatal lookup, so each result below is checked
+        # before string or regex operations are applied to it.
+        video_title = self._search_meta(u'name', video_page, u'title')
+        if not video_title:
+            video_title = video_url.rsplit('/', 1)[-1]
+
+        video_description = self._search_meta(u'description', video_page)
+        if video_description:
+            # Strip the site's boilerplate suffix and prefix
+            END_TEXT = u' на сайте Smotri.com'
+            if video_description.endswith(END_TEXT):
+                video_description = video_description[:-len(END_TEXT)]
+            START_TEXT = u'Смотреть онлайн ролик '
+            if video_description.startswith(START_TEXT):
+                video_description = video_description[len(START_TEXT):]
+        video_thumbnail = self._search_meta(u'thumbnail', video_page)
+
+        upload_date_str = self._search_meta(u'uploadDate', video_page, u'upload date')
+        upload_date_m = re.search(
+            r'(?P<year>\d{4})\.(?P<month>\d{2})\.(?P<day>\d{2})T',
+            upload_date_str or u'')
+        if upload_date_m:
+            video_upload_date = (
+                upload_date_m.group('year') +
+                upload_date_m.group('month') +
+                upload_date_m.group('day')
+            )
+        else:
+            video_upload_date = None
+
+        duration_str = self._search_meta(u'duration', video_page)
+        duration_m = re.search(
+            r'T(?P<hours>[0-9]{2})H(?P<minutes>[0-9]{2})M(?P<seconds>[0-9]{2})S',
+            duration_str or u'')
+        if duration_m:
+            video_duration = (
+                (int(duration_m.group('hours')) * 60 * 60) +
+                (int(duration_m.group('minutes')) * 60) +
+                int(duration_m.group('seconds'))
+            )
+        else:
+            video_duration = None
+
+        video_uploader = self._html_search_regex(
+            u'<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info[^"]+">(.*?)</a>',
+            video_page, u'uploader', fatal=False, flags=re.MULTILINE|re.DOTALL)
+
+        video_uploader_id = self._html_search_regex(
+            u'<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info\\(.*?\'([^\']+)\'\\);">',
+            video_page, u'uploader id', fatal=False, flags=re.MULTILINE|re.DOTALL)
+
+        video_view_count = self._html_search_regex(
+            u'Общее количество просмотров.*?<span class="Number">(\\d+)</span>',
+            video_page, u'view count', fatal=False, flags=re.MULTILINE|re.DOTALL)
+        if video_view_count is not None:
+            # The pattern only captures digits, so the conversion cannot fail
+            video_view_count = int(video_view_count)
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': video_title,
+            'thumbnail': video_thumbnail,
+            'description': video_description,
+            'uploader': video_uploader,
+            'upload_date': video_upload_date,
+            'uploader_id': video_uploader_id,
+            'duration': video_duration,
+            # Kept next to 'duration' for anything that reads the old key
+            'video_duration': video_duration,
+            'view_count': video_view_count,
+            'age_limit': 18 if adult_content else 0,
+            'video_page_url': video_page_url
+        }
+
+
+class SmotriCommunityIE(InfoExtractor):
+    """Playlist extractor for the videos of a smotri.com community."""
+
+    IE_DESC = u'Smotri.com community videos'
+    IE_NAME = u'smotri:community'
+    _VALID_URL = r'^https?://(?:www\.)?smotri\.com/community/video/(?P<communityid>[0-9A-Za-z_\'-]+)'
+
+    def _real_extract(self, url):
+        """Build a playlist from the community's RSS export feed."""
+        community_id = re.match(self._VALID_URL, url).group('communityid')
+
+        feed_url = 'http://smotri.com/export/rss/video/by/community/-/%s/video.xml' % community_id
+        feed = self._download_xml(feed_url, community_id, u'Downloading community RSS')
+
+        # Hand every <item><link> URL over to the Smotri extractor
+        entries = []
+        for link in feed.findall('./channel/item/link'):
+            entries.append(self.url_result(link.text, 'Smotri'))
+
+        # The playlist title is the quoted part of the channel description
+        channel_description = feed.find('./channel/description').text
+        title = self._html_search_regex(
+            u'^Видео сообщества "([^"]+)"$', channel_description, u'community title')
+
+        return self.playlist_result(entries, community_id, title)
+
+
+class SmotriUserIE(InfoExtractor):
+    """Playlist extractor for the videos of a smotri.com user."""
+
+    IE_DESC = u'Smotri.com user videos'
+    IE_NAME = u'smotri:user'
+    _VALID_URL = r'^https?://(?:www\.)?smotri\.com/user/(?P<userid>[0-9A-Za-z_\'-]+)'
+
+    def _real_extract(self, url):
+        """Build a playlist from the user's RSS export feed."""
+        user_id = re.match(self._VALID_URL, url).group('userid')
+
+        feed_url = 'http://smotri.com/export/rss/user/video/-/%s/video.xml' % user_id
+        feed = self._download_xml(feed_url, user_id, u'Downloading user RSS')
+
+        # Hand every <item><link> URL over to the Smotri extractor
+        entries = []
+        for link in feed.findall('./channel/item/link'):
+            entries.append(self.url_result(link.text, 'Smotri'))
+
+        # The playlist title is whatever follows the fixed description prefix
+        channel_description = feed.find('./channel/description').text
+        nickname = self._html_search_regex(
+            u'^Видео режиссера (.*)$', channel_description,
+            u'user nickname')
+
+        return self.playlist_result(entries, user_id, nickname)
+
+
+class SmotriBroadcastIE(InfoExtractor):
+    IE_DESC = u'Smotri.com broadcasts'
+    IE_NAME = u'smotri:broadcast'
+    _VALID_URL = r'^https?://(?:www\.)?(?P<url>smotri\.com/live/(?P<broadcastid>[^/]+))/?.*'
+
+    def _real_extract(self, url):
+        """Extract a live smotri.com broadcast as an RTMP stream.
+
+        Adult broadcasts need account credentials: the login form is posted
+        together with the age confirmation. The stream parameters are read
+        from the broadcast JSON, which is requested with an md5-hashed
+        password when the videopassword parameter is set.
+
+        Raises ExtractorError when the broadcast does not exist, is offline,
+        needs credentials or a password that is missing or wrong, or when
+        the JSON lacks an expected field.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        broadcast_id = mobj.group('broadcastid')
+
+        broadcast_url = 'http://' + mobj.group('url')
+        broadcast_page = self._download_webpage(broadcast_url, broadcast_id, u'Downloading broadcast page')
+
+        # broadcast_id is arbitrary URL text; escape it so that it is matched
+        # literally instead of being interpreted as regex syntax
+        if re.search(u'>Режиссер с логином <br/>"%s"<br/> <span>не существует<' % re.escape(broadcast_id), broadcast_page) is not None:
+            raise ExtractorError(u'Broadcast %s does not exist' % broadcast_id, expected=True)
+
+        # Adult content
+        if re.search(u'EroConfirmText">', broadcast_page) is not None:
+
+            (username, password) = self._get_login_info()
+            if username is None:
+                raise ExtractorError(u'Erotic broadcasts allowed only for registered users, '
+                    u'use --username and --password options to provide account credentials.', expected=True)
+
+            # Log in
+            login_form_strs = {
+                u'login-hint53': '1',
+                u'confirm_erotic': '1',
+                u'login': username,
+                u'password': password,
+            }
+            # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
+            # chokes on unicode
+            login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
+            login_data = compat_urllib_parse.urlencode(login_form).encode('utf-8')
+            login_url = broadcast_url + '/?no_redirect=1'
+            request = compat_urllib_request.Request(login_url, login_data)
+            request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+            broadcast_page = self._download_webpage(
+                request, broadcast_id, note=u'Logging in and confirming age')
+
+            if re.search(u'>Неверный логин или пароль<', broadcast_page) is not None:
+                raise ExtractorError(u'Unable to log in: bad username or password', expected=True)
+
+            adult_content = True
+        else:
+            adult_content = False
+
+        # Backslashes are doubled throughout: this is not a raw literal, and a
+        # lone `\.` would be an invalid string escape sequence
+        ticket = self._html_search_regex(
+            u'window\\.broadcast_control\\.addFlashVar\\(\'file\', \'([^\']+)\'\\);',
+            broadcast_page, u'broadcast ticket')
+
+        url = 'http://smotri.com/broadcast/view/url/?ticket=%s' % ticket
+
+        broadcast_password = self._downloader.params.get('videopassword', None)
+        if broadcast_password:
+            url += '&pass=%s' % hashlib.md5(broadcast_password.encode('utf-8')).hexdigest()
+
+        broadcast_json_page = self._download_webpage(url, broadcast_id, u'Downloading broadcast JSON')
+        broadcast_json = json.loads(broadcast_json_page)
+
+        # Bound before the try block so that the KeyError handler can read it
+        # even when the '_pass_protected' lookup itself is what failed
+        protected_broadcast = False
+        try:
+            protected_broadcast = broadcast_json['_pass_protected'] == 1
+            if protected_broadcast and not broadcast_password:
+                raise ExtractorError(u'This broadcast is protected by a password, use the --video-password option', expected=True)
+
+            broadcast_offline = broadcast_json['is_play'] == 0
+            if broadcast_offline:
+                raise ExtractorError(u'Broadcast %s is offline' % broadcast_id, expected=True)
+
+            rtmp_url = broadcast_json['_server']
+            if not rtmp_url.startswith('rtmp://'):
+                raise ExtractorError(u'Unexpected broadcast rtmp URL')
+
+            broadcast_playpath = broadcast_json['_streamName']
+            broadcast_thumbnail = broadcast_json['_imgURL']
+            broadcast_title = broadcast_json['title']
+            broadcast_description = broadcast_json['description']
+            broadcaster_nick = broadcast_json['nick']
+            broadcaster_login = broadcast_json['login']
+            rtmp_conn = 'S:%s' % uuid.uuid4().hex
+        except KeyError:
+            # A protected broadcast whose stream fields are missing is
+            # reported as a wrong password
+            if protected_broadcast:
+                raise ExtractorError(u'Bad broadcast password', expected=True)
+            raise ExtractorError(u'Unexpected broadcast JSON')
+
+        return {
+            'id': broadcast_id,
+            'url': rtmp_url,
+            'title': broadcast_title,
+            'thumbnail': broadcast_thumbnail,
+            'description': broadcast_description,
+            'uploader': broadcaster_nick,
+            'uploader_id': broadcaster_login,
+            'age_limit': 18 if adult_content else 0,
+            'ext': 'flv',
+            'play_path': broadcast_playpath,
+            'rtmp_live': True,
+            'rtmp_conn': rtmp_conn
+        }
index 3a19ab17222831d87ffde4992e5712b01359e6eb..cb6dedab758aa93ce498e0d6d61a08ce15eeab0a 100644 (file)
@@ -25,7 +25,7 @@ class SoundcloudIE(InfoExtractor):
     _VALID_URL = r'''^(?:https?://)?
                     (?:(?:(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)/?(?:[?].*)?$)
                        |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+))
-                       |(?P<widget>w.soundcloud.com/player/?.*?url=.*)
+                       |(?P<widget>w\.soundcloud\.com/player/?.*?url=.*)
                     )
                     '''
     IE_NAME = u'soundcloud'
@@ -217,7 +217,7 @@ class SoundcloudSetIE(SoundcloudIE):
 
 
 class SoundcloudUserIE(SoundcloudIE):
-    _VALID_URL = r'https?://(www\.)?soundcloud.com/(?P<user>[^/]+)(/?(tracks/)?)?(\?.*)?$'
+    _VALID_URL = r'https?://(www\.)?soundcloud\.com/(?P<user>[^/]+)(/?(tracks/)?)?(\?.*)?$'
     IE_NAME = u'soundcloud:user'
 
     # it's in tests/test_playlists.py
index a711531e668bbc3ba32bfa3a93872c5f25ac73ab..fd90cc5dd18f966242d658df1e133456271c8ee3 100644 (file)
@@ -1,15 +1,14 @@
 import re
 
-from .mtv import MTVIE, _media_xml_tag
+from .mtv import MTVServicesInfoExtractor
 
 
-class SouthParkStudiosIE(MTVIE):
+class SouthParkStudiosIE(MTVServicesInfoExtractor):
     IE_NAME = u'southparkstudios.com'
     _VALID_URL = r'(https?://)?(www\.)?(?P<url>southparkstudios\.com/(clips|full-episodes)/(?P<id>.+?)(\?|#|$))'
 
     _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss'
 
-    # Overwrite MTVIE properties we don't want
     _TESTS = [{
         u'url': u'http://www.southparkstudios.com/clips/104437/bat-daded#tab=featured',
         u'file': u'a7bff6c2-ed00-11e0-aca6-0026b9414f30.mp4',
@@ -19,14 +18,6 @@ class SouthParkStudiosIE(MTVIE):
         },
     }]
 
-    def _get_thumbnail_url(self, uri, itemdoc):
-        search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail'))
-        thumb_node = itemdoc.find(search_path)
-        if thumb_node is None:
-            return None
-        else:
-            return thumb_node.attrib['url']
-
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         url = u'http://www.' + mobj.group(u'url')
index 0d32a068895e1c0a53cd23c61d6cdc233d82826d..11455e0fa212f3ab6ec2b9cb258f2824346a2862 100644 (file)
@@ -6,7 +6,7 @@ from ..utils import RegexNotFoundError, ExtractorError
 
 
 class SpaceIE(InfoExtractor):
-    _VALID_URL = r'https?://www\.space\.com/\d+-(?P<title>[^/\.\?]*?)-video.html'
+    _VALID_URL = r'https?://www\.space\.com/\d+-(?P<title>[^/\.\?]*?)-video\.html'
     _TEST = {
         u'add_ie': ['Brightcove'],
         u'url': u'http://www.space.com/23373-huge-martian-landforms-detail-revealed-by-european-probe-video.html',
index b27838bf9dc5ea430f01b054eb152d78fb946d0d..d0d0989f09bd154451cf45c04acd451e47db23da 100644 (file)
@@ -1,13 +1,8 @@
 import re
-import socket
-import xml.etree.ElementTree
 
 from .common import InfoExtractor
 from ..utils import (
-    compat_http_client,
     compat_str,
-    compat_urllib_error,
-    compat_urllib_request,
 
     ExtractorError,
     orderedSet,
@@ -18,7 +13,7 @@ from ..utils import (
 class StanfordOpenClassroomIE(InfoExtractor):
     IE_NAME = u'stanfordoc'
     IE_DESC = u'Stanford Open ClassRoom'
-    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
+    _VALID_URL = r'^(?:https?://)?openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
     _TEST = {
         u'url': u'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100',
         u'file': u'PracticalUnix_intro-environment.mp4',
@@ -45,11 +40,7 @@ class StanfordOpenClassroomIE(InfoExtractor):
             self.report_extraction(info['id'])
             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
             xmlUrl = baseUrl + video + '.xml'
-            try:
-                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
-            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
-            mdoc = xml.etree.ElementTree.fromstring(metaXml)
+            mdoc = self._download_xml(xmlUrl, info['id'])
             try:
                 info['title'] = mdoc.findall('./title')[0].text
                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
@@ -95,12 +86,9 @@ class StanfordOpenClassroomIE(InfoExtractor):
                 'upload_date': None,
             }
 
-            self.report_download_webpage(info['id'])
             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
-            try:
-                rootpage = compat_urllib_request.urlopen(rootURL).read()
-            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
+            rootpage = self._download_webpage(rootURL, info['id'],
+                errnote=u'Unable to download course info page')
 
             info['title'] = info['id']
 
index 772134a128e6f75d3a15d4fbb4ee37a776edfe10..2c5c88be8ede5ae6d0fa9f3c4e540cddb13190b6 100644 (file)
@@ -7,7 +7,7 @@ from .common import InfoExtractor
 
 class TF1IE(InfoExtractor):
     """TF1 uses the wat.tv player."""
-    _VALID_URL = r'http://videos.tf1.fr/.*-(.*?).html'
+    _VALID_URL = r'http://videos\.tf1\.fr/.*-(.*?)\.html'
     _TEST = {
         u'url': u'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',
         u'file': u'10635995.mp4',
diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py
new file mode 100644 (file)
index 0000000..61452e4
--- /dev/null
@@ -0,0 +1,68 @@
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+    xpath_with_ns,
+)
+
+def _x(p):
+    """Pass XPath p to xpath_with_ns with 'smil' bound to the SMIL 2.1 Language namespace URI."""
+    return xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language'})
+
+
+class ThePlatformIE(InfoExtractor):
+    _VALID_URL = r'(?:https?://link\.theplatform\.com/s/[^/]+/|theplatform:)(?P<id>[^/\?]+)'
+
+    _TEST = {
+        # from http://www.metacafe.com/watch/cb-e9I_cZgTgIPd/blackberrys_big_bold_z30/
+        u'url': u'http://link.theplatform.com/s/dJ5BDC/e9I_cZgTgIPd/meta.smil?format=smil&Tracking=true&mbr=true',
+        u'info_dict': {
+            u'id': u'e9I_cZgTgIPd',
+            u'ext': u'flv',
+            u'title': u'Blackberry\'s big, bold Z30',
+            u'description': u'The Z30 is Blackberry\'s biggest, baddest mobile messaging device yet.',
+            u'duration': 247,
+        },
+        u'params': {
+            # rtmp download
+            u'skip_download': True,
+        },
+    }
+
+    def _get_info(self, video_id):
+        """Assemble the info dict for video_id.
+
+        The formats are read from the SMIL manifest; title, description,
+        thumbnail and duration come from the JSON "preview" document.
+        """
+        manifest_url = ('http://link.theplatform.com/s/dJ5BDC/{0}/meta.smil?'
+            'format=smil&mbr=true'.format(video_id))
+        manifest = self._download_xml(manifest_url, video_id)
+        preview_url = 'http://link.theplatform.com/s/dJ5BDC/{0}?format=preview'.format(video_id)
+        preview = json.loads(self._download_webpage(preview_url, video_id))
+
+        # All renditions share the base URL taken from <head><meta base="...">
+        base_url = manifest.find(_x('smil:head')).find(_x('smil:meta')).attrib['base']
+        renditions = manifest.find(_x('smil:body')).find(_x('smil:switch'))
+
+        formats = [{
+            'url': base_url,
+            'play_path': 'mp4:' + video.attrib['src'],
+            'ext': 'flv',
+            'width': int(video.attrib['width']),
+            'height': int(video.attrib['height']),
+            'vbr': int(video.attrib['system-bitrate']),
+        } for video in renditions.findall(_x('smil:video'))]
+        # Ascending order by height, then width, then bitrate
+        formats.sort(key=lambda fmt: (fmt['height'], fmt['width'], fmt['vbr']))
+
+        return {
+            'id': video_id,
+            'title': preview['title'],
+            'formats': formats,
+            'description': preview['description'],
+            'thumbnail': preview['defaultThumbnailUrl'],
+            # presumably milliseconds in the preview JSON -- TODO confirm
+            'duration': preview['duration']//1000,
+        }
+        
+    def _real_extract(self, url):
+        """Take the media id from url and delegate to _get_info."""
+        video_id = re.match(self._VALID_URL, url).group('id')
+        return self._get_info(video_id)
index 1c49e580d19f65561f539b93d07d15eac4e9f0e4..d64aaa41f690956b08211ed4fe07e1bc27267641 100644 (file)
@@ -55,7 +55,7 @@ class TriluliluIE(InfoExtractor):
             for fnode in format_doc.findall('./formats/format')
         ]
 
-        info = {
+        return {
             '_type': 'video',
             'id': video_id,
             'formats': formats,
@@ -64,7 +64,3 @@ class TriluliluIE(InfoExtractor):
             'thumbnail': thumbnail,
         }
 
-        # TODO: Remove when #980 has been merged
-        info.update(formats[-1])
-
-        return info
index 516e18914e0f81a3fc0c7a137afaa487d63b7045..474610eec79483da01c14ca3e1d985b7aa8fd49a 100644 (file)
@@ -3,7 +3,7 @@ import re
 from .common import InfoExtractor
 
 class UnistraIE(InfoExtractor):
-    _VALID_URL = r'http://utv.unistra.fr/(?:index|video).php\?id_video\=(\d+)'
+    _VALID_URL = r'http://utv\.unistra\.fr/(?:index|video)\.php\?id_video\=(\d+)'
 
     _TEST = {
         u'url': u'http://utv.unistra.fr/video.php?id_video=154',
index 3a99a29c6520ba6824a9060264d678a5cf31e6e6..3cf8c853d2e466e00228d7eb3cb0f33d664beb9b 100644 (file)
@@ -9,7 +9,7 @@ from ..utils import (
 )
 
 class VeeHDIE(InfoExtractor):
-    _VALID_URL = r'https?://veehd.com/video/(?P<id>\d+)'
+    _VALID_URL = r'https?://veehd\.com/video/(?P<id>\d+)'
 
     _TEST = {
         u'url': u'http://veehd.com/video/4686958',
index 4378b17800f1df78275d68a9525ca95585dc8b9d..4823992ef40f9987a8ef39dde6200286f3bd40a6 100644 (file)
@@ -15,7 +15,7 @@ class VevoIE(InfoExtractor):
     Accepts urls from vevo.com or in the format 'vevo:{id}'
     (currently used by MTVIE)
     """
-    _VALID_URL = r'((http://www.vevo.com/watch/.*?/.*?/)|(vevo:))(?P<id>.*?)(\?|$)'
+    _VALID_URL = r'((http://www\.vevo\.com/watch/(?:[^/]+/[^/]+/)?)|(vevo:))(?P<id>.*?)(\?|$)'
     _TESTS = [{
         u'url': u'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280',
         u'file': u'GB1101300280.mp4',
@@ -24,7 +24,7 @@ class VevoIE(InfoExtractor):
             u"upload_date": u"20130624",
             u"uploader": u"Hurts",
             u"title": u"Somebody to Die For",
-            u"duration": 230,
+            u"duration": 230.12,
             u"width": 1920,
             u"height": 1080,
         }
index 6b93afa50f765a0be24265b3154ee8f670f68312..87812d6afa6db12558fbd5abd314f2046f29bdd4 100644 (file)
@@ -6,7 +6,7 @@ from ..utils import ExtractorError
 
 
 class ViceIE(InfoExtractor):
-    _VALID_URL = r'http://www.vice.com/.*?/(?P<name>.+)'
+    _VALID_URL = r'http://www\.vice\.com/.*?/(?P<name>.+)'
 
     _TEST = {
         u'url': u'http://www.vice.com/Fringes/cowboy-capitalists-part-1',
index 826804af37af54e308f90349e909d3e0e3aa5126..9328ef4a2121f091c256e9324d0de0e8b7dcbecd 100644 (file)
@@ -2,13 +2,10 @@ import json
 import re
 
 from .common import InfoExtractor
-from ..utils import (
-    determine_ext,
-)
 
 
 class ViddlerIE(InfoExtractor):
-    _VALID_URL = r'(?P<domain>https?://(?:www\.)?viddler.com)/(?:v|embed|player)/(?P<id>[a-z0-9]+)'
+    _VALID_URL = r'(?P<domain>https?://(?:www\.)?viddler\.com)/(?:v|embed|player)/(?P<id>[a-z0-9]+)'
     _TEST = {
         u"url": u"http://www.viddler.com/v/43903784",
         u'file': u'43903784.mp4',
@@ -47,7 +44,7 @@ class ViddlerIE(InfoExtractor):
             r"thumbnail\s*:\s*'([^']*)'",
             webpage, u'thumbnail', fatal=False)
 
-        info = {
+        return {
             '_type': 'video',
             'id': video_id,
             'title': title,
@@ -56,9 +53,3 @@ class ViddlerIE(InfoExtractor):
             'duration': duration,
             'formats': formats,
         }
-
-        # TODO: Remove when #980 has been merged
-        info['formats'][-1]['ext'] = determine_ext(info['formats'][-1]['url'])
-        info.update(info['formats'][-1])
-
-        return info
index 912802d9aa22082f2f39148db7920a6287c74ec6..f75169041b4f958b9f345daba99a4a1ba575cf4e 100644 (file)
@@ -7,7 +7,7 @@ from ..utils import (
 )
 
 class VideofyMeIE(InfoExtractor):
-    _VALID_URL = r'https?://(www.videofy.me/.+?|p.videofy.me/v)/(?P<id>\d+)(&|#|$)'
+    _VALID_URL = r'https?://(www\.videofy\.me/.+?|p\.videofy\.me/v)/(?P<id>\d+)(&|#|$)'
     IE_NAME = u'videofy.me'
 
     _TEST = {
index 7d82c2cfa84bd9b57b7ebc9eb35537b4033ba45d..fb2bd225ab0b3c21b16b9a717475cfc42232d4e5 100644 (file)
@@ -20,7 +20,7 @@ class VimeoIE(InfoExtractor):
     """Information extractor for vimeo.com."""
 
     # _VALID_URL matches Vimeo URLs
-    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|(?P<player>player))\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)/?(?:[?].*)?(?:#.*)?$'
+    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|(?P<player>player))\.)?vimeo(?P<pro>pro)?\.com/(?:.*?/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)/?(?:[?].*)?(?:#.*)?$'
     _NETRC_MACHINE = 'vimeo'
     IE_NAME = u'vimeo'
     _TESTS = [
@@ -196,6 +196,16 @@ class VimeoIE(InfoExtractor):
         if mobj is not None:
             video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
 
+        try:
+            view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, u'view count'))
+            like_count = int(self._search_regex(r'UserLikes:(\d+)', webpage, u'like count'))
+            comment_count = int(self._search_regex(r'UserComments:(\d+)', webpage, u'comment count'))
+        except RegexNotFoundError:
+            # This info is only available in vimeo.com/{id} urls
+            view_count = None
+            like_count = None
+            comment_count = None
+
         # Vimeo specific: extract request signature and timestamp
         sig = config['request']['signature']
         timestamp = config['request']['timestamp']
@@ -242,6 +252,9 @@ class VimeoIE(InfoExtractor):
             'description':  video_description,
             'formats': formats,
             'webpage_url': url,
+            'view_count': view_count,
+            'like_count': like_count,
+            'comment_count': comment_count,
         }
 
 
@@ -249,25 +262,77 @@ class VimeoChannelIE(InfoExtractor):
     IE_NAME = u'vimeo:channel'
     _VALID_URL = r'(?:https?://)?vimeo.\com/channels/(?P<id>[^/]+)'
     _MORE_PAGES_INDICATOR = r'<a.+?rel="next"'
+    _TITLE_RE = r'<link rel="alternate"[^>]+?title="(.*?)"'
 
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        channel_id =  mobj.group('id')
-        video_ids = []
+    def _page_url(self, base_url, pagenum):
+        return '%s/videos/page:%d/' % (base_url, pagenum)
 
+    def _extract_list_title(self, webpage):
+        return self._html_search_regex(self._TITLE_RE, webpage, u'list title')
+
+    def _extract_videos(self, list_id, base_url):
+        video_ids = []
         for pagenum in itertools.count(1):
-            webpage = self._download_webpage('http://vimeo.com/channels/%s/videos/page:%d' % (channel_id, pagenum),
-                                             channel_id, u'Downloading page %s' % pagenum)
+            webpage = self._download_webpage(
+                self._page_url(base_url, pagenum) ,list_id,
+                u'Downloading page %s' % pagenum)
             video_ids.extend(re.findall(r'id="clip_(\d+?)"', webpage))
             if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:
                 break
 
         entries = [self.url_result('http://vimeo.com/%s' % video_id, 'Vimeo')
                    for video_id in video_ids]
-        channel_title = self._html_search_regex(r'<a href="/channels/%s">(.*?)</a>' % channel_id,
-                                                webpage, u'channel title')
         return {'_type': 'playlist',
-                'id': channel_id,
-                'title': channel_title,
+                'id': list_id,
+                'title': self._extract_list_title(webpage),
                 'entries': entries,
                 }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        channel_id =  mobj.group('id')
+        return self._extract_videos(channel_id, 'http://vimeo.com/channels/%s' % channel_id)
+
+
+class VimeoUserIE(VimeoChannelIE):
+    IE_NAME = u'vimeo:user'
+    _VALID_URL = r'(?:https?://)?vimeo.\com/(?P<name>[^/]+)'
+    _TITLE_RE = r'<a[^>]+?class="user">([^<>]+?)</a>'
+
+    @classmethod
+    def suitable(cls, url):
+        if VimeoChannelIE.suitable(url) or VimeoIE.suitable(url) or VimeoAlbumIE.suitable(url) or VimeoGroupsIE.suitable(url):
+            return False
+        return super(VimeoUserIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        name = mobj.group('name')
+        return self._extract_videos(name, 'http://vimeo.com/%s' % name)
+
+
+class VimeoAlbumIE(VimeoChannelIE):
+    IE_NAME = u'vimeo:album'
+    _VALID_URL = r'(?:https?://)?vimeo.\com/album/(?P<id>\d+)'
+    _TITLE_RE = r'<header id="page_header">\n\s*<h1>(.*?)</h1>'
+
+    def _page_url(self, base_url, pagenum):
+        return '%s/page:%d/' % (base_url, pagenum)
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        album_id =  mobj.group('id')
+        return self._extract_videos(album_id, 'http://vimeo.com/album/%s' % album_id)
+
+
+class VimeoGroupsIE(VimeoAlbumIE):
+    IE_NAME = u'vimeo:group'
+    _VALID_URL = r'(?:https?://)?vimeo.\com/groups/(?P<name>[^/]+)'
+
+    def _extract_list_title(self, webpage):
+        return self._og_search_title(webpage)
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        name = mobj.group('name')
+        return self._extract_videos(name, 'http://vimeo.com/groups/%s' % name)
index 29c25f0e309c7d4179d1226ed0a079a0d17fcba6..4fab6c6e8511711047e3ba9143452397a0aca0fa 100644 (file)
@@ -11,7 +11,7 @@ from ..utils import (
 
 
 class WatIE(InfoExtractor):
-    _VALID_URL=r'http://www.wat.tv/.*-(?P<shortID>.*?)_.*?.html'
+    _VALID_URL=r'http://www\.wat\.tv/.*-(?P<shortID>.*?)_.*?\.html'
     IE_NAME = 'wat.tv'
     _TEST = {
         u'url': u'http://www.wat.tv/video/world-war-philadelphia-vost-6bv55_2fjr7_.html',
index b9c3b13f918f5fbffa73010cbefb46543dac9586..3635691e7a15cbed6957cc7757138f3179c318f3 100644 (file)
@@ -11,7 +11,8 @@ class WimpIE(InfoExtractor):
         u'file': u'deerfence.flv',
         u'md5': u'8b215e2e0168c6081a1cf84b2846a2b5',
         u'info_dict': {
-            u"title": u"Watch Till End: Herd of deer jump over a fence."
+            u"title": u"Watch Till End: Herd of deer jump over a fence.",
+            u"description": u"These deer look as fluid as running water when they jump over this fence as a herd. This video is one that needs to be watched until the very end for the true majesty to be witnessed, but once it comes, it's sure to take your breath away.",
         }
     }
 
@@ -19,18 +20,15 @@ class WimpIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group(1)
         webpage = self._download_webpage(url, video_id)
-        title = self._search_regex(r'<meta name="description" content="(.+?)" />',webpage, 'video title')
-        thumbnail_url = self._search_regex(r'<meta property="og\:image" content="(.+?)" />', webpage,'video thumbnail')
+        title = self._html_search_meta('description', webpage, u'video title')
         googleString = self._search_regex("googleCode = '(.*?)'", webpage, 'file url')
         googleString = base64.b64decode(googleString).decode('ascii')
         final_url = self._search_regex('","(.*?)"', googleString,'final video url')
-        ext = final_url.rpartition(u'.')[2]
-
-        return [{
-            'id':        video_id,
-            'url':       final_url,
-            'ext':       ext,
-            'title':     title,
-            'thumbnail': thumbnail_url,
-        }]
 
+        return {
+            'id': video_id,
+            'url': final_url,
+            'title': self._og_search_title(webpage),
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'description': self._og_search_description(webpage),
+        }
diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py
new file mode 100644 (file)
index 0000000..e1748c2
--- /dev/null
@@ -0,0 +1,55 @@
+import json
+import re
+
+from .common import InfoExtractor
+
+
+class WistiaIE(InfoExtractor):
+    _VALID_URL = r'^https?://(?:fast\.)?wistia\.net/embed/iframe/(?P<id>[a-z0-9]+)'
+
+    _TEST = {
+        u"url": u"http://fast.wistia.net/embed/iframe/sh7fpupwlt",
+        u"file": u"sh7fpupwlt.mov",
+        u"md5": u"cafeb56ec0c53c18c97405eecb3133df",
+        u"info_dict": {
+            u"title": u"cfh_resourceful_zdkh_final_1"
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+        data_json = self._html_search_regex(
+            r'Wistia.iframeInit\((.*?), {}\);', webpage, u'video data')
+
+        data = json.loads(data_json)
+
+        formats = []
+        thumbnails = []
+        for atype, a in data['assets'].items():
+            if atype == 'still':
+                thumbnails.append({
+                    'url': a['url'],
+                    'resolution': '%dx%d' % (a['width'], a['height']),
+                })
+                continue
+            if atype == 'preview':
+                continue
+            formats.append({
+                'format_id': atype,
+                'url': a['url'],
+                'width': a['width'],
+                'height': a['height'],
+                'filesize': a['size'],
+                'ext': a['ext'],
+            })
+        formats.sort(key=lambda a: a['filesize'])
+
+        return {
+            'id': video_id,
+            'title': data['name'],
+            'formats': formats,
+            'thumbnails': thumbnails,
+        }
index 7444d3393a25f8a49778a5bd589aa839591bd9d8..ef9997ee4456f4ec1aafdbcd915ae3b670ce1489 100644 (file)
@@ -46,7 +46,7 @@ class XHamsterIE(InfoExtractor):
                 return mobj.group('server')+'/key='+mobj.group('file')
 
         def is_hd(webpage):
-            return webpage.find('<div class=\'icon iconHD\'>') != -1
+            return webpage.find('<div class=\'icon iconHD\'') != -1
 
         mobj = re.match(self._VALID_URL, url)
 
index e457c4707a8feda7c3d0709c18671282b6da3814..5c9c361b9ee5658d307a7759040b855a3e794cf1 100644 (file)
@@ -47,7 +47,7 @@ class YahooIE(InfoExtractor):
         # The 'meta' field is not always in the video webpage, we request it
         # from another page
         long_id = info['id']
-        return self._get_info(info['id'], video_id)
+        return self._get_info(long_id, video_id)
 
     def _get_info(self, long_id, video_id):
         query = ('SELECT * FROM yahoo.media.video.streams WHERE id="%s"'
index 1fcc518acde9dbb08fef1ccb42a9ee7ae550967a..e971b5b4b3a32b801edb12efb429c8228417c307 100644 (file)
@@ -7,7 +7,7 @@ from ..utils import (
 
 
 class YouJizzIE(InfoExtractor):
-    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
+    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+)\.html$'
     _TEST = {
         u'url': u'http://www.youjizz.com/videos/zeichentrick-1-2189178.html',
         u'file': u'2189178.flv',
index a1a4d896debdf8fd7c38fa38d3ad95023e12302e..874429b78cc4917ca1cbbec7245c85436dd73783 100644 (file)
@@ -7,7 +7,6 @@ import itertools
 import json
 import os.path
 import re
-import socket
 import string
 import struct
 import traceback
@@ -17,9 +16,7 @@ from .common import InfoExtractor, SearchInfoExtractor
 from .subtitles import SubtitlesInfoExtractor
 from ..utils import (
     compat_chr,
-    compat_http_client,
     compat_parse_qs,
-    compat_urllib_error,
     compat_urllib_parse,
     compat_urllib_request,
     compat_urlparse,
@@ -45,19 +42,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
     # If True it will raise an error if no login info is provided
     _LOGIN_REQUIRED = False
 
-    def report_lang(self):
-        """Report attempt to set language."""
-        self.to_screen(u'Setting language')
-
     def _set_language(self):
-        request = compat_urllib_request.Request(self._LANG_URL)
-        try:
-            self.report_lang()
-            compat_urllib_request.urlopen(request).read()
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
-            return False
-        return True
+        return bool(self._download_webpage(
+            self._LANG_URL, None,
+            note=u'Setting language', errnote='unable to set language',
+            fatal=False))
 
     def _login(self):
         (username, password) = self._get_login_info()
@@ -67,12 +56,12 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
                 raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
             return False
 
-        request = compat_urllib_request.Request(self._LOGIN_URL)
-        try:
-            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
-            return False
+        login_page = self._download_webpage(
+            self._LOGIN_URL, None,
+            note=u'Downloading login page',
+            errnote=u'unable to fetch login page', fatal=False)
+        if login_page is False:
+            return
 
         galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                   login_page, u'Login GALX parameter')
@@ -102,29 +91,28 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
         # chokes on unicode
         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
-        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
-        try:
-            self.report_login()
-            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
-            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
-                self._downloader.report_warning(u'unable to log in: bad username or password')
-                return False
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
+
+        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
+        login_results = self._download_webpage(
+            req, None,
+            note=u'Logging in', errnote=u'unable to log in', fatal=False)
+        if login_results is False:
+            return False
+        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
+            self._downloader.report_warning(u'unable to log in: bad username or password')
             return False
         return True
 
     def _confirm_age(self):
         age_form = {
-                'next_url':     '/',
-                'action_confirm':   'Confirm',
-                }
-        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
-        try:
-            self.report_age_confirmation()
-            compat_urllib_request.urlopen(request).read().decode('utf-8')
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
+            'next_url': '/',
+            'action_confirm': 'Confirm',
+        }
+        req = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
+
+        self._download_webpage(
+            req, None,
+            note=u'Confirming age', errnote=u'Unable to confirm age')
         return True
 
     def _real_initialize(self):
@@ -336,7 +324,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                 u"uploader": u"Philipp Hagemeister",
                 u"uploader_id": u"phihag",
                 u"upload_date": u"20121002",
-                u"description": u"test chars:  \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
+                u"description": u"test chars:  \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
             }
         },
         {
@@ -388,10 +376,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
         super(YoutubeIE, self).__init__(*args, **kwargs)
         self._player_cache = {}
 
-    def report_video_webpage_download(self, video_id):
-        """Report attempt to download video webpage."""
-        self.to_screen(u'%s: Downloading video webpage' % video_id)
-
     def report_video_info_webpage_download(self, video_id):
         """Report attempt to download video info webpage."""
         self.to_screen(u'%s: Downloading video info webpage' % video_id)
@@ -1258,15 +1242,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
         video_id = self._extract_id(url)
 
         # Get video webpage
-        self.report_video_webpage_download(video_id)
         url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
-        request = compat_urllib_request.Request(url)
-        try:
-            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
-
-        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
+        video_webpage = self._download_webpage(url, video_id)
 
         # Attempt to extract SWF player URL
         mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
@@ -1366,6 +1343,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
         # description
         video_description = get_element_by_id("eow-description", video_webpage)
         if video_description:
+            video_description = re.sub(r'''(?x)
+                <a\s+
+                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
+                    title="([^"]+)"\s+
+                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
+                    class="yt-uix-redirect-link"\s*>
+                [^<]+
+                </a>
+            ''', r'\1', video_description)
             video_description = clean_html(video_description)
         else:
             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
@@ -1374,6 +1360,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
             else:
                 video_description = u''
 
+        def _extract_count(klass):
+            count = self._search_regex(r'class="%s">([\d,]+)</span>' % re.escape(klass), video_webpage, klass, fatal=False)
+            if count is not None:
+                return int(count.replace(',', ''))
+            return None
+        like_count = _extract_count(u'likes-count')
+        dislike_count = _extract_count(u'dislikes-count')
+
         # subtitles
         video_subtitles = self.extract_subtitles(video_id, video_webpage)
 
@@ -1506,6 +1500,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                 'annotations':  video_annotations,
                 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
                 'view_count': view_count,
+                'like_count': like_count,
+                'dislike_count': dislike_count,
             })
         return results
 
@@ -1520,10 +1516,10 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
                            \? (?:.*?&)*? (?:p|a|list)=
                         |  p/
                         )
-                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
+                        ((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,})
                         .*
                      |
-                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
+                        ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                      )"""
     _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
     _MORE_PAGES_INDICATOR = r'data-link-type="next"'
@@ -1545,7 +1541,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
     def _extract_mix(self, playlist_id):
         # The mixes are generated from a a single video
         # the id of the playlist is just 'RD' + video_id
-        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[2:], playlist_id)
+        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
         webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
         title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
             get_element_by_attribute('class', 'title ', webpage))
@@ -1573,7 +1569,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
             else:
                 self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))
 
-        if len(playlist_id) == 13:  # 'RD' + 11 characters for the video id
+        if playlist_id.startswith('RD'):
             # Mixes require a custom extraction process
             return self._extract_mix(playlist_id)
         if playlist_id.startswith('TL'):
@@ -1658,10 +1654,11 @@ class YoutubeChannelIE(InfoExtractor):
         video_ids = []
         url = 'https://www.youtube.com/channel/%s/videos' % channel_id
         channel_page = self._download_webpage(url, channel_id)
-        if re.search(r'channel-header-autogenerated-label', channel_page) is not None:
-            autogenerated = True
-        else:
-            autogenerated = False
+        autogenerated = re.search(r'''(?x)
+                class="[^"]*?(?:
+                    channel-header-autogenerated-label|
+                    yt-channel-title-autogenerated
+                )[^"]*"''', channel_page) is not None
 
         if autogenerated:
             # The videos are contained in a single page
@@ -1763,10 +1760,6 @@ class YoutubeSearchIE(SearchInfoExtractor):
     IE_NAME = u'youtube:search'
     _SEARCH_KEY = 'ytsearch'
 
-    def report_download_page(self, query, pagenum):
-        """Report attempt to download search page with given number."""
-        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
-
     def _get_n_results(self, query, n):
         """Get a specified number of results for a query"""
 
@@ -1775,16 +1768,15 @@ class YoutubeSearchIE(SearchInfoExtractor):
         limit = n
 
         while (50 * pagenum) < limit:
-            self.report_download_page(query, pagenum+1)
             result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
-            request = compat_urllib_request.Request(result_url)
-            try:
-                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
-            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
-            api_response = json.loads(data)['data']
-
-            if not 'items' in api_response:
+            data_json = self._download_webpage(
+                result_url, video_id=u'query "%s"' % query,
+                note=u'Downloading page %s' % (pagenum + 1),
+                errnote=u'Unable to download API page')
+            data = json.loads(data_json)
+            api_response = data['data']
+
+            if 'items' not in api_response:
                 raise ExtractorError(u'[youtube] No video results')
 
             new_ids = list(video['id'] for video in api_response['items'])
@@ -1800,6 +1792,7 @@ class YoutubeSearchIE(SearchInfoExtractor):
         return self.playlist_result(videos, query)
 
 class YoutubeSearchDateIE(YoutubeSearchIE):
+    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
     _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
     _SEARCH_KEY = 'ytsearchdate'
     IE_DESC = u'YouTube.com searches, newest videos first'
index c486ef8ecfef9772aaabdb3863a2814349a296b7..5ba06d965fdd0ca3ca2be9624948c359ce7c5bf5 100644 (file)
@@ -17,7 +17,6 @@ import ssl
 import socket
 import sys
 import traceback
-import xml.etree.ElementTree
 import zlib
 
 try:
@@ -548,7 +547,7 @@ def make_HTTPS_handler(opts_no_check_certificate):
 
             def connect(self):
                 sock = socket.create_connection((self.host, self.port), self.timeout)
-                if self._tunnel_host:
+                if getattr(self, '_tunnel_host', False):
                     self.sock = sock
                     self._tunnel()
                 try:
@@ -562,11 +561,14 @@ def make_HTTPS_handler(opts_no_check_certificate):
         return HTTPSHandlerV3()
     else:
         context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
-        context.set_default_verify_paths()
-        
         context.verify_mode = (ssl.CERT_NONE
                                if opts_no_check_certificate
                                else ssl.CERT_REQUIRED)
+        context.set_default_verify_paths()
+        try:
+            context.load_default_certs()
+        except AttributeError:
+            pass  # Python < 3.4
         return compat_urllib_request.HTTPSHandler(context=context)
 
 class ExtractorError(Exception):
@@ -1021,3 +1023,7 @@ def format_bytes(bytes):
     suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
     converted = float(bytes) / float(1024 ** exponent)
     return u'%.2f%s' % (converted, suffix)
+
+def str_to_int(int_str):
+    int_str = re.sub(r'[,\.]', u'', int_str)
+    return int(int_str)
index a73d7fb5ce54945fae85539adc0c747f490984d6..f7f658f49fd0f65b9f9ea6ca6aeba5fca3065a45 100644 (file)
@@ -1,2 +1,2 @@
 
-__version__ = '2013.11.29'
+__version__ = '2013.12.09.1'