Merge remote-tracking branch 'derrotebaron/master'
author Philipp Hagemeister <phihag@phihag.de>
Fri, 23 Jan 2015 00:32:52 +0000 (01:32 +0100)
committer Philipp Hagemeister <phihag@phihag.de>
Fri, 23 Jan 2015 00:32:52 +0000 (01:32 +0100)
33 files changed:
AUTHORS
README.md
test/test_YoutubeDL.py
test/test_all_urls.py
test/test_utils.py
youtube_dl/YoutubeDL.py
youtube_dl/extractor/__init__.py
youtube_dl/extractor/abc7news.py [new file with mode: 0644]
youtube_dl/extractor/atresplayer.py
youtube_dl/extractor/bandcamp.py
youtube_dl/extractor/cnn.py
youtube_dl/extractor/comedycentral.py
youtube_dl/extractor/fourtube.py
youtube_dl/extractor/generic.py
youtube_dl/extractor/hearthisat.py [new file with mode: 0644]
youtube_dl/extractor/lnkgo.py [new file with mode: 0644]
youtube_dl/extractor/nbc.py
youtube_dl/extractor/ndtv.py
youtube_dl/extractor/npo.py
youtube_dl/extractor/pornhub.py
youtube_dl/extractor/smotri.py
youtube_dl/extractor/spiegel.py
youtube_dl/extractor/streetvoice.py [new file with mode: 0644]
youtube_dl/extractor/testtube.py [new file with mode: 0644]
youtube_dl/extractor/tinypic.py
youtube_dl/extractor/tvp.py
youtube_dl/extractor/twitch.py
youtube_dl/extractor/videomega.py
youtube_dl/extractor/youtube.py
youtube_dl/options.py
youtube_dl/postprocessor/ffmpeg.py
youtube_dl/utils.py
youtube_dl/version.py

diff --git a/AUTHORS b/AUTHORS
index 35e256d496d0434d22d6304b55a9a6501bb5066f..b8bf3cb6f8db5c1e90be0f177b17c38618abcf0c 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -103,3 +103,4 @@ Christopher Krooss
 Ondřej Caletka
 Dinesh S
 Johan K. Jensen
+Yen Chi Hsuan
diff --git a/README.md b/README.md
index c26546f2a9d4869ee8b69241b7fc536d61bd6b65..1408ebba06ddd209ae936999d100d0e057f8941d 100644
--- a/README.md
+++ b/README.md
@@ -267,10 +267,22 @@ which means you can modify it, redistribute it or use it however you like.
                                      by extension for the extensions aac, m4a,
                                      mp3, mp4, ogg, wav, webm. You can also use
                                      the special names "best", "bestvideo",
-                                     "bestaudio", "worst".  By default, youtube-
-                                     dl will pick the best quality. Use commas
-                                     to download multiple audio formats, such as
-                                     -f
+                                     "bestaudio", "worst".  You can filter the
+                                     video results by putting a condition in
+                                     brackets, as in -f "best[height=720]" (or
+                                     -f "[filesize>10M]").  This works for
+                                     filesize, height, width, tbr, abr, and vbr
+                                     and the comparisons <, <=, >, >=, =, !=.
+                                     Formats for which the value is not known
+                                     are excluded unless you put a question mark
+                                     (?) after the operator. You can combine
+                                     format filters, so -f "[height <=?
+                                     720][tbr>500]" selects up to 720p videos
+                                     (or videos where the height is not known)
+                                     with a bitrate of at least 500 KBit/s. By
+                                     default, youtube-dl will pick the best
+                                     quality. Use commas to download multiple
+                                     audio formats, such as -f
                                      136/137/mp4/bestvideo,140/m4a/bestaudio.
                                      You can merge the video and audio of two
                                      formats into a single file using -f <video-
@@ -304,7 +316,8 @@ which means you can modify it, redistribute it or use it however you like.
 
 ## Authentication Options:
     -u, --username USERNAME          login with this account ID
-    -p, --password PASSWORD          account password
+    -p, --password PASSWORD          account password. If this option is left
+                                     out, youtube-dl will ask interactively.
     -2, --twofactor TWOFACTOR        two-factor auth code
     -n, --netrc                      use .netrc authentication data
     --video-password PASSWORD        video password (vimeo, smotri)
@@ -444,9 +457,15 @@ Apparently YouTube requires you to pass a CAPTCHA test if you download too much.
 
 Once the video is fully downloaded, use any video player, such as [vlc](http://www.videolan.org) or [mplayer](http://www.mplayerhq.hu/).
 
-### The links provided by youtube-dl -g are not working anymore
+### I extracted a video URL with -g, but it does not play on another machine / in my web browser.
 
-The URLs youtube-dl outputs require the downloader to have the correct cookies. Use the `--cookies` option to write the required cookies into a file, and advise your downloader to read cookies from that file. Some sites also require a common user agent to be used, use `--dump-user-agent` to see the one in use by youtube-dl.
+It depends a lot on the service. In many cases, requests for the video (to download/play it) must come from the same IP address and with the same cookies. Use the `--cookies` option to write the required cookies into a file, and advise your downloader to read cookies from that file. Some sites also require a common user agent to be used; use `--dump-user-agent` to see the one in use by youtube-dl.
+
+It may be beneficial to use IPv6; in some cases, the restrictions are only applied to IPv4. Some services (sometimes only for a subset of videos) do not restrict the video URL by IP address, cookie, or user-agent, but these are the exception rather than the rule.
+
+Please bear in mind that some URL protocols are **not** supported by browsers out of the box, including RTMP. If you are using -g, your own downloader must support these as well.
+
+If you want to play the video on a machine that is not running youtube-dl, you can relay the video content from the machine that runs youtube-dl. You can use `-o -` to let youtube-dl stream a video to stdout, or simply allow the player to download the files written by youtube-dl in turn.
 
 ### ERROR: no fmt_url_map or conn information found in video info
 
@@ -481,6 +500,10 @@ To make a different directory work - either for ffmpeg, or for youtube-dl, or fo
 
 From then on, after restarting your shell, you will be able to access both youtube-dl and ffmpeg (and youtube-dl will be able to find ffmpeg) by simply typing `youtube-dl` or `ffmpeg`, no matter what directory you're in.
 
+### How do I put downloads into a specific folder?
+
+Use the `-o` option to specify an [output template](#output-template), for example `-o "/home/user/videos/%(title)s-%(id)s.%(ext)s"`. If you want this for all of your downloads, put the option into your [configuration file](#configuration).
+
 ### How can I detect whether a given URL is supported by youtube-dl?
 
For one, have a look at the [list of supported sites](docs/supportedsites). Note that it can sometimes happen that the site changes its URL scheme (say, from http://example.com/v/1234567 to http://example.com/watch/1234567) and youtube-dl reports a URL of a service in that list as unsupported. In that case, simply report a bug.
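
As a quick illustration of the two README additions above (a sketch using youtube-dl's embedding API; the URL and output path are placeholders):

```python
import youtube_dl

ydl_opts = {
    # the new bracket filters: up to 720p (or unknown height),
    # with a total bitrate of at least 500 KBit/s
    'format': 'best[height<=?720][tbr>500]',
    # download into a specific folder via the output template
    'outtmpl': '/home/user/videos/%(title)s-%(id)s.%(ext)s',
}
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    ydl.download(['http://www.example.com/watch?v=placeholder'])
```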
diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py
index 85d87f2c31e803aff668f1d71a6bbdfba33cdcd8..678b9f7d15ee66892ef010552753f0a9577b67a7 100644
--- a/test/test_YoutubeDL.py
+++ b/test/test_YoutubeDL.py
@@ -281,6 +281,61 @@ class TestFormatSelection(unittest.TestCase):
             downloaded = ydl.downloaded_info_dicts[0]
             self.assertEqual(downloaded['format_id'], f1id)
 
+    def test_format_filtering(self):
+        formats = [
+            {'format_id': 'A', 'filesize': 500, 'width': 1000},
+            {'format_id': 'B', 'filesize': 1000, 'width': 500},
+            {'format_id': 'C', 'filesize': 1000, 'width': 400},
+            {'format_id': 'D', 'filesize': 2000, 'width': 600},
+            {'format_id': 'E', 'filesize': 3000},
+            {'format_id': 'F'},
+            {'format_id': 'G', 'filesize': 1000000},
+        ]
+        for f in formats:
+            f['url'] = 'http://_/'
+            f['ext'] = 'unknown'
+        info_dict = _make_result(formats)
+
+        ydl = YDL({'format': 'best[filesize<3000]'})
+        ydl.process_ie_result(info_dict)
+        downloaded = ydl.downloaded_info_dicts[0]
+        self.assertEqual(downloaded['format_id'], 'D')
+
+        ydl = YDL({'format': 'best[filesize<=3000]'})
+        ydl.process_ie_result(info_dict)
+        downloaded = ydl.downloaded_info_dicts[0]
+        self.assertEqual(downloaded['format_id'], 'E')
+
+        ydl = YDL({'format': 'best[filesize <= ? 3000]'})
+        ydl.process_ie_result(info_dict)
+        downloaded = ydl.downloaded_info_dicts[0]
+        self.assertEqual(downloaded['format_id'], 'F')
+
+        ydl = YDL({'format': 'best [filesize = 1000] [width>450]'})
+        ydl.process_ie_result(info_dict)
+        downloaded = ydl.downloaded_info_dicts[0]
+        self.assertEqual(downloaded['format_id'], 'B')
+
+        ydl = YDL({'format': 'best [filesize = 1000] [width!=450]'})
+        ydl.process_ie_result(info_dict)
+        downloaded = ydl.downloaded_info_dicts[0]
+        self.assertEqual(downloaded['format_id'], 'C')
+
+        ydl = YDL({'format': '[filesize>?1]'})
+        ydl.process_ie_result(info_dict)
+        downloaded = ydl.downloaded_info_dicts[0]
+        self.assertEqual(downloaded['format_id'], 'G')
+
+        ydl = YDL({'format': '[filesize<1M]'})
+        ydl.process_ie_result(info_dict)
+        downloaded = ydl.downloaded_info_dicts[0]
+        self.assertEqual(downloaded['format_id'], 'E')
+
+        ydl = YDL({'format': '[filesize<1MiB]'})
+        ydl.process_ie_result(info_dict)
+        downloaded = ydl.downloaded_info_dicts[0]
+        self.assertEqual(downloaded['format_id'], 'G')
+
     def test_add_extra_info(self):
         test_dict = {
             'extractor': 'Foo',
diff --git a/test/test_all_urls.py b/test/test_all_urls.py
index bd4fe17bf2c0f37b4f9ac2b291a8f7d664f74534..e66264b4b16147cae6e41d329bf07dcc31ff83e4 100644
--- a/test/test_all_urls.py
+++ b/test/test_all_urls.py
@@ -14,7 +14,6 @@ from test.helper import gettestcases
 from youtube_dl.extractor import (
     FacebookIE,
     gen_extractors,
-    TwitchIE,
     YoutubeIE,
 )
 
@@ -72,18 +71,6 @@ class TestAllURLsMatching(unittest.TestCase):
         self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url'])
         self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url'])
 
-    def test_twitch_channelid_matching(self):
-        self.assertTrue(TwitchIE.suitable('twitch.tv/vanillatv'))
-        self.assertTrue(TwitchIE.suitable('www.twitch.tv/vanillatv'))
-        self.assertTrue(TwitchIE.suitable('http://www.twitch.tv/vanillatv'))
-        self.assertTrue(TwitchIE.suitable('http://www.twitch.tv/vanillatv/'))
-
-    def test_twitch_videoid_matching(self):
-        self.assertTrue(TwitchIE.suitable('http://www.twitch.tv/vanillatv/b/328087483'))
-
-    def test_twitch_chapterid_matching(self):
-        self.assertTrue(TwitchIE.suitable('http://www.twitch.tv/tsm_theoddone/c/2349361'))
-
     def test_youtube_extract(self):
         assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id)
         assertExtractId('http://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc')
@@ -115,8 +102,6 @@ class TestAllURLsMatching(unittest.TestCase):
         self.assertMatch(':ythistory', ['youtube:history'])
         self.assertMatch(':thedailyshow', ['ComedyCentralShows'])
         self.assertMatch(':tds', ['ComedyCentralShows'])
-        self.assertMatch(':colbertreport', ['ComedyCentralShows'])
-        self.assertMatch(':cr', ['ComedyCentralShows'])
 
     def test_vimeo_matching(self):
         self.assertMatch('http://vimeo.com/channels/tributes', ['vimeo:channel'])
diff --git a/test/test_utils.py b/test/test_utils.py
index 399cdfdbd4baddf09866be9972f5ef36d2cf67cc..bdd7f268af46d5eaaef35507f839989a8343316b 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -28,6 +28,7 @@ from youtube_dl.utils import (
     fix_xml_ampersands,
     InAdvancePagedList,
     intlist_to_bytes,
+    is_html,
     js_to_json,
     limit_length,
     OnDemandPagedList,
@@ -148,6 +149,7 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(unified_strdate('8/7/2009'), '20090708')
         self.assertEqual(unified_strdate('Dec 14, 2012'), '20121214')
         self.assertEqual(unified_strdate('2012/10/11 01:56:38 +0000'), '20121011')
+        self.assertEqual(unified_strdate('1968 12 10'), '19681210')
         self.assertEqual(unified_strdate('1968-12-10'), '19681210')
         self.assertEqual(unified_strdate('28/01/2014 21:00:00 +0100'), '20140128')
         self.assertEqual(
@@ -416,5 +418,21 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4')
         self.assertTrue(age_restricted(18, 14))
         self.assertFalse(age_restricted(18, 18))
 
+    def test_is_html(self):
+        self.assertFalse(is_html(b'\x49\x44\x43<html'))
+        self.assertTrue(is_html(b'<!DOCTYPE foo>\xaaa'))
+        self.assertTrue(is_html(  # UTF-8 with BOM
+            b'\xef\xbb\xbf<!DOCTYPE foo>\xaaa'))
+        self.assertTrue(is_html(  # UTF-16-LE
+            b'\xff\xfe<\x00h\x00t\x00m\x00l\x00>\x00\xe4\x00'
+        ))
+        self.assertTrue(is_html(  # UTF-16-BE
+            b'\xfe\xff\x00<\x00h\x00t\x00m\x00l\x00>\x00\xe4'
+        ))
+        self.assertTrue(is_html(  # UTF-32-BE
+            b'\x00\x00\xFE\xFF\x00\x00\x00<\x00\x00\x00h\x00\x00\x00t\x00\x00\x00m\x00\x00\x00l\x00\x00\x00>\x00\x00\x00\xe4'))
+        self.assertTrue(is_html(  # UTF-32-LE
+            b'\xFF\xFE\x00\x00<\x00\x00\x00h\x00\x00\x00t\x00\x00\x00m\x00\x00\x00l\x00\x00\x00>\x00\x00\x00\xe4\x00\x00\x00'))
+
 if __name__ == '__main__':
     unittest.main()
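
The new cases above pin down the expected behavior of `is_html`: strip a Unicode BOM if present, decode accordingly, then look for a leading tag. A minimal sketch that satisfies these tests (not necessarily the actual `youtube_dl.utils` implementation):

```python
import re

# BOMs must be checked longest-first so UTF-32 is not mistaken for UTF-16
BOMS = [
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
    (b'\xef\xbb\xbf', 'utf-8'),
]

def is_html(first_bytes):
    encoding = 'utf-8'
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            first_bytes = first_bytes[len(bom):]
            encoding = enc
            break
    decoded = first_bytes.decode(encoding, 'replace')
    return re.match(r'\s*<', decoded) is not None
```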
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index 772fddd4542f6726b57878a950410c90ee1a640c..8ef74e4145ed79263c7fe2347430dd56fbce3f34 100755
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -10,6 +10,7 @@ import io
 import itertools
 import json
 import locale
+import operator
 import os
 import platform
 import re
@@ -49,6 +50,7 @@ from .utils import (
     make_HTTPS_handler,
     MaxDownloadsReached,
     PagedList,
+    parse_filesize,
     PostProcessingError,
     platform_name,
     preferredencoding,
@@ -768,7 +770,59 @@ class YoutubeDL(object):
         else:
             raise Exception('Invalid result type: %s' % result_type)
 
+    def _apply_format_filter(self, format_spec, available_formats):
+        """Return a tuple of the remaining format_spec and the filtered formats."""
+
+        OPERATORS = {
+            '<': operator.lt,
+            '<=': operator.le,
+            '>': operator.gt,
+            '>=': operator.ge,
+            '=': operator.eq,
+            '!=': operator.ne,
+        }
+        operator_rex = re.compile(r'''(?x)\s*\[
+            (?P<key>width|height|tbr|abr|vbr|filesize)
+            \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
+            (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
+            \]$
+            ''' % '|'.join(map(re.escape, OPERATORS.keys())))
+        m = operator_rex.search(format_spec)
+        if not m:
+            raise ValueError('Invalid format specification %r' % format_spec)
+
+        try:
+            comparison_value = int(m.group('value'))
+        except ValueError:
+            comparison_value = parse_filesize(m.group('value'))
+            if comparison_value is None:
+                comparison_value = parse_filesize(m.group('value') + 'B')
+            if comparison_value is None:
+                raise ValueError(
+                    'Invalid value %r in format specification %r' % (
+                        m.group('value'), format_spec))
+        op = OPERATORS[m.group('op')]
+
+        def _filter(f):
+            actual_value = f.get(m.group('key'))
+            if actual_value is None:
+                return m.group('none_inclusive')
+            return op(actual_value, comparison_value)
+        new_formats = [f for f in available_formats if _filter(f)]
+
+        new_format_spec = format_spec[:-len(m.group(0))]
+        if not new_format_spec:
+            new_format_spec = 'best'
+
+        return (new_format_spec, new_formats)
+
     def select_format(self, format_spec, available_formats):
+        while format_spec.endswith(']'):
+            format_spec, available_formats = self._apply_format_filter(
+                format_spec, available_formats)
+        if not available_formats:
+            return None
+
         if format_spec == 'best' or format_spec is None:
             return available_formats[-1]
         elif format_spec == 'worst':
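
Standalone, the new selection logic peels bracketed filters off the right end of the spec, narrowing the candidate list each time, and falls back to `best` once the spec is exhausted. A simplified sketch on toy data (numeric values only; the real code also accepts file sizes such as `10M` via `parse_filesize`):

```python
import operator
import re

OPERATORS = {
    '<': operator.lt, '<=': operator.le,
    '>': operator.gt, '>=': operator.ge,
    '=': operator.eq, '!=': operator.ne,
}
FILTER_RE = re.compile(
    r'\s*\[(?P<key>width|height|tbr|abr|vbr|filesize)'
    r'\s*(?P<op><=|>=|!=|<|>|=)(?P<none_inclusive>\s*\?)?\s*'
    r'(?P<value>[0-9.]+)\]$')

def select(format_spec, formats):
    # formats are assumed sorted worst-to-best, as in YoutubeDL
    while format_spec.endswith(']'):
        m = FILTER_RE.search(format_spec)
        if not m:
            raise ValueError('Invalid format specification %r' % format_spec)
        op, value = OPERATORS[m.group('op')], float(m.group('value'))

        def keep(f):
            actual = f.get(m.group('key'))
            if actual is None:  # unknown values pass only with '?'
                return bool(m.group('none_inclusive'))
            return op(actual, value)

        formats = [f for f in formats if keep(f)]
        format_spec = format_spec[:-len(m.group(0))] or 'best'
    if not formats:
        return None
    return formats[-1] if format_spec == 'best' else formats[0]

formats = [
    {'format_id': 'A', 'height': 480, 'tbr': 400},
    {'format_id': 'B', 'height': 720, 'tbr': 800},
    {'format_id': 'C', 'height': 1080, 'tbr': 2500},
]
print(select('best[height<=?720][tbr>500]', formats)['format_id'])  # B
```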
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 5a1c2f69b1c91186e9228cc2cf4b622fafc4950f..9ab90ac62dc4bea4195a011abcdf05e0b0c7cf0e 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -1,6 +1,7 @@
 from __future__ import unicode_literals
 
 from .abc import ABCIE
+from .abc7news import Abc7NewsIE
 from .academicearth import AcademicEarthCourseIE
 from .addanime import AddAnimeIE
 from .adobetv import AdobeTVIE
@@ -175,6 +176,7 @@ from .goshgay import GoshgayIE
 from .grooveshark import GroovesharkIE
 from .groupon import GrouponIE
 from .hark import HarkIE
+from .hearthisat import HearThisAtIE
 from .heise import HeiseIE
 from .hellporno import HellPornoIE
 from .helsinki import HelsinkiIE
@@ -226,6 +228,7 @@ from .livestream import (
     LivestreamOriginalIE,
     LivestreamShortenerIE,
 )
+from .lnkgo import LnkGoIE
 from .lrt import LRTIE
 from .lynda import (
     LyndaIE,
@@ -295,6 +298,7 @@ from .nowness import NownessIE
 from .nowvideo import NowVideoIE
 from .npo import (
     NPOIE,
+    NPOLiveIE,
     TegenlichtVproIE,
 )
 from .nrk import (
@@ -406,6 +410,7 @@ from .stanfordoc import StanfordOpenClassroomIE
 from .steam import SteamIE
 from .streamcloud import StreamcloudIE
 from .streamcz import StreamCZIE
+from .streetvoice import StreetVoiceIE
 from .sunporno import SunPornoIE
 from .swrmediathek import SWRMediathekIE
 from .syfy import SyfyIE
@@ -427,6 +432,7 @@ from .telemb import TeleMBIE
 from .teletask import TeleTaskIE
 from .tenplay import TenPlayIE
 from .testurl import TestURLIE
+from .testtube import TestTubeIE
 from .tf1 import TF1IE
 from .theonion import TheOnionIE
 from .theplatform import ThePlatformIE
@@ -455,7 +461,14 @@ from .tvigle import TvigleIE
 from .tvp import TvpIE, TvpSeriesIE
 from .tvplay import TVPlayIE
 from .twentyfourvideo import TwentyFourVideoIE
-from .twitch import TwitchIE
+from .twitch import (
+    TwitchVideoIE,
+    TwitchChapterIE,
+    TwitchVodIE,
+    TwitchProfileIE,
+    TwitchPastBroadcastsIE,
+    TwitchStreamIE,
+)
 from .ubu import UbuIE
 from .udemy import (
     UdemyIE,
diff --git a/youtube_dl/extractor/abc7news.py b/youtube_dl/extractor/abc7news.py
new file mode 100644
index 0000000..c04949c
--- /dev/null
+++ b/youtube_dl/extractor/abc7news.py
@@ -0,0 +1,68 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import parse_iso8601
+
+
+class Abc7NewsIE(InfoExtractor):
+    _VALID_URL = r'https?://abc7news\.com(?:/[^/]+/(?P<display_id>[^/]+))?/(?P<id>\d+)'
+    _TESTS = [
+        {
+            'url': 'http://abc7news.com/entertainment/east-bay-museum-celebrates-vintage-synthesizers/472581/',
+            'info_dict': {
+                'id': '472581',
+                'display_id': 'east-bay-museum-celebrates-vintage-synthesizers',
+                'ext': 'mp4',
+                'title': 'East Bay museum celebrates history of synthesized music',
+                'description': 'md5:a4f10fb2f2a02565c1749d4adbab4b10',
+                'thumbnail': 're:^https?://.*\.jpg$',
+                'timestamp': 1421123075,
+                'upload_date': '20150113',
+                'uploader': 'Jonathan Bloom',
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://abc7news.com/472581',
+            'only_matching': True,
+        },
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id') or video_id
+
+        webpage = self._download_webpage(url, display_id)
+
+        m3u8 = self._html_search_meta(
+            'contentURL', webpage, 'm3u8 url', fatal=True)
+
+        formats = self._extract_m3u8_formats(m3u8, display_id, 'mp4')
+        self._sort_formats(formats)
+
+        title = self._og_search_title(webpage).strip()
+        description = self._og_search_description(webpage).strip()
+        thumbnail = self._og_search_thumbnail(webpage)
+        timestamp = parse_iso8601(self._search_regex(
+            r'<div class="meta">\s*<time class="timeago" datetime="([^"]+)">',
+            webpage, 'upload date', fatal=False))
+        uploader = self._search_regex(
+            r'rel="author">([^<]+)</a>',
+            webpage, 'uploader', default=None)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'uploader': uploader,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py
index 72e83bfc2c1b4fa45080538e4c87339a7a8c530a..5db1941b339a0e6e9bde01ec28e337478f92ce57 100644
--- a/youtube_dl/extractor/atresplayer.py
+++ b/youtube_dl/extractor/atresplayer.py
@@ -4,9 +4,12 @@ import time
 import hmac
 
 from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
     compat_str,
+    compat_urllib_parse,
     compat_urllib_request,
+)
+from ..utils import (
     int_or_none,
     float_or_none,
     xpath_text,
@@ -44,6 +47,33 @@ class AtresPlayerIE(InfoExtractor):
     _PLAYER_URL_TEMPLATE = 'https://servicios.atresplayer.com/episode/getplayer.json?episodePk=%s'
     _EPISODE_URL_TEMPLATE = 'http://www.atresplayer.com/episodexml/%s'
 
+    _LOGIN_URL = 'https://servicios.atresplayer.com/j_spring_security_check'
+
+    def _real_initialize(self):
+        self._login()
+
+    def _login(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            return
+
+        login_form = {
+            'j_username': username,
+            'j_password': password,
+        }
+
+        request = compat_urllib_request.Request(
+            self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
+        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+        response = self._download_webpage(
+            request, None, 'Logging in as %s' % username)
+
+        error = self._html_search_regex(
+            r'(?s)<ul class="list_error">(.+?)</ul>', response, 'error', default=None)
+        if error:
+            raise ExtractorError(
+                'Unable to login: %s' % error, expected=True)
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py
index b45d68a61416b66e2ea8e9585d8339c62da46000..aea0263d6b681ba33e66d20cc1a247213299540f 100644
--- a/youtube_dl/extractor/bandcamp.py
+++ b/youtube_dl/extractor/bandcamp.py
@@ -161,7 +161,8 @@ class BandcampAlbumIE(InfoExtractor):
         entries = [
             self.url_result(compat_urlparse.urljoin(url, t_path), ie=BandcampIE.ie_key())
             for t_path in tracks_paths]
-        title = self._search_regex(r'album_title : "(.*?)"', webpage, 'title')
+        title = self._search_regex(
+            r'album_title\s*:\s*"(.*?)"', webpage, 'title', fatal=False)
         return {
             '_type': 'playlist',
             'id': playlist_id,
diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py
index 93e8d0de355d7ccb239f06aee956468d33cb43d9..90ea074387ef6afe4aaa87a41c13ec6cf5a1aa7b 100644
--- a/youtube_dl/extractor/cnn.py
+++ b/youtube_dl/extractor/cnn.py
@@ -51,7 +51,7 @@ class CNNIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         path = mobj.group('path')
         page_title = mobj.group('title')
-        info_url = 'http://cnn.com/video/data/3.0/%s/index.xml' % path
+        info_url = 'http://edition.cnn.com/video/data/3.0/%s/index.xml' % path
         info = self._download_xml(info_url, page_title)
 
         formats = []
@@ -143,13 +143,13 @@ class CNNArticleIE(InfoExtractor):
     _VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!video/)'
     _TEST = {
         'url': 'http://www.cnn.com/2014/12/21/politics/obama-north-koreas-hack-not-war-but-cyber-vandalism/',
-        'md5': '275b326f85d80dff7592a9820f5dc887',
+        'md5': '689034c2a3d9c6dc4aa72d65a81efd01',
         'info_dict': {
-            'id': 'bestoftv/2014/12/21/sotu-crowley-president-obama-north-korea-not-going-to-be-intimidated.cnn',
+            'id': 'bestoftv/2014/12/21/ip-north-korea-obama.cnn',
             'ext': 'mp4',
-            'title': 'Obama: We\'re not going to be intimidated',
-            'description': 'md5:e735586f3dc936075fa654a4d91b21f9',
-            'upload_date': '20141220',
+            'title': 'Obama: Cyberattack not an act of war',
+            'description': 'md5:51ce6750450603795cad0cdfbd7d05c5',
+            'upload_date': '20141221',
         },
         'add_ie': ['CNN'],
     }
diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py
index 8d27af5e57348e56a924d1d633df8799343245e4..15ca361f0bb4ea179b68f32f4854f26409d0c93a 100644
--- a/youtube_dl/extractor/comedycentral.py
+++ b/youtube_dl/extractor/comedycentral.py
@@ -34,12 +34,12 @@ class ComedyCentralIE(MTVServicesInfoExtractor):
 
 class ComedyCentralShowsIE(MTVServicesInfoExtractor):
     IE_DESC = 'The Daily Show / The Colbert Report'
-    # urls can be abbreviations like :thedailyshow or :colbert
+    # urls can be abbreviations like :thedailyshow
     # urls for episodes like:
     # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
     #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
     #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
-    _VALID_URL = r'''(?x)^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
+    _VALID_URL = r'''(?x)^(:(?P<shortname>tds|thedailyshow)
                       |https?://(:www\.)?
                           (?P<showname>thedailyshow|thecolbertreport)\.(?:cc\.)?com/
                          ((?:full-)?episodes/(?:[0-9a-z]{6}/)?(?P<episode>.*)|
diff --git a/youtube_dl/extractor/fourtube.py b/youtube_dl/extractor/fourtube.py
index 7187e075291af16151e28b8041d7e06e49887fa7..b2284ab01cad03fa3152fbc0a2edb70df2ab020f 100644
--- a/youtube_dl/extractor/fourtube.py
+++ b/youtube_dl/extractor/fourtube.py
@@ -7,10 +7,9 @@ from ..compat import (
     compat_urllib_request,
 )
 from ..utils import (
-    clean_html,
     parse_duration,
+    parse_iso8601,
     str_to_int,
-    unified_strdate,
 )
 
 
@@ -28,68 +27,81 @@ class FourTubeIE(InfoExtractor):
             'uploader': 'WCP Club',
             'uploader_id': 'wcp-club',
             'upload_date': '20131031',
+            'timestamp': 1383263892,
             'duration': 583,
+            'view_count': int,
+            'like_count': int,
+            'categories': list,
         }
     }
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        webpage_url = 'http://www.4tube.com/videos/' + video_id
-        webpage = self._download_webpage(webpage_url, video_id)
+        webpage = self._download_webpage(url, video_id)
 
-        self.report_extraction(video_id)
+        title = self._html_search_meta('name', webpage)
+        timestamp = parse_iso8601(self._html_search_meta(
+            'uploadDate', webpage))
+        thumbnail = self._html_search_meta('thumbnailUrl', webpage)
+        uploader_id = self._html_search_regex(
+            r'<a class="img-avatar" href="[^"]+/channels/([^/"]+)" title="Go to [^"]+ page">',
+            webpage, 'uploader id')
+        uploader = self._html_search_regex(
+            r'<a class="img-avatar" href="[^"]+/channels/[^/"]+" title="Go to ([^"]+) page">',
+            webpage, 'uploader')
 
-        playlist_json = self._html_search_regex(r'var playerConfigPlaylist\s+=\s+([^;]+)', webpage, 'Playlist')
-        media_id = self._search_regex(r'idMedia:\s*(\d+)', playlist_json, 'Media Id')
-        sources = self._search_regex(r'sources:\s*\[([^\]]*)\]', playlist_json, 'Sources').split(',')
-        title = self._search_regex(r'title:\s*"([^"]*)', playlist_json, 'Title')
-        thumbnail_url = self._search_regex(r'image:\s*"([^"]*)', playlist_json, 'Thumbnail', fatal=False)
+        categories_html = self._search_regex(
+            r'(?s)><i class="icon icon-tag"></i>\s*Categories / Tags\s*.*?<ul class="list">(.*?)</ul>',
+            webpage, 'categories', fatal=False)
+        categories = None
+        if categories_html:
+            categories = [
+                c.strip() for c in re.findall(
+                    r'(?s)<li><a.*?>(.*?)</a>', categories_html)]
 
-        uploader_str = self._search_regex(r'<span>Uploaded by</span>(.*?)<span>', webpage, 'uploader', fatal=False)
-        mobj = re.search(r'<a href="/sites/(?P<id>[^"]+)"><strong>(?P<name>[^<]+)</strong></a>', uploader_str)
-        (uploader, uploader_id) = (mobj.group('name'), mobj.group('id')) if mobj else (clean_html(uploader_str), None)
+        view_count = str_to_int(self._search_regex(
+            r'<meta itemprop="interactionCount" content="UserPlays:([0-9,]+)">',
+            webpage, 'view count', fatal=False))
+        like_count = str_to_int(self._search_regex(
+            r'<meta itemprop="interactionCount" content="UserLikes:([0-9,]+)">',
+            webpage, 'like count', fatal=False))
+        duration = parse_duration(self._html_search_meta('duration', webpage))
 
-        upload_date = None
-        view_count = None
-        duration = None
-        description = self._html_search_meta('description', webpage, 'description')
-        if description:
-            upload_date = self._search_regex(r'Published Date: (\d{2} [a-zA-Z]{3} \d{4})', description, 'upload date',
-                                             fatal=False)
-            if upload_date:
-                upload_date = unified_strdate(upload_date)
-            view_count = self._search_regex(r'Views: ([\d,\.]+)', description, 'view count', fatal=False)
-            if view_count:
-                view_count = str_to_int(view_count)
-            duration = parse_duration(self._search_regex(r'Length: (\d+m\d+s)', description, 'duration', fatal=False))
+        params_js = self._search_regex(
+            r'\$\.ajax\(url,\ opts\);\s*\}\s*\}\)\(([0-9,\[\] ]+)\)',
+            webpage, 'initialization parameters'
+        )
+        params = self._parse_json('[%s]' % params_js, video_id)
+        media_id = params[0]
+        sources = ['%s' % p for p in params[2]]
 
-        token_url = "http://tkn.4tube.com/{0}/desktop/{1}".format(media_id, "+".join(sources))
+        token_url = 'http://tkn.4tube.com/{0}/desktop/{1}'.format(
+            media_id, '+'.join(sources))
         headers = {
             b'Content-Type': b'application/x-www-form-urlencoded',
             b'Origin': b'http://www.4tube.com',
         }
         token_req = compat_urllib_request.Request(token_url, b'{}', headers)
         tokens = self._download_json(token_req, video_id)
-
         formats = [{
             'url': tokens[format]['token'],
             'format_id': format + 'p',
             'resolution': format + 'p',
             'quality': int(format),
         } for format in sources]
-
         self._sort_formats(formats)
 
         return {
             'id': video_id,
             'title': title,
             'formats': formats,
-            'thumbnail': thumbnail_url,
+            'categories': categories,
+            'thumbnail': thumbnail,
             'uploader': uploader,
             'uploader_id': uploader_id,
-            'upload_date': upload_date,
+            'timestamp': timestamp,
+            'like_count': like_count,
             'view_count': view_count,
             'duration': duration,
             'age_limit': 18,
-            'webpage_url': webpage_url,
         }
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 7a5bf939237ff45731fd3befca5ad0b7dfc0df1f..b893d8149f2c63e081341d2dec2d71ba05f3e64c 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -17,6 +17,7 @@ from ..utils import (
     ExtractorError,
     float_or_none,
     HEADRequest,
+    is_html,
     orderedSet,
     parse_xml,
     smuggle_url,
@@ -647,7 +648,7 @@ class GenericIE(InfoExtractor):
         # Maybe it's a direct link to a video?
         # Be careful not to download the whole thing!
         first_bytes = full_response.read(512)
-        if not re.match(r'^\s*<', first_bytes.decode('utf-8', 'replace')):
+        if not is_html(first_bytes):
             self._downloader.report_warning(
                 'URL could be a direct video link, returning it as such.')
             upload_date = unified_strdate(
diff --git a/youtube_dl/extractor/hearthisat.py b/youtube_dl/extractor/hearthisat.py
new file mode 100644
index 0000000..a19b31a
--- /dev/null
+++ b/youtube_dl/extractor/hearthisat.py
@@ -0,0 +1,117 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_request,
+    compat_urlparse,
+)
+from ..utils import (
+    HEADRequest,
+    str_to_int,
+    urlencode_postdata,
+    urlhandle_detect_ext,
+)
+
+
+class HearThisAtIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?hearthis\.at/(?P<artist>[^/]+)/(?P<title>[A-Za-z0-9\-]+)/?$'
+    _PLAYLIST_URL = 'https://hearthis.at/playlist.php'
+    _TEST = {
+        'url': 'https://hearthis.at/moofi/dr-kreep',
+        'md5': 'ab6ec33c8fed6556029337c7885eb4e0',
+        'info_dict': {
+            'id': '150939',
+            'ext': 'wav',
+            'title': 'Moofi - Dr. Kreep',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'timestamp': 1421564134,
+            'description': 'Creepy Patch. Mutable Instruments Braids Vowel + Formant Mode.',
+            'upload_date': '20150118',
+            'comment_count': int,
+            'view_count': int,
+            'like_count': int,
+            'duration': 71,
+            'categories': ['Experimental'],
+        }
+    }
+
+    def _real_extract(self, url):
+        m = re.match(self._VALID_URL, url)
+        display_id = '{artist:s} - {title:s}'.format(**m.groupdict())
+
+        webpage = self._download_webpage(url, display_id)
+        track_id = self._search_regex(
+            r'intTrackId\s*=\s*(\d+)', webpage, 'track ID')
+
+        payload = urlencode_postdata({'tracks[]': track_id})
+        req = compat_urllib_request.Request(self._PLAYLIST_URL, payload)
+        req.add_header('Content-type', 'application/x-www-form-urlencoded')
+
+        track = self._download_json(req, track_id, 'Downloading playlist')[0]
+        title = '{artist:s} - {title:s}'.format(**track)
+
+        categories = None
+        if track.get('category'):
+            categories = [track['category']]
+
+        description = self._og_search_description(webpage)
+        thumbnail = self._og_search_thumbnail(webpage)
+
+        meta_span = r'<span[^>]+class="%s".*?</i>([^<]+)</span>'
+        view_count = str_to_int(self._search_regex(
+            meta_span % 'plays_count', webpage, 'view count', fatal=False))
+        like_count = str_to_int(self._search_regex(
+            meta_span % 'likes_count', webpage, 'like count', fatal=False))
+        comment_count = str_to_int(self._search_regex(
+            meta_span % 'comment_count', webpage, 'comment count', fatal=False))
+        duration = str_to_int(self._search_regex(
+            r'data-length="(\d+)', webpage, 'duration', fatal=False))
+        timestamp = str_to_int(self._search_regex(
+            r'<span[^>]+class="calctime"[^>]+data-time="(\d+)', webpage, 'timestamp', fatal=False))
+
+        formats = []
+        mp3_url = self._search_regex(
+            r'(?s)<a class="player-link"\s+(?:[a-zA-Z0-9_:-]+="[^"]+"\s+)*?data-mp3="([^"]+)"',
+            webpage, 'mp3 URL', fatal=False)
+        if mp3_url:
+            formats.append({
+                'format_id': 'mp3',
+                'vcodec': 'none',
+                'acodec': 'mp3',
+                'url': mp3_url,
+            })
+        download_path = self._search_regex(
+            r'<a class="[^"]*download_fct[^"]*"\s+href="([^"]+)"',
+            webpage, 'download URL', default=None)
+        if download_path:
+            download_url = compat_urlparse.urljoin(url, download_path)
+            ext_req = HEADRequest(download_url)
+            ext_handle = self._request_webpage(
+                ext_req, display_id, note='Determining extension')
+            ext = urlhandle_detect_ext(ext_handle)
+            formats.append({
+                'format_id': 'download',
+                'vcodec': 'none',
+                'ext': ext,
+                'url': download_url,
+                'preference': 2,  # Usually better quality
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': track_id,
+            'display_id': display_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': thumbnail,
+            'description': description,
+            'duration': duration,
+            'timestamp': timestamp,
+            'view_count': view_count,
+            'comment_count': comment_count,
+            'like_count': like_count,
+            'categories': categories,
+        }
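
For the direct download link, the container format is not apparent from the URL, so the extractor issues a HEAD request and derives the extension from the response (`HEADRequest` plus `urlhandle_detect_ext`). A rough standalone equivalent, assuming the server's Content-Type header is authoritative (Python 3 urllib):

```python
import mimetypes
from urllib.request import Request, urlopen

def detect_ext(url):
    # HEAD avoids downloading the body just to learn the file type
    resp = urlopen(Request(url, method='HEAD'))
    ctype = resp.headers.get('Content-Type', '').split(';')[0].strip()
    guess = mimetypes.guess_extension(ctype)  # e.g. 'audio/x-wav' -> '.wav'
    return guess.lstrip('.') if guess else None
```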
diff --git a/youtube_dl/extractor/lnkgo.py b/youtube_dl/extractor/lnkgo.py
new file mode 100644
index 0000000..a8e3578
--- /dev/null
+++ b/youtube_dl/extractor/lnkgo.py
@@ -0,0 +1,124 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    js_to_json,
+    unified_strdate,
+)
+
+
+class LnkGoIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?lnkgo\.alfa\.lt/visi\-video/(?P<show>[^/]+)/ziurek\-(?P<display_id>[A-Za-z0-9\-]+)'
+    _TESTS = [{
+        'url': 'http://lnkgo.alfa.lt/visi-video/yra-kaip-yra/ziurek-yra-kaip-yra-162',
+        'info_dict': {
+            'id': '46712',
+            'ext': 'mp4',
+            'title': 'Yra kaip yra',
+            'upload_date': '20150107',
+            'description': 'md5:d82a5e36b775b7048617f263a0e3475e',
+            'age_limit': 7,
+            'duration': 3019,
+            'thumbnail': 're:^https?://.*\.jpg$'
+        },
+        'params': {
+            'skip_download': True,  # HLS download
+        },
+    }, {
+        'url': 'http://lnkgo.alfa.lt/visi-video/aktualai-pratesimas/ziurek-nerdas-taiso-kompiuteri-2',
+        'info_dict': {
+            'id': '47289',
+            'ext': 'mp4',
+            'title': 'Nėrdas: Kompiuterio Valymas',
+            'upload_date': '20150113',
+            'description': 'md5:7352d113a242a808676ff17e69db6a69',
+            'age_limit': 18,
+            'duration': 346,
+            'thumbnail': 're:^https?://.*\.jpg$'
+        },
+        'params': {
+            'skip_download': True,  # HLS download
+        },
+    }]
+    _AGE_LIMITS = {
+        'N-7': 7,
+        'N-14': 14,
+        'S': 18,
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('display_id')
+
+        webpage = self._download_webpage(
+            url, display_id, 'Downloading player webpage')
+
+        video_id = self._search_regex(
+            r'data-ep="([^"]+)"', webpage, 'video ID')
+        title = self._og_search_title(webpage)
+        description = self._og_search_description(webpage)
+
+        thumbnail_w = int_or_none(
+            self._og_search_property('image:width', webpage, 'thumbnail width', fatal=False))
+        thumbnail_h = int_or_none(
+            self._og_search_property('image:height', webpage, 'thumbnail height', fatal=False))
+        thumbnail = {
+            'url': self._og_search_thumbnail(webpage),
+        }
+        if thumbnail_w and thumbnail_h:
+            thumbnail.update({
+                'width': thumbnail_w,
+                'height': thumbnail_h,
+            })
+
+        upload_date = unified_strdate(self._search_regex(
+            r'class="meta-item\sair-time">.*?<strong>([^<]+)</strong>', webpage, 'upload date', fatal=False))
+        duration = int_or_none(self._search_regex(
+            r'VideoDuration = "([^"]+)"', webpage, 'duration', fatal=False))
+
+        pg_rating = self._search_regex(
+            r'pgrating="([^"]+)"', webpage, 'PG rating', fatal=False, default='')
+        age_limit = self._AGE_LIMITS.get(pg_rating.upper(), 0)
+
+        sources_js = self._search_regex(
+            r'(?s)sources:\s(\[.*?\]),', webpage, 'sources')
+        sources = self._parse_json(
+            sources_js, video_id, transform_source=js_to_json)
+
+        formats = []
+        for source in sources:
+            if source.get('provider') == 'rtmp':
+                m = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>[^/]+))/(?P<play_path>.+)$', source['file'])
+                if not m:
+                    continue
+                formats.append({
+                    'format_id': 'rtmp',
+                    'ext': 'flv',
+                    'url': m.group('url'),
+                    'play_path': m.group('play_path'),
+                    'page_url': url,
+                })
+            elif source.get('file', '').endswith('.m3u8'):
+                formats.append({
+                    'format_id': 'hls',
+                    'ext': source.get('type', 'mp4'),
+                    'url': source['file'],
+                })
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'formats': formats,
+            'thumbnails': [thumbnail],
+            'duration': duration,
+            'description': description,
+            'age_limit': age_limit,
+            'upload_date': upload_date,
+        }
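
For RTMP entries, the single `file` URL is split into the connection URL and the play path that an RTMP downloader expects. The regex in isolation (the sample URL layout is an assumption):

```python
import re

RTMP_RE = re.compile(
    r'^(?P<url>rtmp://[^/]+/(?P<app>[^/]+))/(?P<play_path>.+)$')

m = RTMP_RE.match('rtmp://stream.example.lt/lnk_vod/mp4:vod/46712.mp4')
print(m.group('url'))        # rtmp://stream.example.lt/lnk_vod
print(m.group('app'))        # lnk_vod
print(m.group('play_path'))  # mp4:vod/46712.mp4
```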
diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py
index 690c46b6a57be11edf36899b959318af5e482119..f840f65321997078859ab5f74682969ec4499359 100644
--- a/youtube_dl/extractor/nbc.py
+++ b/youtube_dl/extractor/nbc.py
@@ -6,6 +6,7 @@ import json
 from .common import InfoExtractor
 from ..compat import (
     compat_str,
+    compat_HTTPError,
 )
 from ..utils import (
     ExtractorError,
@@ -78,6 +79,16 @@ class NBCNewsIE(InfoExtractor):
             },
             'add_ie': ['ThePlatform'],
         },
+        {
+            'url': 'http://www.nbcnews.com/feature/dateline-full-episodes/full-episode-family-business-n285156',
+            'md5': 'fdbf39ab73a72df5896b6234ff98518a',
+            'info_dict': {
+                'id': 'Wjf9EDR3A_60',
+                'ext': 'mp4',
+                'title': 'FULL EPISODE: Family Business',
+                'description': 'md5:757988edbaae9d7be1d585eb5d55cc04',
+            },
+        },
     ]
 
     def _real_extract(self, url):
@@ -115,10 +126,19 @@ class NBCNewsIE(InfoExtractor):
                 if not base_url:
                     continue
                 playlist_url = base_url + '?form=MPXNBCNewsAPI'
-                all_videos = self._download_json(playlist_url, title)['videos']
 
                 try:
-                    info = next(v for v in all_videos if v['mpxId'] == mpxid)
+                    all_videos = self._download_json(playlist_url, title)
+                except ExtractorError as ee:
+                    if isinstance(ee.cause, compat_HTTPError):
+                        continue
+                    raise
+
+                if not all_videos or 'videos' not in all_videos:
+                    continue
+
+                try:
+                    info = next(v for v in all_videos['videos'] if v['mpxId'] == mpxid)
                     break
                 except StopIteration:
                     continue
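
The reworked loop makes playlist discovery fault-tolerant: an HTTP error or an unexpectedly shaped response now moves on to the next candidate URL instead of aborting the extraction. The same pattern in a generic, standalone sketch (placeholder URLs, standard library only):

```python
import json
from urllib.error import HTTPError
from urllib.request import urlopen

def first_matching_video(candidate_urls, mpxid):
    for url in candidate_urls:
        try:
            data = json.loads(urlopen(url).read().decode('utf-8'))
        except HTTPError:
            continue  # dead endpoint: try the next candidate
        if not data or 'videos' not in data:
            continue  # response did not have the expected shape
        for video in data['videos']:
            if video.get('mpxId') == mpxid:
                return video
    raise LookupError('no candidate URL yielded video %s' % mpxid)
```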
diff --git a/youtube_dl/extractor/ndtv.py b/youtube_dl/extractor/ndtv.py
index 95e7d63aade1edbc6a0c300bcc18168d707f8716..2a1ca80df797f0abe63cc6327c5e283965865f70 100644
--- a/youtube_dl/extractor/ndtv.py
+++ b/youtube_dl/extractor/ndtv.py
@@ -27,9 +27,7 @@ class NDTVIE(InfoExtractor):
     }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-
+        video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
         filename = self._search_regex(
diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py
index 6d20f755a89a244e41feb1da5e2416feeb351d7e..175b14583efbad65d9fbb1777d14dbb5576c1cc1 100644
--- a/youtube_dl/extractor/npo.py
+++ b/youtube_dl/extractor/npo.py
@@ -1,7 +1,5 @@
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
 from ..utils import (
     fix_xml_ampersands,
@@ -13,7 +11,16 @@ from ..utils import (
 )
 
 
-class NPOIE(InfoExtractor):
+class NPOBaseIE(InfoExtractor):
+    def _get_token(self, video_id):
+        token_page = self._download_webpage(
+            'http://ida.omroep.nl/npoplayer/i.js',
+            video_id, note='Downloading token')
+        return self._search_regex(
+            r'npoplayer\.token = "(.+?)"', token_page, 'token')
+
+
+class NPOIE(NPOBaseIE):
     IE_NAME = 'npo.nl'
     _VALID_URL = r'https?://www\.npo\.nl/[^/]+/[^/]+/(?P<id>[^/?]+)'
 
@@ -80,8 +87,7 @@ class NPOIE(InfoExtractor):
     ]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = self._match_id(url)
         return self._get_info(video_id)
 
     def _get_info(self, video_id):
@@ -91,12 +97,8 @@ class NPOIE(InfoExtractor):
             # We have to remove the javascript callback
             transform_source=strip_jsonp,
         )
-        token_page = self._download_webpage(
-            'http://ida.omroep.nl/npoplayer/i.js',
-            video_id,
-            note='Downloading token'
-        )
-        token = self._search_regex(r'npoplayer\.token = "(.+?)"', token_page, 'token')
+
+        token = self._get_token(video_id)
 
         formats = []
 
@@ -170,6 +172,83 @@ class NPOIE(InfoExtractor):
         }
 
 
+class NPOLiveIE(NPOBaseIE):
+    IE_NAME = 'npo.nl:live'
+    _VALID_URL = r'https?://www\.npo\.nl/live/(?P<id>.+)'
+
+    _TEST = {
+        'url': 'http://www.npo.nl/live/npo-1',
+        'info_dict': {
+            'id': 'LI_NEDERLAND1_136692',
+            'display_id': 'npo-1',
+            'ext': 'mp4',
+            'title': 're:^Nederland 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'description': 'Livestream',
+            'is_live': True,
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        live_id = self._search_regex(
+            r'data-prid="([^"]+)"', webpage, 'live id')
+
+        metadata = self._download_json(
+            'http://e.omroep.nl/metadata/%s' % live_id,
+            display_id, transform_source=strip_jsonp)
+
+        token = self._get_token(display_id)
+
+        formats = []
+
+        streams = metadata.get('streams')
+        if streams:
+            for stream in streams:
+                stream_type = stream.get('type', '').lower()
+                if stream_type == 'ss':
+                    continue
+                stream_info = self._download_json(
+                    'http://ida.omroep.nl/aapi/?stream=%s&token=%s&type=jsonp'
+                    % (stream.get('url'), token),
+                    display_id, 'Downloading %s JSON' % stream_type)
+                if stream_info.get('error_code', 0) or stream_info.get('errorcode', 0):
+                    continue
+                stream_url = self._download_json(
+                    stream_info['stream'], display_id,
+                    'Downloading %s URL' % stream_type,
+                    transform_source=strip_jsonp)
+                if stream_type == 'hds':
+                    f4m_formats = self._extract_f4m_formats(stream_url, display_id)
+                    # the f4m downloader only downloads a piece of a live stream
+                    for f4m_format in f4m_formats:
+                        f4m_format['preference'] = -1
+                    formats.extend(f4m_formats)
+                elif stream_type == 'hls':
+                    formats.extend(self._extract_m3u8_formats(stream_url, display_id, 'mp4'))
+                else:
+                    formats.append({
+                        'url': stream_url,
+                    })
+
+        self._sort_formats(formats)
+
+        return {
+            'id': live_id,
+            'display_id': display_id,
+            'title': self._live_title(metadata['titel']),
+            'description': metadata['info'],
+            'thumbnail': metadata.get('images', [{'url': None}])[-1]['url'],
+            'formats': formats,
+            'is_live': True,
+        }
+
+
 class TegenlichtVproIE(NPOIE):
     IE_NAME = 'tegenlicht.vpro.nl'
     _VALID_URL = r'https?://tegenlicht\.vpro\.nl/afleveringen/.*?'
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index 634142d0d27300eb82ea2f460fd2163a20208709..fb2032832e4757e328d016ab289e892721d73af2 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -10,6 +10,7 @@ from ..compat import (
     compat_urllib_request,
 )
 from ..utils import (
+    ExtractorError,
     str_to_int,
 )
 from ..aes import (
@@ -44,6 +45,15 @@ class PornHubIE(InfoExtractor):
         req.add_header('Cookie', 'age_verified=1')
         webpage = self._download_webpage(req, video_id)
 
+        error_msg = self._html_search_regex(
+            r'(?s)<div class="userMessageSection[^"]*".*?>(.*?)</div>',
+            webpage, 'error message', default=None)
+        if error_msg:
+            error_msg = re.sub(r'\s+', ' ', error_msg)
+            raise ExtractorError(
+                'PornHub said: %s' % error_msg,
+                expected=True, video_id=video_id)
+
         video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, 'title')
         video_uploader = self._html_search_regex(
             r'(?s)From:&nbsp;.+?<(?:a href="/users/|a href="/channels/|<span class="username)[^>]+>(.+?)<',
diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py
index baef3daa04f70ceea7200f5f810eba7fc9df052f..26f361c93990b6b92ff31d2447b70f7e08263d00 100644
--- a/youtube_dl/extractor/smotri.py
+++ b/youtube_dl/extractor/smotri.py
@@ -90,6 +90,20 @@ class SmotriIE(InfoExtractor):
             },
             'skip': 'Video is not approved by moderator',
         },
+        # not approved by moderator, but available
+        {
+            'url': 'http://smotri.com/video/view/?id=v28888533b73',
+            'md5': 'f44bc7adac90af518ef1ecf04893bb34',
+            'info_dict': {
+                'id': 'v28888533b73',
+                'ext': 'mp4',
+                'title': 'Russian Spies Killed By ISIL Child Soldier',
+                'uploader': 'Mopeder',
+                'uploader_id': 'mopeder',
+                'duration': 71,
+                'thumbnail': 'http://frame9.loadup.ru/d7/32/2888853.2.3.jpg',
+            },
+        },
         # swf player
         {
             'url': 'http://pics.smotri.com/scrubber_custom8.swf?file=v9188090500',
@@ -146,13 +160,16 @@ class SmotriIE(InfoExtractor):
 
         video = self._download_json(request, video_id, 'Downloading video JSON')
 
-        if video.get('_moderate_no') or not video.get('moderated'):
-            raise ExtractorError('Video %s has not been approved by moderator' % video_id, expected=True)
+        video_url = video.get('_vidURL') or video.get('_vidURL_mp4')
+
+        if not video_url:
+            if video.get('_moderate_no') or not video.get('moderated'):
+                raise ExtractorError(
+                    'Video %s has not been approved by moderator' % video_id, expected=True)
 
-        if video.get('error'):
-            raise ExtractorError('Video %s does not exist' % video_id, expected=True)
+            if video.get('error'):
+                raise ExtractorError('Video %s does not exist' % video_id, expected=True)
 
-        video_url = video.get('_vidURL') or video.get('_vidURL_mp4')
         title = video['title']
         thumbnail = video['_imgURL']
         upload_date = unified_strdate(video['added'])
diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py
index 1e55a9ffb5748b70969de11886c13720ff936be7..f345883c767438a91412e0619a993a70e3a21a92 100644
--- a/youtube_dl/extractor/spiegel.py
+++ b/youtube_dl/extractor/spiegel.py
@@ -4,7 +4,14 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_urlparse
+from ..compat import (
+    compat_urlparse,
+    compat_HTTPError,
+)
+from ..utils import (
+    HEADRequest,
+    ExtractorError,
+)
 from .spiegeltv import SpiegeltvIE
 
 
@@ -60,21 +67,31 @@ class SpiegelIE(InfoExtractor):
         xml_url = base_url + video_id + '.xml'
         idoc = self._download_xml(xml_url, video_id)
 
-        formats = [
-            {
-                'format_id': n.tag.rpartition('type')[2],
-                'url': base_url + n.find('./filename').text,
-                'width': int(n.find('./width').text),
-                'height': int(n.find('./height').text),
-                'abr': int(n.find('./audiobitrate').text),
-                'vbr': int(n.find('./videobitrate').text),
-                'vcodec': n.find('./codec').text,
-                'acodec': 'MP4A',
-            }
-            for n in list(idoc)
-            # Blacklist type 6, it's extremely LQ and not available on the same server
-            if n.tag.startswith('type') and n.tag != 'type6'
-        ]
+        formats = []
+        for n in list(idoc):
+            if n.tag.startswith('type') and n.tag != 'type6':
+                format_id = n.tag.rpartition('type')[2]
+                video_url = base_url + n.find('./filename').text
+                # Test video URLs beforehand as some of them are invalid
+                try:
+                    self._request_webpage(
+                        HEADRequest(video_url), video_id,
+                        'Checking %s video URL' % format_id)
+                except ExtractorError as e:
+                    if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
+                        self.report_warning(
+                            '%s video URL is invalid, skipping' % format_id, video_id)
+                        continue
+                formats.append({
+                    'format_id': format_id,
+                    'url': video_url,
+                    'width': int(n.find('./width').text),
+                    'height': int(n.find('./height').text),
+                    'abr': int(n.find('./audiobitrate').text),
+                    'vbr': int(n.find('./videobitrate').text),
+                    'vcodec': n.find('./codec').text,
+                    'acodec': 'MP4A',
+                })
         duration = float(idoc[0].findall('./duration')[0].text)
 
         self._sort_formats(formats)
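
Invalid format URLs are now weeded out up front with a cheap HEAD request, and only a 404 disqualifies a URL. A standalone sketch of the same idea (plain urllib, placeholder URLs):

```python
from urllib.error import HTTPError
from urllib.request import Request, urlopen

def reachable(url):
    """Return False only if the server answers the HEAD request with 404."""
    try:
        urlopen(Request(url, method='HEAD'))
    except HTTPError as e:
        if e.code == 404:
            return False
        # other HTTP errors leave the URL in, mirroring the extractor
    return True

candidates = ['http://example.com/video1.mp4', 'http://example.com/video2.mp4']
formats = [{'url': u} for u in candidates if reachable(u)]
```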
diff --git a/youtube_dl/extractor/streetvoice.py b/youtube_dl/extractor/streetvoice.py
new file mode 100644
index 0000000..6a57fa6
--- /dev/null
+++ b/youtube_dl/extractor/streetvoice.py
@@ -0,0 +1,51 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import unified_strdate
+
+
+class StreetVoiceIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:.+?\.)?streetvoice\.com/[^/]+/songs/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://streetvoice.com/skippylu/songs/94440/',
+        'md5': '15974627fc01a29e492c98593c2fd472',
+        'info_dict': {
+            'id': '94440',
+            'ext': 'mp3',
+            'filesize': 4167053,
+            'title': '輸',
+            'description': 'Crispy脆樂團 - 輸',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 260,
+            'upload_date': '20091018',
+            'uploader': 'Crispy脆樂團',
+            'uploader_id': '627810',
+        }
+    }, {
+        'url': 'http://tw.streetvoice.com/skippylu/songs/94440/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        song_id = self._match_id(url)
+
+        song = self._download_json(
+            'http://streetvoice.com/music/api/song/%s' % song_id, song_id)
+
+        title = song['name']
+        author = song['musician']['name']
+
+        return {
+            'id': song_id,
+            'url': song['file'],
+            'filesize': song.get('size'),
+            'title': title,
+            'description': '%s - %s' % (author, title),
+            'thumbnail': self._proto_relative_url(song.get('image'), 'http:'),
+            'duration': song.get('length'),
+            'upload_date': unified_strdate(song.get('created_at')),
+            'uploader': author,
+            'uploader_id': compat_str(song['musician']['id']),
+        }
diff --git a/youtube_dl/extractor/testtube.py b/youtube_dl/extractor/testtube.py
new file mode 100644
index 0000000..fd47e71
--- /dev/null
+++ b/youtube_dl/extractor/testtube.py
@@ -0,0 +1,60 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class TestTubeIE(InfoExtractor):
+    _VALID_URL = r'https?://testtube\.com/[^/?#]+/(?P<id>[^/?#]+)'
+    _TESTS = [{
+        'url': 'https://testtube.com/dnews/5-weird-ways-plants-can-eat-animals?utm_source=FB&utm_medium=DNews&utm_campaign=DNewsSocial',
+        'info_dict': {
+            'id': '60163',
+            'display_id': '5-weird-ways-plants-can-eat-animals',
+            'duration': 275,
+            'ext': 'mp4',
+            'title': '5 Weird Ways Plants Can Eat Animals',
+            'description': 'Why have some plants evolved to eat meat?',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'uploader': 'DNews',
+            'uploader_id': 'dnews',
+        },
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+        video_id = self._search_regex(
+            r"player\.loadRevision3Item\('video_id',\s*([0-9]+)\);",
+            webpage, 'video ID')
+
+        all_info = self._download_json(
+            'https://testtube.com/api/getPlaylist.json?api_key=ba9c741bce1b9d8e3defcc22193f3651b8867e62&codecs=h264,vp8,theora&video_id=%s' % video_id,
+            video_id)
+        info = all_info['items'][0]
+
+        formats = []
+        for vcodec, fdatas in info['media'].items():
+            for name, fdata in fdatas.items():
+                formats.append({
+                    'format_id': '%s-%s' % (vcodec, name),
+                    'url': fdata['url'],
+                    'vcodec': vcodec,
+                    'tbr': fdata.get('bitrate'),
+                })
+        self._sort_formats(formats)
+
+        duration = int_or_none(info.get('duration'))
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': info['title'],
+            'description': info.get('summary'),
+            'thumbnail': info.get('images', {}).get('large'),
+            'uploader': info.get('show', {}).get('name'),
+            'uploader_id': info.get('show', {}).get('slug'),
+            'duration': duration,
+            'formats': formats,
+        }
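
The media payload testtube.com returns is a two-level mapping (codec -> quality name -> file data), which the nested loop above flattens into youtube-dl's formats list. The flattening on its own, with dummy data shaped like the API response:

    media = {  # made-up stand-in for all_info['items'][0]['media']
        'h264': {
            'low': {'url': 'http://example.com/low.mp4', 'bitrate': 400},
            'high': {'url': 'http://example.com/high.mp4', 'bitrate': 1500},
        },
        'vp8': {
            'low': {'url': 'http://example.com/low.webm', 'bitrate': 350},
        },
    }

    formats = []
    for vcodec, fdatas in media.items():
        for name, fdata in fdatas.items():
            formats.append({
                'format_id': '%s-%s' % (vcodec, name),
                'url': fdata['url'],
                'vcodec': vcodec,
                'tbr': fdata.get('bitrate'),
            })

    # _sort_formats would then order these worst-to-best; here by tbr:
    formats.sort(key=lambda f: f['tbr'] or 0)
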
diff --git a/youtube_dl/extractor/tinypic.py b/youtube_dl/extractor/tinypic.py
index 4fe89dbe516f8e25eb1f84239bc9cbc9f26bd648..e036b8cdf1e6ca6ad4277a4c3d22e79361322703 100644 (file)
--- a/youtube_dl/extractor/tinypic.py
+++ b/youtube_dl/extractor/tinypic.py
@@ -9,17 +9,23 @@ from ..utils import ExtractorError
 class TinyPicIE(InfoExtractor):
     IE_NAME = 'tinypic'
     IE_DESC = 'tinypic.com videos'
-    _VALID_URL = r'http://tinypic\.com/player\.php\?v=(?P<id>[^&]+)&s=\d+'
-
-    _TEST = {
-        'url': 'http://tinypic.com/player.php?v=6xw7tc%3E&s=5#.UtqZmbRFCM8',
-        'md5': '609b74432465364e72727ebc6203f044',
-        'info_dict': {
-            'id': '6xw7tc',
-            'ext': 'flv',
-            'title': 'shadow phenomenon weird',
+    _VALID_URL = r'http://(?:.+?\.)?tinypic\.com/player\.php\?v=(?P<id>[^&]+)&s=\d+'
+
+    _TESTS = [
+        {
+            'url': 'http://tinypic.com/player.php?v=6xw7tc%3E&s=5#.UtqZmbRFCM8',
+            'md5': '609b74432465364e72727ebc6203f044',
+            'info_dict': {
+                'id': '6xw7tc',
+                'ext': 'flv',
+                'title': 'shadow phenomenon weird',
+            },
+        },
+        {
+            'url': 'http://de.tinypic.com/player.php?v=dy90yh&s=8',
+            'only_matching': True,
         }
-    }
+    ]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
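
The only functional change to TinyPicIE is the `(?:.+?\.)?` prefix, which lets localized subdomains such as de.tinypic.com through. Checking both test URLs against the new pattern:

    import re

    _VALID_URL = r'http://(?:.+?\.)?tinypic\.com/player\.php\?v=(?P<id>[^&]+)&s=\d+'

    for url in (
            'http://tinypic.com/player.php?v=6xw7tc%3E&s=5#.UtqZmbRFCM8',
            'http://de.tinypic.com/player.php?v=dy90yh&s=8'):
        m = re.match(_VALID_URL, url)
        print(url, '->', m.group('id') if m else 'no match')
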
diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py
index cc26f417a0c208efd946aa91c72086471d69ccaa..f57d609d43eecb13f3bb43ecc042107b5cad50bd 100644 (file)
--- a/youtube_dl/extractor/tvp.py
+++ b/youtube_dl/extractor/tvp.py
@@ -12,61 +12,59 @@ class TvpIE(InfoExtractor):
 
     _TESTS = [{
         'url': 'http://vod.tvp.pl/filmy-fabularne/filmy-za-darmo/ogniem-i-mieczem/wideo/odc-2/4278035',
+        'md5': 'cdd98303338b8a7f7abab5cd14092bf2',
         'info_dict': {
             'id': '4278035',
             'ext': 'wmv',
             'title': 'Ogniem i mieczem, odc. 2',
-            'description': 'Bohun dowiaduje się o złamaniu przez kniahinię danego mu słowa i wyrusza do Rozłogów. Helenie w ostatniej chwili udaje się uciec dzięki pomocy Zagłoby.',
         },
     }, {
         'url': 'http://vod.tvp.pl/seriale/obyczajowe/czas-honoru/sezon-1-1-13/i-seria-odc-13/194536',
+        'md5': '8aa518c15e5cc32dfe8db400dc921fbb',
         'info_dict': {
             'id': '194536',
             'ext': 'mp4',
             'title': 'Czas honoru, I seria – odc. 13',
-            #  'description': 'WŁADEK\nCzesław prosi Marię o dostarczenie Władkowi zarazki tyfusu. Jeśli zachoruje zostanie przewieziony do szpitala skąd łatwiej będzie go odbić. Czy matka zdecyduje się zarazić syna? Karol odwiedza Wandę przyznaje się, że ją oszukiwał, ale ostrzega też, że grozi jej aresztowanie i nalega, żeby wyjechała z Warszawy. Czy dziewczyna zdecyduje się znów oddalić od ukochanego? Rozpoczyna się akcja odbicia Władka.',
         },
     }, {
         'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176',
+        'md5': 'c3b15ed1af288131115ff17a17c19dda',
         'info_dict': {
             'id': '17916176',
             'ext': 'mp4',
             'title': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata',
         },
-        'params': {
-            # m3u8 download
-            'skip_download': 'true',
-        },
     }, {
         'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272',
+        'md5': 'c3b15ed1af288131115ff17a17c19dda',
         'info_dict': {
             'id': '17834272',
             'ext': 'mp4',
             'title': 'Na sygnale, odc. 39',
-            'description': 'Ekipa Wiktora ratuje młodą matkę, która spadła ze schodów trzymając na rękach noworodka. Okazuje się, że dziewczyna jest surogatką, a biologiczni rodzice dziecka próbują zmusić ją do oddania synka…',
-        },
-        'params': {
-            # m3u8 download
-            'skip_download': 'true',
         },
     }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
+
         webpage = self._download_webpage(
             'http://www.tvp.pl/sess/tvplayer.php?object_id=%s' % video_id, video_id)
 
-        title = self._og_search_title(webpage)
-        series = self._search_regex(
-            r'{name:\s*([\'"])SeriesTitle\1,\s*value:\s*\1(?P<series>.*?)\1},',
+        title = self._search_regex(
+            r'name\s*:\s*([\'"])Title\1\s*,\s*value\s*:\s*\1(?P<title>.+?)\1',
+            webpage, 'title', group='title')
+        series_title = self._search_regex(
+            r'name\s*:\s*([\'"])SeriesTitle\1\s*,\s*value\s*:\s*\1(?P<series>.+?)\1',
             webpage, 'series', group='series', default=None)
-        if series is not None and series not in title:
-            title = '%s, %s' % (series, title)
-        description = self._og_search_description(webpage, default=None)
+        if series_title:
+            title = '%s, %s' % (series_title, title)
+
+        thumbnail = self._search_regex(
+            r"poster\s*:\s*'([^']+)'", webpage, 'thumbnail', default=None)
 
         video_url = self._search_regex(
             r'0:{src:([\'"])(?P<url>.*?)\1', webpage, 'formats', group='url', default=None)
-        if video_url is None:
+        if not video_url:
             video_url = self._download_json(
                 'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id,
                 video_id)['video_url']
@@ -89,8 +87,7 @@ class TvpIE(InfoExtractor):
         return {
             'id': video_id,
             'title': title,
-            'thumbnail': self._og_search_thumbnail(webpage),
-            'description': description,
+            'thumbnail': thumbnail,
             'formats': formats,
         }
 
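The `([\'"])...\1` idiom the new Title/SeriesTitle regexes rely on matches a value wrapped in either quote style: the backreference \1 forces the closing quote to be the same character as the opening one. A quick demonstration against made-up player-config snippets:

    import re

    pattern = r'name\s*:\s*([\'"])Title\1\s*,\s*value\s*:\s*\1(?P<title>.+?)\1'

    for snippet in (
            "name: 'Title', value: 'Ogniem i mieczem, odc. 2'",
            'name: "Title", value: "Czas honoru"'):
        print(re.search(pattern, snippet).group('title'))
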
diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py
index b11a1d5610d0dffe0d98df7d7b05d4228552dfb7..340cadcf5f2a00f855d4cbb47c1b46ba09ec502a 100644 (file)
--- a/youtube_dl/extractor/twitch.py
+++ b/youtube_dl/extractor/twitch.py
@@ -3,9 +3,11 @@ from __future__ import unicode_literals
 
 import itertools
 import re
+import random
 
 from .common import InfoExtractor
 from ..compat import (
+    compat_str,
     compat_urllib_parse,
     compat_urllib_request,
 )
@@ -15,44 +17,12 @@ from ..utils import (
 )
 
 
-class TwitchIE(InfoExtractor):
-    # TODO: One broadcast may be split into multiple videos. The key
-    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
-    # starts at 1 and increases. Can we treat all parts as one video?
-    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?twitch\.tv/
-        (?:
-            (?P<channelid>[^/]+)|
-            (?:(?:[^/]+)/v/(?P<vodid>[^/]+))|
-            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
-            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
-        )
-        /?(?:\#.*)?$
-        """
-    _PAGE_LIMIT = 100
+class TwitchBaseIE(InfoExtractor):
+    _VALID_URL_BASE = r'https?://(?:www\.)?twitch\.tv'
+
     _API_BASE = 'https://api.twitch.tv'
+    _USHER_BASE = 'http://usher.twitch.tv'
     _LOGIN_URL = 'https://secure.twitch.tv/user/login'
-    _TESTS = [{
-        'url': 'http://www.twitch.tv/riotgames/b/577357806',
-        'info_dict': {
-            'id': 'a577357806',
-            'title': 'Worlds Semifinals - Star Horn Royal Club vs. OMG',
-        },
-        'playlist_mincount': 12,
-    }, {
-        'url': 'http://www.twitch.tv/acracingleague/c/5285812',
-        'info_dict': {
-            'id': 'c5285812',
-            'title': 'ACRL Off Season - Sports Cars @ Nordschleife',
-        },
-        'playlist_mincount': 3,
-    }, {
-        'url': 'http://www.twitch.tv/vanillatv',
-        'info_dict': {
-            'id': 'vanillatv',
-            'title': 'VanillaTV',
-        },
-        'playlist_mincount': 412,
-    }]
 
     def _handle_error(self, response):
         if not isinstance(response, dict):
@@ -64,34 +34,60 @@ class TwitchIE(InfoExtractor):
                 expected=True)
 
     def _download_json(self, url, video_id, note='Downloading JSON metadata'):
-        response = super(TwitchIE, self)._download_json(url, video_id, note)
+        response = super(TwitchBaseIE, self)._download_json(url, video_id, note)
         self._handle_error(response)
         return response
 
-    def _extract_media(self, item, item_id):
-        ITEMS = {
-            'a': 'video',
-            'v': 'vod',
-            'c': 'chapter',
+    def _real_initialize(self):
+        self._login()
+
+    def _login(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            return
+
+        login_page = self._download_webpage(
+            self._LOGIN_URL, None, 'Downloading login page')
+
+        authenticity_token = self._search_regex(
+            r'<input name="authenticity_token" type="hidden" value="([^"]+)"',
+            login_page, 'authenticity token')
+
+        login_form = {
+            'utf8': '✓'.encode('utf-8'),
+            'authenticity_token': authenticity_token,
+            'redirect_on_login': '',
+            'embed_form': 'false',
+            'mp_source_action': '',
+            'follow': '',
+            'user[login]': username,
+            'user[password]': password,
         }
-        info = self._extract_info(self._download_json(
+
+        request = compat_urllib_request.Request(
+            self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
+        request.add_header('Referer', self._LOGIN_URL)
+        response = self._download_webpage(
+            request, None, 'Logging in as %s' % username)
+
+        m = re.search(
+            r"id=([\"'])login_error_message\1[^>]*>(?P<msg>[^<]+)", response)
+        if m:
+            raise ExtractorError(
+                'Unable to login: %s' % m.group('msg').strip(), expected=True)
+
+
+class TwitchItemBaseIE(TwitchBaseIE):
+    def _download_info(self, item, item_id):
+        return self._extract_info(self._download_json(
             '%s/kraken/videos/%s%s' % (self._API_BASE, item, item_id), item_id,
-            'Downloading %s info JSON' % ITEMS[item]))
-
-        if item == 'v':
-            access_token = self._download_json(
-                '%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id,
-                'Downloading %s access token' % ITEMS[item])
-            formats = self._extract_m3u8_formats(
-                'http://usher.twitch.tv/vod/%s?nauth=%s&nauthsig=%s'
-                % (item_id, access_token['token'], access_token['sig']),
-                item_id, 'mp4')
-            info['formats'] = formats
-            return info
+            'Downloading %s info JSON' % self._ITEM_TYPE))
 
+    def _extract_media(self, item_id):
+        info = self._download_info(self._ITEM_SHORTCUT, item_id)
         response = self._download_json(
-            '%s/api/videos/%s%s' % (self._API_BASE, item, item_id), item_id,
-            'Downloading %s playlist JSON' % ITEMS[item])
+            '%s/api/videos/%s%s' % (self._API_BASE, self._ITEM_SHORTCUT, item_id), item_id,
+            'Downloading %s playlist JSON' % self._ITEM_TYPE)
         entries = []
         chunks = response['chunks']
         qualities = list(chunks.keys())
@@ -129,119 +125,227 @@ class TwitchIE(InfoExtractor):
             'view_count': info['views'],
         }
 
-    def _real_initialize(self):
-        self._login()
+    def _real_extract(self, url):
+        return self._extract_media(self._match_id(url))
 
-    def _login(self):
-        (username, password) = self._get_login_info()
-        if username is None:
-            return
 
-        login_page = self._download_webpage(
-            self._LOGIN_URL, None, 'Downloading login page')
+class TwitchVideoIE(TwitchItemBaseIE):
+    IE_NAME = 'twitch:video'
+    _VALID_URL = r'%s/[^/]+/b/(?P<id>[^/]+)' % TwitchBaseIE._VALID_URL_BASE
+    _ITEM_TYPE = 'video'
+    _ITEM_SHORTCUT = 'a'
 
-        authenticity_token = self._search_regex(
-            r'<input name="authenticity_token" type="hidden" value="([^"]+)"',
-            login_page, 'authenticity token')
+    _TEST = {
+        'url': 'http://www.twitch.tv/riotgames/b/577357806',
+        'info_dict': {
+            'id': 'a577357806',
+            'title': 'Worlds Semifinals - Star Horn Royal Club vs. OMG',
+        },
+        'playlist_mincount': 12,
+    }
 
-        login_form = {
-            'utf8': '✓'.encode('utf-8'),
-            'authenticity_token': authenticity_token,
-            'redirect_on_login': '',
-            'embed_form': 'false',
-            'mp_source_action': '',
-            'follow': '',
-            'user[login]': username,
-            'user[password]': password,
-        }
 
-        request = compat_urllib_request.Request(
-            self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
-        request.add_header('Referer', self._LOGIN_URL)
-        response = self._download_webpage(
-            request, None, 'Logging in as %s' % username)
+class TwitchChapterIE(TwitchItemBaseIE):
+    IE_NAME = 'twitch:chapter'
+    _VALID_URL = r'%s/[^/]+/c/(?P<id>[^/]+)' % TwitchBaseIE._VALID_URL_BASE
+    _ITEM_TYPE = 'chapter'
+    _ITEM_SHORTCUT = 'c'
 
-        m = re.search(
-            r"id=([\"'])login_error_message\1[^>]*>(?P<msg>[^<]+)", response)
-        if m:
-            raise ExtractorError(
-                'Unable to login: %s' % m.group('msg').strip(), expected=True)
+    _TESTS = [{
+        'url': 'http://www.twitch.tv/acracingleague/c/5285812',
+        'info_dict': {
+            'id': 'c5285812',
+            'title': 'ACRL Off Season - Sports Cars @ Nordschleife',
+        },
+        'playlist_mincount': 3,
+    }, {
+        'url': 'http://www.twitch.tv/tsm_theoddone/c/2349361',
+        'only_matching': True,
+    }]
+
+
+class TwitchVodIE(TwitchItemBaseIE):
+    IE_NAME = 'twitch:vod'
+    _VALID_URL = r'%s/[^/]+/v/(?P<id>[^/]+)' % TwitchBaseIE._VALID_URL_BASE
+    _ITEM_TYPE = 'vod'
+    _ITEM_SHORTCUT = 'v'
+
+    _TEST = {
+        'url': 'http://www.twitch.tv/ksptv/v/3622000',
+        'info_dict': {
+            'id': 'v3622000',
+            'ext': 'mp4',
+            'title': '''KSPTV: Squadcast: "Everyone's on vacation so here's Dahud" Edition!''',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 6951,
+            'timestamp': 1419028564,
+            'upload_date': '20141219',
+            'uploader': 'KSPTV',
+            'uploader_id': 'ksptv',
+            'view_count': int,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        item_id = self._match_id(url)
+        info = self._download_info(self._ITEM_SHORTCUT, item_id)
+        access_token = self._download_json(
+            '%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id,
+            'Downloading %s access token' % self._ITEM_TYPE)
+        formats = self._extract_m3u8_formats(
+            '%s/vod/%s?nauth=%s&nauthsig=%s'
+            % (self._USHER_BASE, item_id, access_token['token'], access_token['sig']),
+            item_id, 'mp4')
+        info['formats'] = formats
+        return info
+
+
+class TwitchPlaylistBaseIE(TwitchBaseIE):
+    _PLAYLIST_URL = '%s/kraken/channels/%%s/videos/?offset=%%d&limit=%%d' % TwitchBaseIE._API_BASE
+    _PAGE_LIMIT = 100
+
+    def _extract_playlist(self, channel_id):
+        info = self._download_json(
+            '%s/kraken/channels/%s' % (self._API_BASE, channel_id),
+            channel_id, 'Downloading channel info JSON')
+        channel_name = info.get('display_name') or info.get('name')
+        entries = []
+        offset = 0
+        limit = self._PAGE_LIMIT
+        for counter in itertools.count(1):
+            response = self._download_json(
+                self._PLAYLIST_URL % (channel_id, offset, limit),
+                channel_id, 'Downloading %s videos JSON page %d' % (self._PLAYLIST_TYPE, counter))
+            videos = response['videos']
+            if not videos:
+                break
+            entries.extend([self.url_result(video['url']) for video in videos])
+            offset += limit
+        return self.playlist_result(entries, channel_id, channel_name)
+
+    def _real_extract(self, url):
+        return self._extract_playlist(self._match_id(url))
+
+
+class TwitchProfileIE(TwitchPlaylistBaseIE):
+    IE_NAME = 'twitch:profile'
+    _VALID_URL = r'%s/(?P<id>[^/]+)/profile/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE
+    _PLAYLIST_TYPE = 'profile'
+
+    _TEST = {
+        'url': 'http://www.twitch.tv/vanillatv/profile',
+        'info_dict': {
+            'id': 'vanillatv',
+            'title': 'VanillaTV',
+        },
+        'playlist_mincount': 412,
+    }
+
+
+class TwitchPastBroadcastsIE(TwitchPlaylistBaseIE):
+    IE_NAME = 'twitch:past_broadcasts'
+    _VALID_URL = r'%s/(?P<id>[^/]+)/profile/past_broadcasts/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE
+    _PLAYLIST_URL = TwitchPlaylistBaseIE._PLAYLIST_URL + '&broadcasts=true'
+    _PLAYLIST_TYPE = 'past broadcasts'
+
+    _TEST = {
+        'url': 'http://www.twitch.tv/spamfish/profile/past_broadcasts',
+        'info_dict': {
+            'id': 'spamfish',
+            'title': 'Spamfish',
+        },
+        'playlist_mincount': 54,
+    }
+
+
+class TwitchStreamIE(TwitchBaseIE):
+    IE_NAME = 'twitch:stream'
+    _VALID_URL = r'%s/(?P<id>[^/]+)/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE
+
+    _TEST = {
+        'url': 'http://www.twitch.tv/shroomztv',
+        'info_dict': {
+            'id': '12772022048',
+            'display_id': 'shroomztv',
+            'ext': 'mp4',
+            'title': 're:^ShroomzTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'description': 'H1Z1 - lonewolfing with ShroomzTV | A3 Battle Royale later - @ShroomzTV',
+            'is_live': True,
+            'timestamp': 1421928037,
+            'upload_date': '20150122',
+            'uploader': 'ShroomzTV',
+            'uploader_id': 'shroomztv',
+            'view_count': int,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        if mobj.group('chapterid'):
-            return self._extract_media('c', mobj.group('chapterid'))
+        channel_id = self._match_id(url)
+
+        stream = self._download_json(
+            '%s/kraken/streams/%s' % (self._API_BASE, channel_id), channel_id,
+            'Downloading stream JSON').get('stream')
+
+        # Fallback on profile extraction if stream is offline
+        if not stream:
+            return self.url_result(
+                'http://www.twitch.tv/%s/profile' % channel_id,
+                'TwitchProfile', channel_id)
 
-            """
-            webpage = self._download_webpage(url, chapter_id)
-            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
+        access_token = self._download_json(
+            '%s/api/channels/%s/access_token' % (self._API_BASE, channel_id), channel_id,
+            'Downloading channel access token')
+
+        query = {
+            'allow_source': 'true',
+            'p': random.randint(1000000, 10000000),
+            'player': 'twitchweb',
+            'segment_preference': '4',
+            'sig': access_token['sig'],
+            'token': access_token['token'],
+        }
+
+        formats = self._extract_m3u8_formats(
+            '%s/api/channel/hls/%s.m3u8?%s'
+            % (self._USHER_BASE, channel_id, compat_urllib_parse.urlencode(query)),
+            channel_id, 'mp4')
+
+        view_count = stream.get('viewers')
+        timestamp = parse_iso8601(stream.get('created_at'))
+
+        channel = stream['channel']
+        title = self._live_title(channel.get('display_name') or channel.get('name'))
+        description = channel.get('status')
+
+        thumbnails = []
+        for thumbnail_key, thumbnail_url in stream['preview'].items():
+            m = re.search(r'(?P<width>\d+)x(?P<height>\d+)\.jpg$', thumbnail_key)
             if not m:
-                raise ExtractorError('Cannot find archive of a chapter')
-            archive_id = m.group(1)
-
-            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
-            doc = self._download_xml(
-                api, chapter_id,
-                note='Downloading chapter information',
-                errnote='Chapter information download failed')
-            for a in doc.findall('.//archive'):
-                if archive_id == a.find('./id').text:
-                    break
-            else:
-                raise ExtractorError('Could not find chapter in chapter information')
-
-            video_url = a.find('./video_file_url').text
-            video_ext = video_url.rpartition('.')[2] or 'flv'
-
-            chapter_api_url = 'https://api.twitch.tv/kraken/videos/c' + chapter_id
-            chapter_info = self._download_json(
-                chapter_api_url, 'c' + chapter_id,
-                note='Downloading chapter metadata',
-                errnote='Download of chapter metadata failed')
-
-            bracket_start = int(doc.find('.//bracket_start').text)
-            bracket_end = int(doc.find('.//bracket_end').text)
-
-            # TODO determine start (and probably fix up file)
-            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
-            #video_url += '?start=' + TODO:start_timestamp
-            # bracket_start is 13290, but we want 51670615
-            self._downloader.report_warning('Chapter detected, but we can just download the whole file. '
-                                            'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
-
-            info = {
-                'id': 'c' + chapter_id,
-                'url': video_url,
-                'ext': video_ext,
-                'title': chapter_info['title'],
-                'thumbnail': chapter_info['preview'],
-                'description': chapter_info['description'],
-                'uploader': chapter_info['channel']['display_name'],
-                'uploader_id': chapter_info['channel']['name'],
-            }
-            return info
-            """
-        elif mobj.group('videoid'):
-            return self._extract_media('a', mobj.group('videoid'))
-        elif mobj.group('vodid'):
-            return self._extract_media('v', mobj.group('vodid'))
-        elif mobj.group('channelid'):
-            channel_id = mobj.group('channelid')
-            info = self._download_json(
-                '%s/kraken/channels/%s' % (self._API_BASE, channel_id),
-                channel_id, 'Downloading channel info JSON')
-            channel_name = info.get('display_name') or info.get('name')
-            entries = []
-            offset = 0
-            limit = self._PAGE_LIMIT
-            for counter in itertools.count(1):
-                response = self._download_json(
-                    '%s/kraken/channels/%s/videos/?offset=%d&limit=%d'
-                    % (self._API_BASE, channel_id, offset, limit),
-                    channel_id, 'Downloading channel videos JSON page %d' % counter)
-                videos = response['videos']
-                if not videos:
-                    break
-                entries.extend([self.url_result(video['url'], 'Twitch') for video in videos])
-                offset += limit
-            return self.playlist_result(entries, channel_id, channel_name)
+                continue
+            thumbnails.append({
+                'url': thumbnail_url,
+                'width': int(m.group('width')),
+                'height': int(m.group('height')),
+            })
+
+        return {
+            'id': compat_str(stream['_id']),
+            'display_id': channel_id,
+            'title': title,
+            'description': description,
+            'thumbnails': thumbnails,
+            'uploader': channel.get('display_name'),
+            'uploader_id': channel.get('name'),
+            'timestamp': timestamp,
+            'view_count': view_count,
+            'formats': formats,
+            'is_live': True,
+        }
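
Of the pieces factored out above, TwitchPlaylistBaseIE's pagination loop is the most reusable: it walks the kraken API in fixed offset/limit windows until an empty page signals the end. The loop in isolation, with a stubbed fetch standing in for _download_json (stub data is made up):

    import itertools

    PAGE_LIMIT = 100

    def fetch_page(offset, limit):
        # stand-in for _download_json on /kraken/channels/<id>/videos
        all_videos = [{'url': 'http://example.com/v/%d' % i} for i in range(250)]
        return {'videos': all_videos[offset:offset + limit]}

    entries = []
    offset = 0
    for counter in itertools.count(1):
        videos = fetch_page(offset, PAGE_LIMIT)['videos']
        if not videos:
            break
        entries.extend(videos)
        offset += PAGE_LIMIT

    print('%d videos over %d page requests' % (len(entries), counter))
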
diff --git a/youtube_dl/extractor/videomega.py b/youtube_dl/extractor/videomega.py
index 7a78f0d264a47fc85c53cd815242e629ee38ed30..fc6e05fe0a2d60f42b1f5eef6d2d2e90994f86b7 100644 (file)
--- a/youtube_dl/extractor/videomega.py
+++ b/youtube_dl/extractor/videomega.py
@@ -1,11 +1,15 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import re
+
 from .common import InfoExtractor
 from ..compat import (
     compat_urllib_parse,
+    compat_urllib_request,
 )
 from ..utils import (
+    ExtractorError,
     remove_start,
 )
 
@@ -16,34 +20,40 @@ class VideoMegaIE(InfoExtractor):
         (?:iframe\.php)?\?ref=(?P<id>[A-Za-z0-9]+)
         '''
     _TEST = {
-        'url': 'http://videomega.tv/?ref=GKeGPVedBe',
-        'md5': '240fb5bcf9199961f48eb17839b084d6',
+        'url': 'http://videomega.tv/?ref=QR0HCUHI1661IHUCH0RQ',
+        'md5': 'bf5c2f95c4c917536e80936af7bc51e1',
         'info_dict': {
-            'id': 'GKeGPVedBe',
+            'id': 'QR0HCUHI1661IHUCH0RQ',
             'ext': 'mp4',
-            'title': 'XXL - All Sports United',
+            'title': 'Big Buck Bunny',
             'thumbnail': 're:^https?://.*\.jpg$',
         }
     }
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        url = 'http://videomega.tv/iframe.php?ref={0:}'.format(video_id)
-        webpage = self._download_webpage(url, video_id)
 
-        escaped_data = self._search_regex(
-            r'unescape\("([^"]+)"\)', webpage, 'escaped data')
+        iframe_url = 'http://videomega.tv/iframe.php?ref={0:}'.format(video_id)
+        req = compat_urllib_request.Request(iframe_url)
+        req.add_header('Referer', url)
+        webpage = self._download_webpage(req, video_id)
+
+        try:
+            escaped_data = re.findall(r'unescape\("([^"]+)"\)', webpage)[-1]
+        except IndexError:
+            raise ExtractorError('Unable to extract escaped data')
+
         playlist = compat_urllib_parse.unquote(escaped_data)
 
         thumbnail = self._search_regex(
             r'image:\s*"([^"]+)"', playlist, 'thumbnail', fatal=False)
-        url = self._search_regex(r'file:\s*"([^"]+)"', playlist, 'URL')
+        video_url = self._search_regex(r'file:\s*"([^"]+)"', playlist, 'URL')
         title = remove_start(self._html_search_regex(
             r'<title>(.*?)</title>', webpage, 'title'), 'VideoMega.tv - ')
 
         formats = [{
             'format_id': 'sd',
-            'url': url,
+            'url': video_url,
         }]
         self._sort_formats(formats)
 
@@ -52,4 +62,5 @@ class VideoMegaIE(InfoExtractor):
             'title': title,
             'formats': formats,
             'thumbnail': thumbnail,
+            'http_referer': iframe_url,
         }
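
Two behavioral changes here: the iframe is now fetched with a Referer header (videomega appears to reject referer-less requests), and the last unescape() blob on the page is taken rather than the first. The Referer pattern on its own, written against Python 3's urllib for brevity (youtube-dl itself goes through compat_urllib_request so the same code runs on Python 2):

    import urllib.request

    video_id = 'QR0HCUHI1661IHUCH0RQ'
    iframe_url = 'http://videomega.tv/iframe.php?ref=%s' % video_id
    req = urllib.request.Request(iframe_url)
    req.add_header('Referer', 'http://videomega.tv/?ref=%s' % video_id)
    webpage = urllib.request.urlopen(req).read().decode('utf-8', 'replace')
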
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index c7611a3a09de729b56ed3a82e5f00fa12ac2167b..2a1f8be0a7e6d6946b5e1801a4b3cd6dda338efe 100644 (file)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -394,6 +394,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                 'format': '141',
             },
         },
+        # JS player signature function name containing $
+        {
+            'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
+            'info_dict': {
+                'id': 'nfWlot6h_JM',
+                'ext': 'm4a',
+                'title': 'Taylor Swift - Shake It Off',
+                'description': 'md5:2acfda1b285bdd478ccec22f9918199d',
+                'uploader': 'TaylorSwiftVEVO',
+                'uploader_id': 'TaylorSwiftVEVO',
+                'upload_date': '20140818',
+            },
+            'params': {
+                'youtube_include_dash_manifest': True,
+                'format': '141',
+            },
+        },
         # Controversy video
         {
             'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
@@ -588,7 +605,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
 
     def _parse_sig_js(self, jscode):
         funcname = self._search_regex(
-            r'\.sig\|\|([a-zA-Z0-9]+)\(', jscode,
+            r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode,
             'Initial JS player signature function name')
 
         jsi = JSInterpreter(jscode)
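
The single added character (`$` in the character class) matters because YouTube's obfuscated player can name the signature function something like `mt$`; the old `[a-zA-Z0-9]+` stopped one character short and the lookup failed. Against a made-up fragment of player JS:

    import re

    jscode = 'var c=a.sig||mt$(b.s);d.set("signature",c);'

    funcname = re.search(r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode).group(1)
    print(funcname)  # -> mt$
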
diff --git a/youtube_dl/options.py b/youtube_dl/options.py
index a30974efd382511b652397c475371b368f86f85b..fd7b400b2a67204c2378303fbf33d2d0a3470993 100644 (file)
--- a/youtube_dl/options.py
+++ b/youtube_dl/options.py
@@ -264,7 +264,7 @@ def parseOpts(overrideArguments=None):
     authentication.add_option(
         '-p', '--password',
         dest='password', metavar='PASSWORD',
-        help='account password')
+        help='account password. If this option is left out, youtube-dl will ask interactively.')
     authentication.add_option(
         '-2', '--twofactor',
         dest='twofactor', metavar='TWOFACTOR',
@@ -289,6 +289,17 @@ def parseOpts(overrideArguments=None):
             'extensions aac, m4a, mp3, mp4, ogg, wav, webm. '
             'You can also use the special names "best",'
             ' "bestvideo", "bestaudio", "worst". '
+            ' You can filter the video results by putting a condition in'
+            ' brackets, as in -f "best[height=720]"'
+            ' (or -f "[filesize>10M]"). '
+            ' This works for filesize, height, width, tbr, abr, and vbr'
+            ' and the comparisons <, <=, >, >=, =, != .'
+            ' Formats for which the value is not known are excluded unless you'
+            ' put a question mark (?) after the operator.'
+            ' You can combine format filters, so  '
+            '-f "[height <=? 720][tbr>500]" '
+            'selects up to 720p videos (or videos where the height is not '
+            'known) with a bitrate of at least 500 KBit/s.'
             ' By default, youtube-dl will pick the best quality.'
             ' Use commas to download multiple audio formats, such as'
             ' -f  136/137/mp4/bestvideo,140/m4a/bestaudio.'
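
The same filter syntax is available programmatically through the `format` option; a minimal sketch (the URL is youtube-dl's usual test clip):

    import youtube_dl

    opts = {
        # up to 720p (or unknown height) with a total bitrate over 500 KBit/s
        'format': 'best[height<=?720][tbr>500]',
    }
    youtube_dl.YoutubeDL(opts).download(
        ['https://www.youtube.com/watch?v=BaW_jenozKc'])
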
diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py
index 26b99e43c33bd4bcebea171a9df71969273a1cc8..faccdc43dfb4e0adf21bdff62149a861521f8fed 100644 (file)
--- a/youtube_dl/postprocessor/ffmpeg.py
+++ b/youtube_dl/postprocessor/ffmpeg.py
@@ -475,15 +475,21 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
         filename = information['filepath']
         input_files = [filename] + [subtitles_filename(filename, lang, self._subformat) for lang in sub_langs]
 
-        opts = ['-map', '0:0', '-map', '0:1', '-c:v', 'copy', '-c:a', 'copy']
+        opts = [
+            '-map', '0',
+            '-c', 'copy',
+            # Don't copy the existing subtitles, we may be running the
+            # postprocessor a second time
+            '-map', '-0:s',
+            '-c:s', 'mov_text',
+        ]
         for (i, lang) in enumerate(sub_langs):
-            opts.extend(['-map', '%d:0' % (i + 1), '-c:s:%d' % i, 'mov_text'])
+            opts.extend(['-map', '%d:0' % (i + 1)])
             lang_code = self._conver_lang_code(lang)
             if lang_code is not None:
                 opts.extend(['-metadata:s:s:%d' % i, 'language=%s' % lang_code])
-        opts.extend(['-f', 'mp4'])
 
-        temp_filename = filename + '.temp'
+        temp_filename = prepend_extension(filename, 'temp')
         self._downloader.to_screen('[ffmpeg] Embedding subtitles in \'%s\'' % filename)
         self.run_ffmpeg_multiple_files(input_files, temp_filename, opts)
         os.remove(encodeFilename(filename))
@@ -503,6 +509,10 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
             metadata['artist'] = info['uploader']
         elif info.get('uploader_id') is not None:
             metadata['artist'] = info['uploader_id']
+        if info.get('description') is not None:
+            metadata['description'] = info['description']
+        if info.get('webpage_url') is not None:
+            metadata['comment'] = info['webpage_url']
 
         if not metadata:
             self._downloader.to_screen('[ffmpeg] There isn\'t any metadata to add')
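
The reworked option list maps every stream of the source file, drops any subtitle streams already present (so re-running the postprocessor doesn't stack duplicates), then adds one mov_text stream per downloaded subtitle. Assembling the arguments for two languages as a dry sketch (no ffmpeg invoked; the real code additionally converts 'en' to 'eng' etc. via _conver_lang_code):

    sub_langs = ['en', 'de']

    opts = [
        '-map', '0',
        '-c', 'copy',
        '-map', '-0:s',      # drop subtitles already embedded in input 0
        '-c:s', 'mov_text',
    ]
    for i, lang in enumerate(sub_langs):
        opts.extend(['-map', '%d:0' % (i + 1)])
        opts.extend(['-metadata:s:s:%d' % i, 'language=%s' % lang])

    print(' '.join(opts))
    # -map 0 -c copy -map -0:s -c:s mov_text
    #   -map 1:0 -metadata:s:s:0 language=en -map 2:0 -metadata:s:s:1 language=de
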
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index daf94abd1d541d77e8b74f0f17e6fde780b054e8..3536a5bd6f3216a62eaaec2ba11a08893c5e62ab 100644 (file)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -399,8 +399,9 @@ def formatSeconds(secs):
 def make_HTTPS_handler(params, **kwargs):
     opts_no_check_certificate = params.get('nocheckcertificate', False)
     if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
-        context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
+        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
         if opts_no_check_certificate:
+            context.check_hostname = False
             context.verify_mode = ssl.CERT_NONE
         try:
             return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
@@ -723,6 +724,7 @@ def unified_strdate(date_str, day_first=True):
         '%b %dst %Y %I:%M%p',
         '%b %dnd %Y %I:%M%p',
         '%b %dth %Y %I:%M%p',
+        '%Y %m %d',
         '%Y-%m-%d',
         '%Y/%m/%d',
         '%Y/%m/%d %H:%M:%S',
@@ -1277,7 +1279,7 @@ def parse_duration(s):
     s = s.strip()
 
     m = re.match(
-        r'''(?ix)T?
+        r'''(?ix)(?:P?T)?
         (?:
             (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
             (?P<only_hours>[0-9.]+)\s*(?:hours?)|
@@ -1612,6 +1614,14 @@ def urlhandle_detect_ext(url_handle):
     except AttributeError:  # Python < 3
         getheader = url_handle.info().getheader
 
+    cd = getheader('Content-Disposition')
+    if cd:
+        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
+        if m:
+            e = determine_ext(m.group('filename'), default_ext=None)
+            if e:
+                return e
+
     return getheader('Content-Type').split("/")[1]
 
 
@@ -1623,3 +1633,23 @@ def age_restricted(content_limit, age_limit):
     if content_limit is None:
         return False  # Content available for everyone
     return age_limit < content_limit
+
+
+def is_html(first_bytes):
+    """ Detect whether a file contains HTML by examining its first bytes. """
+
+    BOMS = [
+        (b'\xef\xbb\xbf', 'utf-8'),
+        (b'\x00\x00\xfe\xff', 'utf-32-be'),
+        (b'\xff\xfe\x00\x00', 'utf-32-le'),
+        (b'\xff\xfe', 'utf-16-le'),
+        (b'\xfe\xff', 'utf-16-be'),
+    ]
+    for bom, enc in BOMS:
+        if first_bytes.startswith(bom):
+            s = first_bytes[len(bom):].decode(enc, 'replace')
+            break
+    else:
+        s = first_bytes.decode('utf-8', 'replace')
+
+    return re.match(r'^\s*<', s)
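
is_html() checks for a BOM before the leading-`<` test so UTF-16/32 pages aren't misread: without the decode step, a UTF-16 document's NUL bytes would hide the `<`. A quick check (the last input is an MPEG program-stream header, i.e. not HTML):

    from youtube_dl.utils import is_html

    print(bool(is_html(b'<!DOCTYPE html><html>')))           # True
    print(bool(is_html(b'\xef\xbb\xbf <html>')))             # True, UTF-8 BOM
    print(bool(is_html('\ufeff<p>'.encode('utf-16-le'))))    # True
    print(bool(is_html(b'\x00\x00\x01\xba...')))             # False
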
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index 0a184ad0870143f446837226444f63ba2b429316..28458fd69ee0689ebdca517b23c420d83c4316f1 100644 (file)
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '2015.01.11'
+__version__ = '2015.01.23.1'