Merge branch 'dramafever' of https://github.com/ping/youtube-dl into ping-dramafever
author Sergey M․ <dstftw@gmail.com>
Sat, 13 Jun 2015 22:56:54 +0000 (04:56 +0600)
committer Sergey M․ <dstftw@gmail.com>
Sat, 13 Jun 2015 22:56:54 +0000 (04:56 +0600)
90 files changed:
AUTHORS
README.md
docs/supportedsites.md
test/test_aes.py
test/test_subtitles.py
youtube_dl/YoutubeDL.py
youtube_dl/aes.py
youtube_dl/extractor/__init__.py
youtube_dl/extractor/aftonbladet.py
youtube_dl/extractor/arte.py
youtube_dl/extractor/bilibili.py
youtube_dl/extractor/brightcove.py
youtube_dl/extractor/cbs.py
youtube_dl/extractor/chilloutzone.py
youtube_dl/extractor/cinemassacre.py
youtube_dl/extractor/cnet.py
youtube_dl/extractor/cnn.py
youtube_dl/extractor/common.py
youtube_dl/extractor/crunchyroll.py
youtube_dl/extractor/dailymotion.py
youtube_dl/extractor/discovery.py
youtube_dl/extractor/drtv.py
youtube_dl/extractor/empflix.py
youtube_dl/extractor/facebook.py
youtube_dl/extractor/firedrive.py [deleted file]
youtube_dl/extractor/fivetv.py [new file with mode: 0644]
youtube_dl/extractor/generic.py
youtube_dl/extractor/imgur.py
youtube_dl/extractor/instagram.py
youtube_dl/extractor/iprima.py
youtube_dl/extractor/iqiyi.py [new file with mode: 0644]
youtube_dl/extractor/izlesene.py
youtube_dl/extractor/karrierevideos.py [new file with mode: 0644]
youtube_dl/extractor/kickstarter.py
youtube_dl/extractor/letv.py
youtube_dl/extractor/mitele.py
youtube_dl/extractor/naver.py
youtube_dl/extractor/nba.py
youtube_dl/extractor/nextmedia.py
youtube_dl/extractor/nfl.py
youtube_dl/extractor/noco.py
youtube_dl/extractor/nova.py [new file with mode: 0644]
youtube_dl/extractor/nowtv.py [new file with mode: 0644]
youtube_dl/extractor/odnoklassniki.py
youtube_dl/extractor/patreon.py
youtube_dl/extractor/porn91.py [new file with mode: 0644]
youtube_dl/extractor/pornhub.py
youtube_dl/extractor/pornovoisines.py
youtube_dl/extractor/prosiebensat1.py
youtube_dl/extractor/qqmusic.py
youtube_dl/extractor/rtbf.py
youtube_dl/extractor/rtlnl.py
youtube_dl/extractor/rtlnow.py [deleted file]
youtube_dl/extractor/rtve.py
youtube_dl/extractor/rutv.py
youtube_dl/extractor/ruutu.py [new file with mode: 0644]
youtube_dl/extractor/senateisvp.py
youtube_dl/extractor/shared.py
youtube_dl/extractor/sockshare.py [deleted file]
youtube_dl/extractor/sohu.py
youtube_dl/extractor/soompi.py [new file with mode: 0644]
youtube_dl/extractor/spankwire.py
youtube_dl/extractor/spiegeltv.py
youtube_dl/extractor/sportbox.py
youtube_dl/extractor/sunporno.py
youtube_dl/extractor/teamcoco.py
youtube_dl/extractor/telecinco.py
youtube_dl/extractor/tenplay.py
youtube_dl/extractor/tf1.py
youtube_dl/extractor/theplatform.py
youtube_dl/extractor/tlc.py
youtube_dl/extractor/tnaflix.py
youtube_dl/extractor/tube8.py
youtube_dl/extractor/tubitv.py [new file with mode: 0644]
youtube_dl/extractor/tumblr.py
youtube_dl/extractor/turbo.py
youtube_dl/extractor/tutv.py
youtube_dl/extractor/tvc.py [new file with mode: 0644]
youtube_dl/extractor/tvigle.py
youtube_dl/extractor/twentyfourvideo.py
youtube_dl/extractor/vgtv.py
youtube_dl/extractor/videott.py
youtube_dl/extractor/vidme.py
youtube_dl/extractor/viki.py
youtube_dl/extractor/vk.py
youtube_dl/extractor/youtube.py
youtube_dl/options.py
youtube_dl/postprocessor/embedthumbnail.py
youtube_dl/update.py
youtube_dl/version.py

diff --git a/AUTHORS b/AUTHORS
index 267b8da1e6ffbda7853d53996de4020111074834..bf2a25cb8a804f9d0d500816c126480eb5083b58 100644 (file)
--- a/AUTHORS
+++ b/AUTHORS
@@ -124,3 +124,6 @@ Mohammad Teimori Pabandi
 Roman Le Négrate
 Matthias Küch
 Julian Richen
+Ping O.
+Mister Hat
+Peter Ding
diff --git a/README.md b/README.md
index 3d9436456c4bd29d11bf160efa27769b05edde2d..f3d83c89fafbdff0dc6b22b6d207a28428b59f96 100644 (file)
--- a/README.md
+++ b/README.md
@@ -17,12 +17,12 @@ youtube-dl - download videos from youtube.com or other video platforms
 To install it right away for all UNIX users (Linux, OS X, etc.), type:
 
     sudo curl https://yt-dl.org/latest/youtube-dl -o /usr/local/bin/youtube-dl
-    sudo chmod a+x /usr/local/bin/youtube-dl
+    sudo chmod a+rx /usr/local/bin/youtube-dl
 
 If you do not have curl, you can alternatively use a recent wget:
 
     sudo wget https://yt-dl.org/downloads/latest/youtube-dl -O /usr/local/bin/youtube-dl
-    sudo chmod a+x /usr/local/bin/youtube-dl
+    sudo chmod a+rx /usr/local/bin/youtube-dl
 
 Windows users can [download a .exe file](https://yt-dl.org/latest/youtube-dl.exe) and place it in their home directory or any other location on their [PATH](http://en.wikipedia.org/wiki/PATH_%28variable%29).
 
@@ -168,7 +168,7 @@ which means you can modify it, redistribute it or use it however you like.
     --no-progress                    Do not print progress bar
     --console-title                  Display progress in console titlebar
     -v, --verbose                    Print various debugging information
-    --dump-pages                     Print downloaded pages to debug problems (very verbose)
+    --dump-pages                     Print downloaded pages encoded using base64 to debug problems (very verbose)
     --write-pages                    Write downloaded intermediary pages to files in the current directory to debug problems
     --print-traffic                  Display sent and read HTTP traffic
     -C, --call-home                  Contact the youtube-dl server for debugging
@@ -220,7 +220,7 @@ which means you can modify it, redistribute it or use it however you like.
     --embed-thumbnail                Embed thumbnail in the audio as cover art
     --add-metadata                   Write metadata to the video file
     --metadata-from-title FORMAT     Parse additional metadata like song title / artist from the video title. The format syntax is the same as --output, the parsed
-                                     parameters replace existing values. Additional templates: %(album), %(artist). Example: --metadata-from-title "%(artist)s -
+                                     parameters replace existing values. Additional templates: %(album)s, %(artist)s. Example: --metadata-from-title "%(artist)s -
                                      %(title)s" matches a title like "Coldplay - Paradise"
     --xattrs                         Write metadata to the video file's xattrs (using dublin core and xdg standards)
     --fixup POLICY                   Automatically correct known faults of the file. One of never (do nothing), warn (only emit a warning), detect_or_warn(the default;
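
The `%(album)s`/`%(artist)s` correction above matters because output templates are plain printf-style named fields, so the trailing conversion character is required. A minimal illustration in ordinary Python (not youtube-dl code):

    fields = {'artist': 'Coldplay', 'title': 'Paradise'}
    print('%(artist)s - %(title)s' % fields)   # -> Coldplay - Paradise
    # Dropping the trailing 's' leaves the specifier incomplete:
    # '%(artist)' % fields  raises ValueError: incomplete format
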
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index 43fbe8b1d75bf260dd98d29ee7cb029b4c373365..d147b53fe60a5bba6f9d33d1fba684014513f83b 100644 (file)
@@ -10,6 +10,7 @@
  - **56.com**
  - **5min**
  - **8tracks**
+ - **91porn**
  - **9gag**
  - **abc.net.au**
  - **Abc7News**
@@ -26,8 +27,7 @@
  - **anitube.se**
  - **AnySex**
  - **Aparat**
- - **AppleDailyAnimationNews**
- - **AppleDailyRealtimeNews**
+ - **AppleDaily**
  - **AppleTrailers**
  - **archive.org**: archive.org videos
  - **ARD**
  - **Eporner**
  - **EroProfile**
  - **Escapist**
+ - **ESPN** (Currently broken)
  - **EveryonesMixtape**
  - **exfm**: ex.fm
  - **ExpoTV**
  - **fc2**
  - **fernsehkritik.tv**
  - **fernsehkritik.tv:postecke**
- - **Firedrive**
  - **Firstpost**
  - **Flickr**
  - **Folketinget**: Folketinget (ft.dk; Danish parliament)
  - **KanalPlay**: Kanal 5/9/11 Play
  - **Kankan**
  - **Karaoketv**
+ - **KarriereVideos**
  - **keek**
  - **KeezMovies**
  - **KhanAcademy**
  - **Noco**
  - **Normalboots**
  - **NosVideo**
+ - **Nova**: TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz
  - **novamov**: NovaMov
  - **Nowness**
+ - **NowTV**
  - **nowvideo**: NowVideo
  - **npo.nl**
  - **npo.nl:live**
  - **OktoberfestTV**
  - **on.aol.com**
  - **Ooyala**
+ - **OoyalaExternal**
  - **OpenFilm**
  - **orf:fm4**: radio FM4
  - **orf:iptv**: iptv.ORF.at
  - **Rte**
  - **rtl.nl**: rtl.nl and rtlxl.nl
  - **RTL2**
- - **RTLnow**
  - **RTP**
  - **RTS**: RTS.ch
  - **rtve.es:alacarta**: RTVE a la carta
  - **smotri:community**: Smotri.com community videos
  - **smotri:user**: Smotri.com user videos
  - **Snotr**
- - **Sockshare**
  - **Sohu**
+ - **soompi**
+ - **soompi:show**
  - **soundcloud**
  - **soundcloud:playlist**
  - **soundcloud:set**
  - **Spike**
  - **Sport5**
  - **SportBox**
+ - **SportBoxEmbed**
  - **SportDeutschland**
  - **Srf**
  - **SRMediathek**: Saarländischer Rundfunk
  - **Trilulilu**
  - **TruTube**
  - **Tube8**
+ - **TubiTv**
  - **Tudou**
  - **Tumblr**
  - **TuneIn**
  - **Turbo**
  - **Tutv**
  - **tv.dfb.de**
+ - **TV2**
+ - **TV2Article**
  - **TV4**: tv4.se and tv4play.se
  - **tvigle**: Интернет-телевидение Tvigle.ru
  - **tvp.pl**
  - **vier:videos**
  - **Viewster**
  - **viki**
+ - **viki:channel**
  - **vimeo**
  - **vimeo:album**
  - **vimeo:channel**
diff --git a/test/test_aes.py b/test/test_aes.py
index 4dc7de7b5b8d55fc1d95296b58e076aba664e1cc..315a3f5ae6a597662d05f56e97672b4ff93aff10 100644 (file)
@@ -39,7 +39,7 @@ class TestAES(unittest.TestCase):
         encrypted = base64.b64encode(
             intlist_to_bytes(self.iv[:8]) +
             b'\x17\x15\x93\xab\x8d\x80V\xcdV\xe0\t\xcdo\xc2\xa5\xd8ksM\r\xe27N\xae'
-        )
+        ).decode('utf-8')
         decrypted = (aes_decrypt_text(encrypted, password, 16))
         self.assertEqual(decrypted, self.secret_msg)
 
@@ -47,7 +47,7 @@ class TestAES(unittest.TestCase):
         encrypted = base64.b64encode(
             intlist_to_bytes(self.iv[:8]) +
             b'\x0b\xe6\xa4\xd9z\x0e\xb8\xb9\xd0\xd4i_\x85\x1d\x99\x98_\xe5\x80\xe7.\xbf\xa5\x83'
-        )
+        ).decode('utf-8')
         decrypted = (aes_decrypt_text(encrypted, password, 32))
         self.assertEqual(decrypted, self.secret_msg)
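
In Python 3, `base64.b64encode` and `b64decode` operate on bytes, while `aes_decrypt_text` takes and returns text; the `.decode('utf-8')` added here and the matching `.encode('utf-8')` in the `youtube_dl/aes.py` hunk below keep both sides consistent. A minimal round-trip sketch under Python 3 (standard library only, not youtube-dl code):

    import base64

    payload = b'\x17\x15\x93\xab'                         # raw ciphertext bytes
    encoded = base64.b64encode(payload).decode('utf-8')   # text, as aes_decrypt_text now expects
    # aes.py re-encodes the text before base64-decoding it:
    assert base64.b64decode(encoded.encode('utf-8')) == payload
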
 
diff --git a/test/test_subtitles.py b/test/test_subtitles.py
index 891ee620b1f2627dd6991e0cccfbc58b59fb6a95..c4e3adb67b7d1034b36cdd3c45969fe321351c64 100644 (file)
@@ -266,7 +266,7 @@ class TestNRKSubtitles(BaseTestSubtitles):
         self.DL.params['allsubtitles'] = True
         subtitles = self.getSubtitles()
         self.assertEqual(set(subtitles.keys()), set(['no']))
-        self.assertEqual(md5(subtitles['no']), '1d221e6458c95c5494dcd38e6a1f129a')
+        self.assertEqual(md5(subtitles['no']), '544fa917d3197fcbee64634559221cc2')
 
 
 class TestRaiSubtitles(BaseTestSubtitles):
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index 58b34e087b421474112719ffb3389b252e560313..b1f792d4ef8815dc8cfb4471cdd6f0e47a93e8f7 100755 (executable)
@@ -49,6 +49,7 @@ from .utils import (
     ExtractorError,
     format_bytes,
     formatSeconds,
+    HEADRequest,
     locked_file,
     make_HTTPS_handler,
     MaxDownloadsReached,
@@ -923,8 +924,9 @@ class YoutubeDL(object):
                 if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
             if audiovideo_formats:
                 return audiovideo_formats[format_idx]
-            # for audio only urls, select the best/worst audio format
-            elif all(f.get('acodec') != 'none' for f in available_formats):
+            # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format
+            elif (all(f.get('acodec') != 'none' for f in available_formats) or
+                  all(f.get('vcodec') != 'none' for f in available_formats)):
                 return available_formats[format_idx]
         elif format_spec == 'bestaudio':
             audio_formats = [
@@ -1014,13 +1016,13 @@ class YoutubeDL(object):
             info_dict['display_id'] = info_dict['id']
 
         if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
-            # Working around negative timestamps in Windows
-            # (see http://bugs.python.org/issue1646728)
-            if info_dict['timestamp'] < 0 and os.name == 'nt':
-                info_dict['timestamp'] = 0
-            upload_date = datetime.datetime.utcfromtimestamp(
-                info_dict['timestamp'])
-            info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
+            # Working around out-of-range timestamp values (e.g. negative ones on Windows,
+            # see http://bugs.python.org/issue1646728)
+            try:
+                upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp'])
+                info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
+            except (ValueError, OverflowError, OSError):
+                pass
 
         if self.params.get('listsubtitles', False):
             if 'automatic_captions' in info_dict:
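
The Windows-only clamp is replaced by a try/except because `utcfromtimestamp` can fail in more ways than negative values on Windows; far out-of-range timestamps raise ValueError or OverflowError on any platform. A rough sketch of the behaviour the hunk now relies on (the helper name is made up for illustration):

    import datetime

    def safe_upload_date(timestamp):
        # Ignore timestamps the platform cannot represent, as in the hunk above.
        try:
            return datetime.datetime.utcfromtimestamp(timestamp).strftime('%Y%m%d')
        except (ValueError, OverflowError, OSError):
            return None

    print(safe_upload_date(1434236214))   # '20150613'
    print(safe_upload_date(10 ** 18))     # None (ValueError/OverflowError)
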
@@ -1047,6 +1049,8 @@ class YoutubeDL(object):
         if not formats:
             raise ExtractorError('No video formats found!')
 
+        formats_dict = {}
+
         # We check that all the formats have the format and format_id fields
         for i, format in enumerate(formats):
             if 'url' not in format:
@@ -1054,6 +1058,18 @@ class YoutubeDL(object):
 
             if format.get('format_id') is None:
                 format['format_id'] = compat_str(i)
+            format_id = format['format_id']
+            if format_id not in formats_dict:
+                formats_dict[format_id] = []
+            formats_dict[format_id].append(format)
+
+        # Make sure all formats have unique format_id
+        for format_id, ambiguous_formats in formats_dict.items():
+            if len(ambiguous_formats) > 1:
+                for i, format in enumerate(ambiguous_formats):
+                    format['format_id'] = '%s-%d' % (format_id, i)
+
+        for i, format in enumerate(formats):
             if format.get('format') is None:
                 format['format'] = '{id} - {res}{note}'.format(
                     id=format['format_id'],
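
The new pass renames colliding format_ids by appending an index, so `-f FORMAT` selection can still address each entry unambiguously. A standalone sketch of the same de-duplication, assuming a plain list of format dicts:

    def dedupe_format_ids(formats):
        by_id = {}
        for f in formats:
            by_id.setdefault(f['format_id'], []).append(f)
        for format_id, group in by_id.items():
            if len(group) > 1:
                for i, f in enumerate(group):
                    f['format_id'] = '%s-%d' % (format_id, i)
        return formats

    print(dedupe_format_ids([{'format_id': 'hls'}, {'format_id': 'hls'}, {'format_id': 'http'}]))
    # [{'format_id': 'hls-0'}, {'format_id': 'hls-1'}, {'format_id': 'http'}]
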
@@ -1527,6 +1543,7 @@ class YoutubeDL(object):
             pps_chain.extend(ie_info['__postprocessors'])
         pps_chain.extend(self._pps)
         for pp in pps_chain:
+            files_to_delete = []
             try:
                 files_to_delete, info = pp.run(info)
             except PostProcessingError as e:
@@ -1705,7 +1722,8 @@ class YoutubeDL(object):
             if req_is_string:
                 req = url_escaped
             else:
-                req = compat_urllib_request.Request(
+                req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
+                req = req_type(
                     url_escaped, data=req.data, headers=req.headers,
                     origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
 
diff --git a/youtube_dl/aes.py b/youtube_dl/aes.py
index 07224d5084158184ac30c927b1b79802d398c336..7817adcfdd546f70cfb76e0634b8df8ddbcaf8e0 100644 (file)
@@ -152,7 +152,7 @@ def aes_decrypt_text(data, password, key_size_bytes):
     """
     NONCE_LENGTH_BYTES = 8
 
-    data = bytes_to_intlist(base64.b64decode(data))
+    data = bytes_to_intlist(base64.b64decode(data.encode('utf-8')))
     password = bytes_to_intlist(password.encode('utf-8'))
 
     key = password[:key_size_bytes] + [0] * (key_size_bytes - len(password))
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index ca857a75f64b404a4167461fc4c8a28110af1bc3..6fdaf90b2d69799091db469a9eef5810cef8542e 100644 (file)
@@ -153,10 +153,10 @@ from .extremetube import ExtremeTubeIE
 from .facebook import FacebookIE
 from .faz import FazIE
 from .fc2 import FC2IE
-from .firedrive import FiredriveIE
 from .firstpost import FirstpostIE
 from .firsttv import FirstTVIE
 from .fivemin import FiveMinIE
+from .fivetv import FiveTVIE
 from .fktv import (
     FKTVIE,
     FKTVPosteckeIE,
@@ -234,6 +234,7 @@ from .infoq import InfoQIE
 from .instagram import InstagramIE, InstagramUserIE
 from .internetvideoarchive import InternetVideoArchiveIE
 from .iprima import IPrimaIE
+from .iqiyi import IqiyiIE
 from .ivi import (
     IviIE,
     IviCompilationIE
@@ -248,6 +249,7 @@ from .kaltura import KalturaIE
 from .kanalplay import KanalPlayIE
 from .kankan import KankanIE
 from .karaoketv import KaraoketvIE
+from .karrierevideos import KarriereVideosIE
 from .keezmovies import KeezMoviesIE
 from .khanacademy import KhanAcademyIE
 from .kickstarter import KickStarterIE
@@ -342,8 +344,7 @@ from .newstube import NewstubeIE
 from .nextmedia import (
     NextMediaIE,
     NextMediaActionNewsIE,
-    AppleDailyRealtimeNewsIE,
-    AppleDailyAnimationNewsIE
+    AppleDailyIE,
 )
 from .nfb import NFBIE
 from .nfl import NFLIE
@@ -357,8 +358,10 @@ from .ninegag import NineGagIE
 from .noco import NocoIE
 from .normalboots import NormalbootsIE
 from .nosvideo import NosVideoIE
+from .nova import NovaIE
 from .novamov import NovaMovIE
 from .nowness import NownessIE
+from .nowtv import NowTVIE
 from .nowvideo import NowVideoIE
 from .npo import (
     NPOIE,
@@ -405,6 +408,7 @@ from .playfm import PlayFMIE
 from .playvid import PlayvidIE
 from .playwire import PlaywireIE
 from .podomatic import PodomaticIE
+from .porn91 import Porn91IE
 from .pornhd import PornHdIE
 from .pornhub import (
     PornHubIE,
@@ -442,7 +446,6 @@ from .roxwel import RoxwelIE
 from .rtbf import RTBFIE
 from .rte import RteIE
 from .rtlnl import RtlNlIE
-from .rtlnow import RTLnowIE
 from .rtl2 import RTL2IE
 from .rtp import RTPIE
 from .rts import RTSIE
@@ -456,6 +459,7 @@ from .rutube import (
     RutubePersonIE,
 )
 from .rutv import RUTVIE
+from .ruutu import RuutuIE
 from .sandia import SandiaIE
 from .safari import (
     SafariIE,
@@ -484,8 +488,11 @@ from .smotri import (
     SmotriBroadcastIE,
 )
 from .snotr import SnotrIE
-from .sockshare import SockshareIE
 from .sohu import SohuIE
+from .soompi import (
+    SoompiIE,
+    SoompiShowIE,
+)
 from .soundcloud import (
     SoundcloudIE,
     SoundcloudSetIE,
@@ -571,6 +578,7 @@ from .traileraddict import TrailerAddictIE
 from .trilulilu import TriluliluIE
 from .trutube import TruTubeIE
 from .tube8 import Tube8IE
+from .tubitv import TubiTvIE
 from .tudou import TudouIE
 from .tumblr import TumblrIE
 from .tunein import TuneInIE
@@ -581,6 +589,10 @@ from .tv2 import (
     TV2ArticleIE,
 )
 from .tv4 import TV4IE
+from .tvc import (
+    TVCIE,
+    TVCArticleIE,
+)
 from .tvigle import TvigleIE
 from .tvp import TvpIE, TvpSeriesIE
 from .tvplay import TVPlayIE
@@ -651,7 +663,10 @@ from .vine import (
     VineIE,
     VineUserIE,
 )
-from .viki import VikiIE
+from .viki import (
+    VikiIE,
+    VikiChannelIE,
+)
 from .vk import (
     VKIE,
     VKUserVideosIE,
diff --git a/youtube_dl/extractor/aftonbladet.py b/youtube_dl/extractor/aftonbladet.py
index a117502bc0ad7bfec11592ec57da575898cacc3d..e0518cf261fbffc4dd23bc4a3800d04eae324139 100644 (file)
@@ -6,11 +6,11 @@ from ..utils import int_or_none
 
 
 class AftonbladetIE(InfoExtractor):
-    _VALID_URL = r'http://tv\.aftonbladet\.se/webbtv.+?(?P<id>article[0-9]+)\.ab(?:$|[?#])'
+    _VALID_URL = r'http://tv\.aftonbladet\.se/abtv/articles/(?P<id>[0-9]+)'
     _TEST = {
-        'url': 'http://tv.aftonbladet.se/webbtv/nyheter/vetenskap/rymden/article36015.ab',
+        'url': 'http://tv.aftonbladet.se/abtv/articles/36015',
         'info_dict': {
-            'id': 'article36015',
+            'id': '36015',
             'ext': 'mp4',
             'title': 'Vulkanutbrott i rymden - nu släpper NASA bilderna',
             'description': 'Jupiters måne mest aktiv av alla himlakroppar',
@@ -25,8 +25,9 @@ class AftonbladetIE(InfoExtractor):
 
         # find internal video meta data
         meta_url = 'http://aftonbladet-play.drlib.aptoma.no/video/%s.json'
-        internal_meta_id = self._html_search_regex(
-            r'data-aptomaId="([\w\d]+)"', webpage, 'internal_meta_id')
+        player_config = self._parse_json(self._html_search_regex(
+            r'data-player-config="([^"]+)"', webpage, 'player config'), video_id)
+        internal_meta_id = player_config['videoId']
         internal_meta_url = meta_url % internal_meta_id
         internal_meta_json = self._download_json(
             internal_meta_url, video_id, 'Downloading video meta data')
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
index 8273bd6c9ae3cdff82052c8f63efc68be97561b3..76de244774369dd53c510961ecf6b6a7641c7027 100644 (file)
@@ -7,7 +7,6 @@ from .common import InfoExtractor
 from ..utils import (
     find_xpath_attr,
     unified_strdate,
-    get_element_by_id,
     get_element_by_attribute,
     int_or_none,
     qualities,
@@ -195,7 +194,9 @@ class ArteTVFutureIE(ArteTVPlus7IE):
     def _real_extract(self, url):
         anchor_id, lang = self._extract_url_info(url)
         webpage = self._download_webpage(url, anchor_id)
-        row = get_element_by_id(anchor_id, webpage)
+        row = self._search_regex(
+            r'(?s)id="%s"[^>]*>.+?(<div[^>]*arte_vp_url[^>]*>)' % anchor_id,
+            webpage, 'row')
         return self._extract_from_webpage(row, anchor_id, lang)
 
 
diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py
index 7ca835e31f3477f8e46c74804aea36ecfe789686..bf60450c2f3753aa64d5ae11069bec98b67f23dc 100644 (file)
@@ -3,6 +3,8 @@ from __future__ import unicode_literals
 
 import re
 import itertools
+import json
+import xml.etree.ElementTree as ET
 
 from .common import InfoExtractor
 from ..utils import (
@@ -67,11 +69,19 @@ class BiliBiliIE(InfoExtractor):
 
         entries = []
 
-        lq_doc = self._download_xml(
+        lq_page = self._download_webpage(
             'http://interface.bilibili.com/v_cdn_play?appkey=1&cid=%s' % cid,
             video_id,
             note='Downloading LQ video info'
         )
+        try:
+            err_info = json.loads(lq_page)
+            raise ExtractorError(
+                'BiliBili said: ' + err_info['error_text'], expected=True)
+        except ValueError:
+            pass
+
+        lq_doc = ET.fromstring(lq_page)
         lq_durls = lq_doc.findall('./durl')
 
         hq_doc = self._download_xml(
@@ -80,9 +90,11 @@ class BiliBiliIE(InfoExtractor):
             note='Downloading HQ video info',
             fatal=False,
         )
-        hq_durls = hq_doc.findall('./durl') if hq_doc is not False else itertools.repeat(None)
-
-        assert len(lq_durls) == len(hq_durls)
+        if hq_doc is not False:
+            hq_durls = hq_doc.findall('./durl')
+            assert len(lq_durls) == len(hq_durls)
+        else:
+            hq_durls = itertools.repeat(None)
 
         i = 1
         for lq_durl, hq_durl in zip(lq_durls, hq_durls):
@@ -93,7 +105,7 @@ class BiliBiliIE(InfoExtractor):
                 'filesize': int_or_none(
                     lq_durl.find('./size'), get_attr='text'),
             }]
-            if hq_durl:
+            if hq_durl is not None:
                 formats.append({
                     'format_id': 'hq',
                     'quality': 2,
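
The LQ endpoint can answer with either the expected XML `<durl>` document or a JSON error payload, so the extractor now probes for JSON first and falls back to XML parsing when `json.loads` fails. A minimal sketch of that probe (the error payload and exception type are illustrative; the real code raises ExtractorError):

    import json
    import xml.etree.ElementTree as ET

    def parse_lq_response(body):
        try:
            err_info = json.loads(body)        # JSON means the API reported an error
        except ValueError:
            return ET.fromstring(body)         # otherwise it is the XML video info
        raise RuntimeError('BiliBili said: ' + err_info['error_text'])

    print(parse_lq_response('<video><durl/></video>').tag)   # video
    # parse_lq_response('{"error_text": "no permission"}') raises RuntimeError
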
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
index 4f60d53660fa7777b9e1b6152967ce2e7e567ec9..d768f99e67bd49de31e3f0df70d832284cbb3482 100644 (file)
@@ -156,6 +156,28 @@ class BrightcoveIE(InfoExtractor):
         linkBase = find_param('linkBaseURL')
         if linkBase is not None:
             params['linkBaseURL'] = linkBase
+        return cls._make_brightcove_url(params)
+
+    @classmethod
+    def _build_brighcove_url_from_js(cls, object_js):
+        # The layout of JS is as follows:
+        # customBC.createVideo = function (width, height, playerID, playerKey, videoPlayer, VideoRandomID) {
+        #   // build Brightcove <object /> XML
+        # }
+        m = re.search(
+            r'''(?x)customBC.\createVideo\(
+                .*?                                                  # skipping width and height
+                ["\'](?P<playerID>\d+)["\']\s*,\s*                   # playerID
+                ["\'](?P<playerKey>AQ[^"\']{48})[^"\']*["\']\s*,\s*  # playerKey begins with AQ and is 50 characters
+                                                                     # in length, however it's appended to itself
+                                                                     # in places, so truncate
+                ["\'](?P<videoID>\d+)["\']                           # @videoPlayer
+            ''', object_js)
+        if m:
+            return cls._make_brightcove_url(m.groupdict())
+
+    @classmethod
+    def _make_brightcove_url(cls, params):
         data = compat_urllib_parse.urlencode(params)
         return cls._FEDERATED_URL_TEMPLATE % data
 
@@ -172,7 +194,7 @@ class BrightcoveIE(InfoExtractor):
         """Return a list of all Brightcove URLs from the webpage """
 
         url_m = re.search(
-            r'<meta\s+property="og:video"\s+content="(https?://(?:secure|c)\.brightcove.com/[^"]+)"',
+            r'<meta\s+property=[\'"]og:video[\'"]\s+content=[\'"](https?://(?:secure|c)\.brightcove.com/[^\'"]+)[\'"]',
             webpage)
         if url_m:
             url = unescapeHTML(url_m.group(1))
@@ -188,7 +210,12 @@ class BrightcoveIE(InfoExtractor):
                 [^>]*?>\s*<param\s+name="movie"\s+value="https?://[^/]*brightcove\.com/
             ).+?>\s*</object>''',
             webpage)
-        return list(filter(None, [cls._build_brighcove_url(m) for m in matches]))
+        if matches:
+            return list(filter(None, [cls._build_brighcove_url(m) for m in matches]))
+
+        return list(filter(None, [
+            cls._build_brighcove_url_from_js(custom_bc)
+            for custom_bc in re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)]))
 
     def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
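
`_build_brighcove_url_from_js` targets pages that set up the player through a `customBC.createVideo(...)` JavaScript call rather than an `<object>` element; the regex lifts the playerID, the 50-character playerKey beginning with `AQ` and the video ID out of that call and passes them to `_make_brightcove_url`. A rough demonstration with a hypothetical embed snippet and a slightly simplified pattern:

    import re

    # Hypothetical embed code of the shape described in the comment above:
    object_js = "customBC.createVideo(640, 360, '3550319591001', 'AQ%s', '4036320279001', 'rand');" % ('~' * 48)

    m = re.search(
        r"""(?x)customBC\.createVideo\(
            .*?                                           # skip width and height
            ['"](?P<playerID>\d+)['"]\s*,\s*
            ['"](?P<playerKey>AQ[^'"]{48})[^'"]*['"]\s*,\s*
            ['"](?P<videoID>\d+)['"]
        """, object_js)
    print(m.group('playerID'), m.group('videoID'))        # 3550319591001 4036320279001
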
diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py
index 1ceb9d8d9df6c0268e33de5e34c01a245e134e05..75fffb1563ae9f95bf862ad156111b6962a8429e 100644 (file)
@@ -4,12 +4,13 @@ from .common import InfoExtractor
 
 
 class CBSIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?cbs\.com/shows/[^/]+/(?:video|artist)/(?P<id>[^/]+)/.*'
+    _VALID_URL = r'https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/(?:video|artist)|colbertlateshow\.com/(?:video|podcasts))/[^/]+/(?P<id>[^/]+)'
 
     _TESTS = [{
         'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/',
         'info_dict': {
             'id': '4JUVEwq3wUT7',
+            'display_id': 'connect-chat-feat-garth-brooks',
             'ext': 'flv',
             'title': 'Connect Chat feat. Garth Brooks',
             'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!',
@@ -24,6 +25,7 @@ class CBSIE(InfoExtractor):
         'url': 'http://www.cbs.com/shows/liveonletterman/artist/221752/st-vincent/',
         'info_dict': {
             'id': 'WWF_5KqY3PK1',
+            'display_id': 'st-vincent',
             'ext': 'flv',
             'title': 'Live on Letterman - St. Vincent',
             'description': 'Live On Letterman: St. Vincent in concert from New York\'s Ed Sullivan Theater on Tuesday, July 16, 2014.',
@@ -34,12 +36,23 @@ class CBSIE(InfoExtractor):
             'skip_download': True,
         },
         '_skip': 'Blocked outside the US',
+    }, {
+        'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
         real_id = self._search_regex(
-            r"video\.settings\.pid\s*=\s*'([^']+)';",
+            [r"video\.settings\.pid\s*=\s*'([^']+)';", r"cbsplayer\.pid\s*=\s*'([^']+)';"],
             webpage, 'real video ID')
-        return self.url_result('theplatform:%s' % real_id)
+        return {
+            '_type': 'url_transparent',
+            'ie_key': 'ThePlatform',
+            'url': 'theplatform:%s' % real_id,
+            'display_id': display_id,
+        }
diff --git a/youtube_dl/extractor/chilloutzone.py b/youtube_dl/extractor/chilloutzone.py
index c922f695905d70e4052ddfa5c8f336c01221413b..0206d96db4670fb29a40353839dae15911b9c6d3 100644 (file)
@@ -57,7 +57,7 @@ class ChilloutzoneIE(InfoExtractor):
 
         base64_video_info = self._html_search_regex(
             r'var cozVidData = "(.+?)";', webpage, 'video data')
-        decoded_video_info = base64.b64decode(base64_video_info).decode("utf-8")
+        decoded_video_info = base64.b64decode(base64_video_info.encode('utf-8')).decode('utf-8')
         video_info_dict = json.loads(decoded_video_info)
 
         # get video information from dict
diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py
index cf0a7551b7f3df672d20aad7a00d7ced43bf2945..c949a481477c187d9433f39e6e851b87c97edf79 100644 (file)
@@ -60,6 +60,17 @@ class CinemassacreIE(InfoExtractor):
                 'uploader_id': 'Cinemassacre',
                 'title': 'AVGN: McKids',
             }
+        },
+        {
+            'url': 'http://cinemassacre.com/2015/05/25/mario-kart-64-nintendo-64-james-mike-mondays/',
+            'md5': '1376908e49572389e7b06251a53cdd08',
+            'info_dict': {
+                'id': 'Cinemassacre-555779690c440',
+                'ext': 'mp4',
+                'description': 'Let’s Play Mario Kart 64 !! Mario Kart 64 is a classic go-kart racing game released for the Nintendo 64 (N64). Today James & Mike do 4 player Battle Mode with Kyle and Bootsy!',
+                'title': 'Mario Kart 64 (Nintendo 64) James & Mike Mondays',
+                'upload_date': '20150525',
+            }
         }
     ]
 
@@ -72,7 +83,7 @@ class CinemassacreIE(InfoExtractor):
 
         playerdata_url = self._search_regex(
             [
-                r'src="(http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=.+?)"',
+                r'src="(http://(?:player2\.screenwavemedia\.com|player\.screenwavemedia\.com/play)/[a-zA-Z]+\.php\?[^"]*\bid=.+?)"',
                 r'<iframe[^>]+src="((?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"',
             ],
             webpage, 'player data URL', default=None)
diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py
index 3145b30514ea2a075f92077b9f87b64c9e8820a7..5dd69bff7ac73bcc0adc4d91c614045ddf116a9c 100644 (file)
@@ -11,7 +11,7 @@ from ..utils import (
 
 class CNETIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?cnet\.com/videos/(?P<id>[^/]+)/'
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/',
         'info_dict': {
             'id': '56f4ea68-bd21-4852-b08c-4de5b8354c60',
@@ -25,7 +25,20 @@ class CNETIE(InfoExtractor):
         'params': {
             'skip_download': 'requires rtmpdump',
         }
-    }
+    }, {
+        'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/',
+        'info_dict': {
+            'id': '56527b93-d25d-44e3-b738-f989ce2e49ba',
+            'ext': 'flv',
+            'description': 'Khail and Ashley wonder what other civic woes can be solved by self-tweeting objects, investigate a new kind of VR camera and watch an origami robot self-assemble, walk, climb, dig and dissolve. #TDPothole',
+            'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40',
+            'uploader': 'Ashley Esqueda',
+            'title': 'Whiny potholes tweet at local government when hit by cars (Tomorrow Daily 187)',
+        },
+        'params': {
+            'skip_download': True,  # requires rtmpdump
+        },
+    }]
 
     def _real_extract(self, url):
         display_id = self._match_id(url)
@@ -42,7 +55,7 @@ class CNETIE(InfoExtractor):
             raise ExtractorError('Cannot find video data')
 
         mpx_account = data['config']['players']['default']['mpx_account']
-        vid = vdata['files']['rtmp']
+        vid = vdata['files'].get('rtmp', vdata['files']['hds'])
         tp_link = 'http://link.theplatform.com/s/%s/%s' % (mpx_account, vid)
 
         video_id = vdata['id']
diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py
index 5efc5f4fe556a4424542b441a83f2d6dbd5bc8e7..3b1bd4033fd1c01986c83ab44cc1cebaa1b19e5b 100644 (file)
@@ -12,7 +12,7 @@ from ..utils import (
 
 class CNNIE(InfoExtractor):
     _VALID_URL = r'''(?x)https?://(?:(?:edition|www)\.)?cnn\.com/video/(?:data/.+?|\?)/
-        (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z]{3,5})(?:-ap)?|(?=&)))'''
+        (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))'''
 
     _TESTS = [{
         'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn',
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index cecf917ffb67040739bbacf573297ba891ec9ea1..49e4dc7109e151ae124ed1aac15a9762d00eac21 100644 (file)
@@ -846,7 +846,7 @@ class InfoExtractor(object):
 
     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                               entry_protocol='m3u8', preference=None,
-                              m3u8_id=None):
+                              m3u8_id=None, note=None, errnote=None):
 
         formats = [{
             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
@@ -865,8 +865,8 @@ class InfoExtractor(object):
 
         m3u8_doc = self._download_webpage(
             m3u8_url, video_id,
-            note='Downloading m3u8 information',
-            errnote='Failed to download m3u8 information')
+            note=note or 'Downloading m3u8 information',
+            errnote=errnote or 'Failed to download m3u8 information')
         last_info = None
         last_media = None
         kv_rex = re.compile(
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py
index 1c77df47ef346173fc11a58396c98768e5afc986..41f0c736d98c229518bacb41fac2f35ce9b80958 100644 (file)
@@ -76,8 +76,8 @@ class CrunchyrollIE(InfoExtractor):
         self._login()
 
     def _decrypt_subtitles(self, data, iv, id):
-        data = bytes_to_intlist(data)
-        iv = bytes_to_intlist(iv)
+        data = bytes_to_intlist(base64.b64decode(data.encode('utf-8')))
+        iv = bytes_to_intlist(base64.b64decode(iv.encode('utf-8')))
         id = int(id)
 
         def obfuscate_key_aux(count, modulo, start):
@@ -179,6 +179,16 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
 
         return output
 
+    def _extract_subtitles(self, subtitle):
+        sub_root = xml.etree.ElementTree.fromstring(subtitle)
+        return [{
+            'ext': 'srt',
+            'data': self._convert_subtitles_to_srt(sub_root),
+        }, {
+            'ext': 'ass',
+            'data': self._convert_subtitles_to_ass(sub_root),
+        }]
+
     def _get_subtitles(self, video_id, webpage):
         subtitles = {}
         for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage):
@@ -190,25 +200,11 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
             data = self._search_regex(r'<data>([^<]+)', sub_page, 'subtitle_data', fatal=False)
             if not id or not iv or not data:
                 continue
-            id = int(id)
-            iv = base64.b64decode(iv)
-            data = base64.b64decode(data)
-
             subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8')
             lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False)
             if not lang_code:
                 continue
-            sub_root = xml.etree.ElementTree.fromstring(subtitle)
-            subtitles[lang_code] = [
-                {
-                    'ext': 'srt',
-                    'data': self._convert_subtitles_to_srt(sub_root),
-                },
-                {
-                    'ext': 'ass',
-                    'data': self._convert_subtitles_to_ass(sub_root),
-                },
-            ]
+            subtitles[lang_code] = self._extract_subtitles(subtitle)
         return subtitles
 
     def _real_extract(self, url):
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index db10b8d00b7482b157bd9dd0ecc9ef9b8191ce88..70aa4333c773c39c4e3be451f8b632045ac85b51 100644 (file)
@@ -225,7 +225,7 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
 
 class DailymotionUserIE(DailymotionPlaylistIE):
     IE_NAME = 'dailymotion:user'
-    _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?:old/)?user/(?P<user>[^/]+)'
+    _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?:(?:old/)?user/)?(?P<user>[^/]+)$'
     _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s'
     _TESTS = [{
         'url': 'https://www.dailymotion.com/user/nqtv',
@@ -239,7 +239,8 @@ class DailymotionUserIE(DailymotionPlaylistIE):
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         user = mobj.group('user')
-        webpage = self._download_webpage(url, user)
+        webpage = self._download_webpage(
+            'https://www.dailymotion.com/user/%s' % user, user)
         full_user = unescapeHTML(self._html_search_regex(
             r'<a class="nav-image" title="([^"]+)" href="/%s">' % re.escape(user),
             webpage, 'user'))
diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py
index d3e6675283cddcb8f6a6dfffbfbd1e1ea3da11bc..d6723ecf26ea67356b288df6e5f3bf612141b91a 100644 (file)
@@ -2,19 +2,19 @@ from __future__ import unicode_literals
 
 from .common import InfoExtractor
 from ..utils import (
+    parse_duration,
     parse_iso8601,
-    int_or_none,
 )
+from ..compat import compat_str
 
 
 class DiscoveryIE(InfoExtractor):
     _VALID_URL = r'http://www\.discovery\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9_\-]*)(?:\.htm)?'
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mission-impossible-outtakes.htm',
-        'md5': '3c69d77d9b0d82bfd5e5932a60f26504',
         'info_dict': {
-            'id': 'mission-impossible-outtakes',
-            'ext': 'flv',
+            'id': '20769',
+            'ext': 'mp4',
             'title': 'Mission Impossible Outtakes',
             'description': ('Watch Jamie Hyneman and Adam Savage practice being'
                             ' each other -- to the point of confusing Jamie\'s dog -- and '
@@ -24,22 +24,36 @@ class DiscoveryIE(InfoExtractor):
             'timestamp': 1303099200,
             'upload_date': '20110418',
         },
-    }
+        'params': {
+            'skip_download': True,  # requires ffmpeg
+        }
+    }, {
+        'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mythbusters-the-simpsons',
+        'info_dict': {
+            'id': 'mythbusters-the-simpsons',
+            'title': 'MythBusters: The Simpsons',
+        },
+        'playlist_count': 9,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
+        info = self._download_json(url + '?flat=1', video_id)
 
-        info = self._parse_json(self._search_regex(
-            r'(?s)<script type="application/ld\+json">(.*?)</script>',
-            webpage, 'video info'), video_id)
+        video_title = info.get('playlist_title') or info.get('video_title')
 
-        return {
-            'id': video_id,
-            'title': info['name'],
-            'url': info['contentURL'],
-            'description': info.get('description'),
-            'thumbnail': info.get('thumbnailUrl'),
-            'timestamp': parse_iso8601(info.get('uploadDate')),
-            'duration': int_or_none(info.get('duration')),
-        }
+        entries = [{
+            'id': compat_str(video_info['id']),
+            'formats': self._extract_m3u8_formats(
+                video_info['src'], video_id, ext='mp4',
+                note='Download m3u8 information for video %d' % (idx + 1)),
+            'title': video_info['title'],
+            'description': video_info.get('description'),
+            'duration': parse_duration(video_info.get('video_length')),
+            'webpage_url': video_info.get('href'),
+            'thumbnail': video_info.get('thumbnailURL'),
+            'alt_title': video_info.get('secondary_title'),
+            'timestamp': parse_iso8601(video_info.get('publishedDate')),
+        } for idx, video_info in enumerate(info['playlist'])]
+
+        return self.playlist_result(entries, video_id, video_title)
diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py
index f25ab319e66d4d5b151cd9a9d4509807b6a88617..baa24c6d13abe016cceb83bb927db15d7d300509 100644 (file)
@@ -1,8 +1,11 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from .common import InfoExtractor, ExtractorError
-from ..utils import parse_iso8601
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    parse_iso8601,
+)
 
 
 class DRTVIE(InfoExtractor):
@@ -60,19 +63,31 @@ class DRTVIE(InfoExtractor):
                 restricted_to_denmark = asset['RestrictedToDenmark']
                 spoken_subtitles = asset['Target'] == 'SpokenSubtitles'
                 for link in asset['Links']:
-                    target = link['Target']
                     uri = link['Uri']
+                    target = link['Target']
                     format_id = target
-                    preference = -1 if target == 'HDS' else -2
+                    preference = None
                     if spoken_subtitles:
-                        preference -= 2
+                        preference = -1
                         format_id += '-spoken-subtitles'
-                    formats.append({
-                        'url': uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43' if target == 'HDS' else uri,
-                        'format_id': format_id,
-                        'ext': link['FileFormat'],
-                        'preference': preference,
-                    })
+                    if target == 'HDS':
+                        formats.extend(self._extract_f4m_formats(
+                            uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43',
+                            video_id, preference, f4m_id=format_id))
+                    elif target == 'HLS':
+                        formats.extend(self._extract_m3u8_formats(
+                            uri, video_id, 'mp4', preference=preference,
+                            m3u8_id=format_id))
+                    else:
+                        bitrate = link.get('Bitrate')
+                        if bitrate:
+                            format_id += '-%s' % bitrate
+                        formats.append({
+                            'url': uri,
+                            'format_id': format_id,
+                            'tbr': bitrate,
+                            'ext': link.get('FileFormat'),
+                        })
                 subtitles_list = asset.get('SubtitlesList')
                 if isinstance(subtitles_list, list):
                     LANGS = {
diff --git a/youtube_dl/extractor/empflix.py b/youtube_dl/extractor/empflix.py
index 70f8efe27578c4d43b27378a4a2c80d495a7488c..4827022e088cf33d064acf7ab8cebdadc0d743a0 100644 (file)
@@ -4,22 +4,28 @@ from .tnaflix import TNAFlixIE
 
 
 class EMPFlixIE(TNAFlixIE):
-    _VALID_URL = r'^https?://www\.empflix\.com/videos/(?P<display_id>[0-9a-zA-Z-]+)-(?P<id>[0-9]+)\.html'
+    _VALID_URL = r'https?://(?:www\.)?empflix\.com/videos/(?P<display_id>.+?)-(?P<id>[0-9]+)\.html'
 
     _TITLE_REGEX = r'name="title" value="(?P<title>[^"]*)"'
     _DESCRIPTION_REGEX = r'name="description" value="([^"]*)"'
     _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"'
 
-    _TEST = {
-        'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html',
-        'md5': 'b1bc15b6412d33902d6e5952035fcabc',
-        'info_dict': {
-            'id': '33051',
-            'display_id': 'Amateur-Finger-Fuck',
-            'ext': 'mp4',
-            'title': 'Amateur Finger Fuck',
-            'description': 'Amateur solo finger fucking.',
-            'thumbnail': 're:https?://.*\.jpg$',
-            'age_limit': 18,
+    _TESTS = [
+        {
+            'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html',
+            'md5': 'b1bc15b6412d33902d6e5952035fcabc',
+            'info_dict': {
+                'id': '33051',
+                'display_id': 'Amateur-Finger-Fuck',
+                'ext': 'mp4',
+                'title': 'Amateur Finger Fuck',
+                'description': 'Amateur solo finger fucking.',
+                'thumbnail': 're:https?://.*\.jpg$',
+                'age_limit': 18,
+            }
+        },
+        {
+            'url': 'http://www.empflix.com/videos/[AROMA][ARMD-718]-Aoi-Yoshino-Sawa-25826.html',
+            'only_matching': True,
         }
-    }
+    ]
diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py
index 937b28fcccf3bd58929adcca1bda9d05966460e5..82dc27bc6ff3ed2edd3b318f3ed3d14e360ef22d 100644 (file)
@@ -50,7 +50,10 @@ class FacebookIE(InfoExtractor):
             'id': '274175099429670',
             'ext': 'mp4',
             'title': 'Facebook video #274175099429670',
-        }
+        },
+        'expected_warnings': [
+            'title'
+        ]
     }, {
         'url': 'https://www.facebook.com/video.php?v=10204634152394104',
         'only_matching': True,
@@ -149,12 +152,12 @@ class FacebookIE(InfoExtractor):
             raise ExtractorError('Cannot find video formats')
 
         video_title = self._html_search_regex(
-            r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, 'title',
-            fatal=False)
+            r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage, 'title',
+            default=None)
         if not video_title:
             video_title = self._html_search_regex(
                 r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>',
-                webpage, 'alternative title', default=None)
+                webpage, 'alternative title', fatal=False)
             video_title = limit_length(video_title, 80)
         if not video_title:
             video_title = 'Facebook video #%s' % video_id
diff --git a/youtube_dl/extractor/firedrive.py b/youtube_dl/extractor/firedrive.py
deleted file mode 100644 (file)
index 3191116..0000000
+++ /dev/null
@@ -1,80 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..compat import (
-    compat_urllib_parse,
-    compat_urllib_request,
-)
-from ..utils import (
-    ExtractorError,
-)
-
-
-class FiredriveIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?firedrive\.com/' + \
-                 '(?:file|embed)/(?P<id>[0-9a-zA-Z]+)'
-    _FILE_DELETED_REGEX = r'<div class="removed_file_image">'
-
-    _TESTS = [{
-        'url': 'https://www.firedrive.com/file/FEB892FA160EBD01',
-        'md5': 'd5d4252f80ebeab4dc2d5ceaed1b7970',
-        'info_dict': {
-            'id': 'FEB892FA160EBD01',
-            'ext': 'flv',
-            'title': 'bbb_theora_486kbit.flv',
-            'thumbnail': 're:^http://.*\.jpg$',
-        },
-    }]
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        url = 'http://firedrive.com/file/%s' % video_id
-        webpage = self._download_webpage(url, video_id)
-
-        if re.search(self._FILE_DELETED_REGEX, webpage) is not None:
-            raise ExtractorError('Video %s does not exist' % video_id,
-                                 expected=True)
-
-        fields = dict(re.findall(r'''(?x)<input\s+
-            type="hidden"\s+
-            name="([^"]+)"\s+
-            value="([^"]*)"
-            ''', webpage))
-
-        post = compat_urllib_parse.urlencode(fields)
-        req = compat_urllib_request.Request(url, post)
-        req.add_header('Content-type', 'application/x-www-form-urlencoded')
-
-        # Apparently, this header is required for confirmation to work.
-        req.add_header('Host', 'www.firedrive.com')
-
-        webpage = self._download_webpage(req, video_id,
-                                         'Downloading video page')
-
-        title = self._search_regex(r'class="external_title_left">(.+)</div>',
-                                   webpage, 'title')
-        thumbnail = self._search_regex(r'image:\s?"(//[^\"]+)', webpage,
-                                       'thumbnail', fatal=False)
-        if thumbnail is not None:
-            thumbnail = 'http:' + thumbnail
-
-        ext = self._search_regex(r'type:\s?\'([^\']+)\',',
-                                 webpage, 'extension', fatal=False)
-        video_url = self._search_regex(
-            r'file:\s?loadURL\(\'(http[^\']+)\'\),', webpage, 'file url')
-
-        formats = [{
-            'format_id': 'sd',
-            'url': video_url,
-            'ext': ext,
-        }]
-
-        return {
-            'id': video_id,
-            'title': title,
-            'thumbnail': thumbnail,
-            'formats': formats,
-        }
diff --git a/youtube_dl/extractor/fivetv.py b/youtube_dl/extractor/fivetv.py
new file mode 100644 (file)
index 0000000..13fbc4d
--- /dev/null
@@ -0,0 +1,88 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class FiveTVIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    http://
+                        (?:www\.)?5-tv\.ru/
+                        (?:
+                            (?:[^/]+/)+(?P<id>\d+)|
+                            (?P<path>[^/?#]+)(?:[/?#])?
+                        )
+                    '''
+
+    _TESTS = [{
+        'url': 'http://5-tv.ru/news/96814/',
+        'md5': 'bbff554ad415ecf5416a2f48c22d9283',
+        'info_dict': {
+            'id': '96814',
+            'ext': 'mp4',
+            'title': 'Россияне выбрали имя для общенациональной платежной системы',
+            'description': 'md5:a8aa13e2b7ad36789e9f77a74b6de660',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 180,
+        },
+    }, {
+        'url': 'http://5-tv.ru/video/1021729/',
+        'info_dict': {
+            'id': '1021729',
+            'ext': 'mp4',
+            'title': '3D принтер',
+            'description': 'md5:d76c736d29ef7ec5c0cf7d7c65ffcb41',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 180,
+        },
+    }, {
+        'url': 'http://www.5-tv.ru/glavnoe/#itemDetails',
+        'info_dict': {
+            'id': 'glavnoe',
+            'ext': 'mp4',
+            'title': 'Итоги недели с 8 по 14 июня 2015 года',
+            'thumbnail': 're:^https?://.*\.jpg$',
+        },
+    }, {
+        'url': 'http://www.5-tv.ru/glavnoe/broadcasts/508645/',
+        'only_matching': True,
+    }, {
+        'url': 'http://5-tv.ru/films/1507502/',
+        'only_matching': True,
+    }, {
+        'url': 'http://5-tv.ru/programs/broadcast/508713/',
+        'only_matching': True,
+    }, {
+        'url': 'http://5-tv.ru/angel/',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.5-tv.ru/schedule/?iframe=true&width=900&height=450',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id') or mobj.group('path')
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_url = self._search_regex(
+            r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"',
+            webpage, 'video url')
+
+        title = self._og_search_title(webpage, default=None) or self._search_regex(
+            r'<title>([^<]+)</title>', webpage, 'title')
+        duration = int_or_none(self._og_search_property(
+            'video:duration', webpage, 'duration', default=None))
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'description': self._og_search_description(webpage, default=None),
+            'thumbnail': self._og_search_thumbnail(webpage, default=None),
+            'duration': duration,
+        }
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 9a7b0d25d790054e39729bab63e42b1ea7a89dff..f6b984300d1e09e9f55a9d618e3966d7778602e9 100644 (file)
@@ -9,6 +9,8 @@ from .common import InfoExtractor
 from .youtube import YoutubeIE
 from ..compat import (
     compat_urllib_parse,
+    compat_urllib_parse_unquote,
+    compat_urllib_request,
     compat_urlparse,
     compat_xml_parse_error,
 )
@@ -32,6 +34,7 @@ from .brightcove import BrightcoveIE
 from .nbc import NBCSportsVPlayerIE
 from .ooyala import OoyalaIE
 from .rutv import RUTVIE
+from .tvc import TVCIE
 from .sportbox import SportBoxEmbedIE
 from .smotri import SmotriIE
 from .condenast import CondeNastIE
@@ -39,6 +42,7 @@ from .udn import UDNEmbedIE
 from .senateisvp import SenateISVPIE
 from .bliptv import BlipTVIE
 from .svt import SVTIE
+from .pornhub import PornHubIE
 
 
 class GenericIE(InfoExtractor):
@@ -46,6 +50,97 @@ class GenericIE(InfoExtractor):
     _VALID_URL = r'.*'
     IE_NAME = 'generic'
     _TESTS = [
+        # Direct link to a video
+        {
+            'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
+            'md5': '67d406c2bcb6af27fa886f31aa934bbe',
+            'info_dict': {
+                'id': 'trailer',
+                'ext': 'mp4',
+                'title': 'trailer',
+                'upload_date': '20100513',
+            }
+        },
+        # Direct link to media delivered compressed (until Accept-Encoding is *)
+        {
+            'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac',
+            'md5': '128c42e68b13950268b648275386fc74',
+            'info_dict': {
+                'id': 'FictionJunction-Parallel_Hearts',
+                'ext': 'flac',
+                'title': 'FictionJunction-Parallel_Hearts',
+                'upload_date': '20140522',
+            },
+            'expected_warnings': [
+                'URL could be a direct video link, returning it as such.'
+            ]
+        },
+        # Direct download with broken HEAD
+        {
+            'url': 'http://ai-radio.org:8000/radio.opus',
+            'info_dict': {
+                'id': 'radio',
+                'ext': 'opus',
+                'title': 'radio',
+            },
+            'params': {
+                'skip_download': True,  # infinite live stream
+            },
+            'expected_warnings': [
+                r'501.*Not Implemented'
+            ],
+        },
+        # Direct link with incorrect MIME type
+        {
+            'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
+            'md5': '4ccbebe5f36706d85221f204d7eb5913',
+            'info_dict': {
+                'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
+                'id': '5_Lennart_Poettering_-_Systemd',
+                'ext': 'webm',
+                'title': '5_Lennart_Poettering_-_Systemd',
+                'upload_date': '20141120',
+            },
+            'expected_warnings': [
+                'URL could be a direct video link, returning it as such.'
+            ]
+        },
+        # RSS feed
+        {
+            'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
+            'info_dict': {
+                'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
+                'title': 'Zero Punctuation',
+                'description': 're:.*groundbreaking video review series.*'
+            },
+            'playlist_mincount': 11,
+        },
+        # RSS feed with enclosure
+        {
+            'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
+            'info_dict': {
+                'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
+                'ext': 'm4v',
+                'upload_date': '20150228',
+                'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
+            }
+        },
+        # google redirect
+        {
+            'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
+            'info_dict': {
+                'id': 'cmQHVoWB5FY',
+                'ext': 'mp4',
+                'upload_date': '20130224',
+                'uploader_id': 'TheVerge',
+                'description': 're:^Chris Ziegler takes a look at the\.*',
+                'uploader': 'The Verge',
+                'title': 'First Firefox OS phones side-by-side',
+            },
+            'params': {
+                'skip_download': False,
+            }
+        },
         {
             'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
             'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
@@ -125,17 +220,6 @@ class GenericIE(InfoExtractor):
                 'skip_download': True,  # m3u8 download
             },
         },
-        # Direct link to a video
-        {
-            'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
-            'md5': '67d406c2bcb6af27fa886f31aa934bbe',
-            'info_dict': {
-                'id': 'trailer',
-                'ext': 'mp4',
-                'title': 'trailer',
-                'upload_date': '20100513',
-            }
-        },
         # ooyala video
         {
             'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
@@ -160,22 +244,6 @@ class GenericIE(InfoExtractor):
             },
             'add_ie': ['Ooyala'],
         },
-        # google redirect
-        {
-            'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
-            'info_dict': {
-                'id': 'cmQHVoWB5FY',
-                'ext': 'mp4',
-                'upload_date': '20130224',
-                'uploader_id': 'TheVerge',
-                'description': 're:^Chris Ziegler takes a look at the\.*',
-                'uploader': 'The Verge',
-                'title': 'First Firefox OS phones side-by-side',
-            },
-            'params': {
-                'skip_download': False,
-            }
-        },
         # embed.ly video
         {
             'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
@@ -225,6 +293,15 @@ class GenericIE(InfoExtractor):
                 'skip_download': True,
             },
         },
+        # TVC embed
+        {
+            'url': 'http://sch1298sz.mskobr.ru/dou_edu/karamel_ki/filial_galleries/video/iframe_src_http_tvc_ru_video_iframe_id_55304_isplay_false_acc_video_id_channel_brand_id_11_show_episodes_episode_id_32307_frameb/',
+            'info_dict': {
+                'id': '55304',
+                'ext': 'mp4',
+                'title': 'Дошкольное воспитание',
+            },
+        },
         # SportBox embed
         {
             'url': 'http://www.vestifinance.ru/articles/25753',
@@ -407,16 +484,6 @@ class GenericIE(InfoExtractor):
                 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
             }
         },
-        # RSS feed
-        {
-            'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
-            'info_dict': {
-                'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
-                'title': 'Zero Punctuation',
-                'description': 're:.*groundbreaking video review series.*'
-            },
-            'playlist_mincount': 11,
-        },
         # Multiple brightcove videos
         # https://github.com/rg3/youtube-dl/issues/2283
         {
@@ -470,21 +537,6 @@ class GenericIE(InfoExtractor):
                 'uploader': 'thoughtworks.wistia.com',
             },
         },
-        # Direct download with broken HEAD
-        {
-            'url': 'http://ai-radio.org:8000/radio.opus',
-            'info_dict': {
-                'id': 'radio',
-                'ext': 'opus',
-                'title': 'radio',
-            },
-            'params': {
-                'skip_download': True,  # infinite live stream
-            },
-            'expected_warnings': [
-                r'501.*Not Implemented'
-            ],
-        },
         # Soundcloud embed
         {
             'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
@@ -516,21 +568,6 @@ class GenericIE(InfoExtractor):
             },
             'playlist_mincount': 2,
         },
-        # Direct link with incorrect MIME type
-        {
-            'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
-            'md5': '4ccbebe5f36706d85221f204d7eb5913',
-            'info_dict': {
-                'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
-                'id': '5_Lennart_Poettering_-_Systemd',
-                'ext': 'webm',
-                'title': '5_Lennart_Poettering_-_Systemd',
-                'upload_date': '20141120',
-            },
-            'expected_warnings': [
-                'URL could be a direct video link, returning it as such.'
-            ]
-        },
         # Cinchcast embed
         {
             'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
@@ -689,16 +726,6 @@ class GenericIE(InfoExtractor):
                 'age_limit': 0,
             },
         },
-        # RSS feed with enclosure
-        {
-            'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
-            'info_dict': {
-                'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
-                'ext': 'm4v',
-                'upload_date': '20150228',
-                'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
-            }
-        },
         # Crooks and Liars embed
         {
             'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists',
@@ -773,6 +800,18 @@ class GenericIE(InfoExtractor):
                 # rtmpe downloads
                 'skip_download': True,
             }
+        },
+        # Brightcove URL in single quotes
+        {
+            'url': 'http://www.sportsnet.ca/baseball/mlb/sn-presents-russell-martin-world-citizen/',
+            'md5': '4ae374f1f8b91c889c4b9203c8c752af',
+            'info_dict': {
+                'id': '4255764656001',
+                'ext': 'mp4',
+                'title': 'SN Presents: Russell Martin, World Citizen',
+                'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.',
+                'uploader': 'Rogers Sportsnet',
+            },
         }
     ]
 
@@ -894,7 +933,7 @@ class GenericIE(InfoExtractor):
             force_videoid = smuggled_data['force_videoid']
             video_id = force_videoid
         else:
-            video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]
+            video_id = compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
 
         self.to_screen('%s: Requesting header' % video_id)
 
@@ -916,7 +955,9 @@ class GenericIE(InfoExtractor):
 
         full_response = None
         if head_response is False:
-            full_response = self._request_webpage(url, video_id)
+            request = compat_urllib_request.Request(url)
+            request.add_header('Accept-Encoding', '*')
+            full_response = self._request_webpage(request, video_id)
             head_response = full_response
 
         # Check for direct link to a video
@@ -927,7 +968,7 @@ class GenericIE(InfoExtractor):
                 head_response.headers.get('Last-Modified'))
             return {
                 'id': video_id,
-                'title': os.path.splitext(url_basename(url))[0],
+                'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
                 'direct': True,
                 'formats': [{
                     'format_id': m.group('format_id'),
@@ -941,7 +982,17 @@ class GenericIE(InfoExtractor):
             self._downloader.report_warning('Falling back on generic information extractor.')
 
         if not full_response:
-            full_response = self._request_webpage(url, video_id)
+            request = compat_urllib_request.Request(url)
+            # Some web servers may serve compressed content of rather big size (e.g. gzipped flac)
+            # making it impossible to download just a chunk of the file (we only need 512kB to
+            # test whether it's HTML or not). With youtube-dl's default Accept-Encoding this
+            # would always result in downloading the whole file, which is not desirable.
+            # Therefore, for the extraction pass we override Accept-Encoding to any encoding
+            # in order to accept raw bytes and be able to download only a chunk.
+            # It would probably be better to solve this by checking Content-Type for
+            # application/octet-stream after the HEAD request finishes, but it is not clear
+            # whether we can rely on that.
+            request.add_header('Accept-Encoding', '*')
+            full_response = self._request_webpage(request, video_id)
 
         # Maybe it's a direct link to a video?
         # Be careful not to download the whole thing!
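The Accept-Encoding override above can be illustrated in isolation. The following is a minimal Python 3 sketch (plain urllib.request instead of youtube-dl's compat wrappers; peek_at_url is a hypothetical helper, not part of the codebase): by allowing any encoding the server may answer with raw bytes, so reading only the first 512 kB is enough to guess whether the response is HTML.

import urllib.request

def peek_at_url(url, num_bytes=512 * 1024):
    # Allow any encoding so the server can reply with uncompressed bytes;
    # with a gzipped response we could not simply stop after num_bytes.
    request = urllib.request.Request(url, headers={'Accept-Encoding': '*'})
    with urllib.request.urlopen(request) as response:
        head = response.read(num_bytes)  # only a chunk, not the whole file
    # Crude check: does the beginning of the payload look like HTML?
    return b'<html' in head[:4096].lower()
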
@@ -953,7 +1004,7 @@ class GenericIE(InfoExtractor):
                 head_response.headers.get('Last-Modified'))
             return {
                 'id': video_id,
-                'title': os.path.splitext(url_basename(url))[0],
+                'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
                 'direct': True,
                 'url': url,
                 'upload_date': upload_date,
@@ -1033,7 +1084,7 @@ class GenericIE(InfoExtractor):
 
         # Look for embedded rtl.nl player
         matches = re.findall(
-            r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+video_embed[^"]+)"',
+            r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"',
             webpage)
         if matches:
             return _playlist_from_matches(matches, ie='RtlNl')
@@ -1261,11 +1312,27 @@ class GenericIE(InfoExtractor):
         if rutv_url:
             return self.url_result(rutv_url, 'RUTV')
 
+        # Look for embedded TVC player
+        tvc_url = TVCIE._extract_url(webpage)
+        if tvc_url:
+            return self.url_result(tvc_url, 'TVC')
+
         # Look for embedded SportBox player
         sportbox_urls = SportBoxEmbedIE._extract_urls(webpage)
         if sportbox_urls:
             return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed')
 
+        # Look for embedded PornHub player
+        pornhub_url = PornHubIE._extract_url(webpage)
+        if pornhub_url:
+            return self.url_result(pornhub_url, 'PornHub')
+
+        # Look for embedded Tvigle player
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'Tvigle')
+
         # Look for embedded TED player
         mobj = re.search(
             r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
index fe5d95e2c9cad488f342233e7ebfd52e42a86de3..d692ea79ab493174038c9649445e6a592a86687c 100644 (file)
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..compat import compat_urlparse
 from ..utils import (
     int_or_none,
     js_to_json,
@@ -12,7 +13,7 @@ from ..utils import (
 
 
 class ImgurIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?P<id>[a-zA-Z0-9]+)(?:\.mp4|\.gifv)?'
+    _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?P<id>[a-zA-Z0-9]+)'
 
     _TESTS = [{
         'url': 'https://i.imgur.com/A61SaA1.gifv',
@@ -34,7 +35,8 @@ class ImgurIE(InfoExtractor):
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
+        webpage = self._download_webpage(
+            compat_urlparse.urljoin(url, video_id), video_id)
 
         width = int_or_none(self._search_regex(
             r'<param name="width" value="([0-9]+)"',
index b107557880345157fcf66cb7e6e1b4a92b25af2e..3d78f78c46d1ad004339bc33ebcb09d1286e5092 100644 (file)
@@ -3,7 +3,10 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..utils import int_or_none
+from ..utils import (
+    int_or_none,
+    limit_length,
+)
 
 
 class InstagramIE(InfoExtractor):
@@ -100,11 +103,13 @@ class InstagramUserIE(InfoExtractor):
                 thumbnails_el = it.get('images', {})
                 thumbnail = thumbnails_el.get('thumbnail', {}).get('url')
 
-                title = it.get('caption', {}).get('text', it['id'])
+                # In some cases the caption is null (None in Python), so
+                # it.get('caption', {}) returns None rather than the default {}
+                title = (it.get('caption') or {}).get('text', it['id'])
 
                 entries.append({
                     'id': it['id'],
-                    'title': title,
+                    'title': limit_length(title, 80),
                     'formats': formats,
                     'thumbnail': thumbnail,
                     'webpage_url': it.get('link'),
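The caption handling above relies on a subtle property of dict.get: the default only applies when the key is missing, not when the stored value is None. A tiny sketch with made-up item data (not an actual Instagram API response):

item = {'id': '42', 'caption': None}

item.get('caption', {})
# -> None: the key exists, so the {} default is never used
(item.get('caption') or {}).get('text', item['id'])
# -> '42': the `or {}` fallback avoids calling .get() on None
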
index 8529bedfc0ab283790e74144bc9d570df19dc4b3..821c8ec109236b787b9afa2985e450ff8a647595 100644 (file)
@@ -11,11 +11,12 @@ from ..compat import (
 )
 from ..utils import (
     ExtractorError,
+    remove_end,
 )
 
 
 class IPrimaIE(InfoExtractor):
-    _VALID_URL = r'https?://play\.iprima\.cz/[^?#]+/(?P<id>[^?#]+)'
+    _VALID_URL = r'https?://play\.iprima\.cz/(?:[^/]+/)*(?P<id>[^?#]+)'
 
     _TESTS = [{
         'url': 'http://play.iprima.cz/particka/particka-92',
@@ -23,7 +24,7 @@ class IPrimaIE(InfoExtractor):
             'id': '39152',
             'ext': 'flv',
             'title': 'Partička (92)',
-            'description': 'md5:3740fda51464da35a2d4d0670b8e4fd6',
+            'description': 'md5:74e9617e51bca67c3ecfb2c6f9766f45',
             'thumbnail': 'http://play.iprima.cz/sites/default/files/image_crops/image_620x349/3/491483_particka-92_image_620x349.jpg',
         },
         'params': {
@@ -35,13 +36,14 @@ class IPrimaIE(InfoExtractor):
             'id': '9718337',
             'ext': 'flv',
             'title': 'Tchibo Partička - Jarní móda',
-            'description': 'md5:589f8f59f414220621ff8882eb3ce7be',
             'thumbnail': 're:^http:.*\.jpg$',
         },
         'params': {
             'skip_download': True,  # requires rtmpdump
         },
-        'skip': 'Do not have permission to access this page',
+    }, {
+        'url': 'http://play.iprima.cz/zpravy-ftv-prima-2752015',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
@@ -102,8 +104,10 @@ class IPrimaIE(InfoExtractor):
 
         return {
             'id': real_id,
-            'title': self._og_search_title(webpage),
+            'title': remove_end(self._og_search_title(webpage), ' | Prima PLAY'),
             'thumbnail': self._og_search_thumbnail(webpage),
             'formats': formats,
-            'description': self._og_search_description(webpage),
+            'description': self._search_regex(
+                r'<p[^>]+itemprop="description"[^>]*>([^<]+)',
+                webpage, 'description', default=None),
         }
diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py
new file mode 100644 (file)
index 0000000..9106dd0
--- /dev/null
@@ -0,0 +1,296 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import hashlib
+import math
+import os.path
+import random
+import re
+import time
+import uuid
+import zlib
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse
+from ..utils import (
+    ExtractorError,
+    url_basename,
+)
+
+
+class IqiyiIE(InfoExtractor):
+    IE_NAME = 'iqiyi'
+
+    _VALID_URL = r'http://(?:www\.)?iqiyi\.com/v_.+?\.html'
+
+    _TESTS = [{
+        'url': 'http://www.iqiyi.com/v_19rrojlavg.html',
+        'md5': '2cb594dc2781e6c941a110d8f358118b',
+        'info_dict': {
+            'id': '9c1fb1b99d192b21c559e5a1a2cb3c73',
+            'title': '美国德州空中惊现奇异云团 酷似UFO',
+            'ext': 'f4v',
+        }
+    }, {
+        'url': 'http://www.iqiyi.com/v_19rrhnnclk.html',
+        'info_dict': {
+            'id': 'e3f585b550a280af23c98b6cb2be19fb',
+            'title': '名侦探柯南第752集',
+        },
+        'playlist': [{
+            'md5': '7e49376fecaffa115d951634917fe105',
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part1',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'md5': '41b75ba13bb7ac0e411131f92bc4f6ca',
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part2',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'md5': '0cee1dd0a3d46a83e71e2badeae2aab0',
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part3',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'md5': '4f8ad72373b0c491b582e7c196b0b1f9',
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part4',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'md5': 'd89ad028bcfad282918e8098e811711d',
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part5',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'md5': '9cb1e5c95da25dff0660c32ae50903b7',
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part6',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'md5': '155116e0ff1867bbc9b98df294faabc9',
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part7',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'md5': '53f5db77622ae14fa493ed2a278a082b',
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part8',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }],
+    }]
+
+    _FORMATS_MAP = [
+        ('1', 'h6'),
+        ('2', 'h5'),
+        ('3', 'h4'),
+        ('4', 'h3'),
+        ('5', 'h2'),
+        ('10', 'h1'),
+    ]
+
+    def construct_video_urls(self, data, video_id, _uuid):
+        def do_xor(x, y):
+            a = y % 3
+            if a == 1:
+                return x ^ 121
+            if a == 2:
+                return x ^ 72
+            return x ^ 103
+
+        def get_encode_code(l):
+            a = 0
+            b = l.split('-')
+            c = len(b)
+            s = ''
+            for i in range(c - 1, -1, -1):
+                a = do_xor(int(b[c - i - 1], 16), i)
+                s += chr(a)
+            return s[::-1]
+
+        def get_path_key(x, format_id, segment_index):
+            mg = ')(*&^flash@#$%a'
+            tm = self._download_json(
+                'http://data.video.qiyi.com/t?tn=' + str(random.random()), video_id,
+                note='Downloading path key of segment %d for format %s' % (segment_index + 1, format_id)
+            )['t']
+            t = str(int(math.floor(int(tm) / (600.0))))
+            return hashlib.md5((t + mg + x).encode('utf8')).hexdigest()
+
+        video_urls_dict = {}
+        for format_item in data['vp']['tkl'][0]['vs']:
+            if 0 < int(format_item['bid']) <= 10:
+                format_id = self.get_format(format_item['bid'])
+            else:
+                continue
+
+            video_urls = []
+
+            video_urls_info = format_item['fs']
+            if not format_item['fs'][0]['l'].startswith('/'):
+                t = get_encode_code(format_item['fs'][0]['l'])
+                if t.endswith('mp4'):
+                    video_urls_info = format_item['flvs']
+
+            for segment_index, segment in enumerate(video_urls_info):
+                vl = segment['l']
+                if not vl.startswith('/'):
+                    vl = get_encode_code(vl)
+                key = get_path_key(
+                    vl.split('/')[-1].split('.')[0], format_id, segment_index)
+                filesize = segment['b']
+                base_url = data['vp']['du'].split('/')
+                base_url.insert(-1, key)
+                base_url = '/'.join(base_url)
+                param = {
+                    'su': _uuid,
+                    'qyid': uuid.uuid4().hex,
+                    'client': '',
+                    'z': '',
+                    'bt': '',
+                    'ct': '',
+                    'tn': str(int(time.time()))
+                }
+                api_video_url = base_url + vl + '?' + \
+                    compat_urllib_parse.urlencode(param)
+                js = self._download_json(
+                    api_video_url, video_id,
+                    note='Downloading video info of segment %d for format %s' % (segment_index + 1, format_id))
+                video_url = js['l']
+                video_urls.append(
+                    (video_url, filesize))
+
+            video_urls_dict[format_id] = video_urls
+        return video_urls_dict
+
+    def get_format(self, bid):
+        matched_format_ids = [_format_id for _bid, _format_id in self._FORMATS_MAP if _bid == str(bid)]
+        return matched_format_ids[0] if len(matched_format_ids) else None
+
+    def get_bid(self, format_id):
+        matched_bids = [_bid for _bid, _format_id in self._FORMATS_MAP if _format_id == format_id]
+        return matched_bids[0] if len(matched_bids) else None
+
+    def get_raw_data(self, tvid, video_id, enc_key, _uuid):
+        tm = str(int(time.time()))
+        param = {
+            'key': 'fvip',
+            'src': hashlib.md5(b'youtube-dl').hexdigest(),
+            'tvId': tvid,
+            'vid': video_id,
+            'vinfo': 1,
+            'tm': tm,
+            'enc': hashlib.md5(
+                (enc_key + tm + tvid).encode('utf8')).hexdigest(),
+            'qyid': _uuid,
+            'tn': random.random(),
+            'um': 0,
+            'authkey': hashlib.md5(
+                (tm + tvid).encode('utf8')).hexdigest()
+        }
+
+        api_url = 'http://cache.video.qiyi.com/vms' + '?' + \
+            compat_urllib_parse.urlencode(param)
+        raw_data = self._download_json(api_url, video_id)
+        return raw_data
+
+    def get_enc_key(self, swf_url, video_id):
+        filename, _ = os.path.splitext(url_basename(swf_url))
+        enc_key_json = self._downloader.cache.load('iqiyi-enc-key', filename)
+        if enc_key_json is not None:
+            return enc_key_json[0]
+
+        req = self._request_webpage(
+            swf_url, video_id, note='Downloading SWF content')
+        cn = req.read()
+        cn = zlib.decompress(cn[8:])
+        pt = re.compile(b'MixerRemote\x08(?P<enc_key>.+?)\$&vv')
+        enc_key = self._search_regex(pt, cn, 'enc_key').decode('utf8')
+
+        self._downloader.cache.store('iqiyi-enc-key', filename, [enc_key])
+
+        return enc_key
+
+    def _real_extract(self, url):
+        webpage = self._download_webpage(
+            url, 'temp_id', note='Downloading video page')
+        tvid = self._search_regex(
+            r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid')
+        video_id = self._search_regex(
+            r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id')
+        swf_url = self._search_regex(
+            r'(http://[^\'"]+MainPlayer[^.]+\.swf)', webpage, 'swf player URL')
+        _uuid = uuid.uuid4().hex
+
+        enc_key = self.get_enc_key(swf_url, video_id)
+
+        raw_data = self.get_raw_data(tvid, video_id, enc_key, _uuid)
+
+        if raw_data['code'] != 'A000000':
+            raise ExtractorError('Unable to load data. Error code: ' + raw_data['code'])
+
+        if not raw_data['data']['vp']['tkl']:
+            raise ExtractorError('iQiyi VIP videos are not supported')
+
+        data = raw_data['data']
+
+        title = data['vi']['vn']
+
+        # generate video_urls_dict
+        video_urls_dict = self.construct_video_urls(
+            data, video_id, _uuid)
+
+        # construct info
+        entries = []
+        for format_id in video_urls_dict:
+            video_urls = video_urls_dict[format_id]
+            for i, video_url_info in enumerate(video_urls):
+                if len(entries) < i + 1:
+                    entries.append({'formats': []})
+                entries[i]['formats'].append(
+                    {
+                        'url': video_url_info[0],
+                        'filesize': video_url_info[-1],
+                        'format_id': format_id,
+                        'preference': int(self.get_bid(format_id))
+                    }
+                )
+
+        for i in range(len(entries)):
+            self._sort_formats(entries[i]['formats'])
+            entries[i].update(
+                {
+                    'id': '%s_part%d' % (video_id, i + 1),
+                    'title': title,
+                }
+            )
+
+        if len(entries) > 1:
+            info = {
+                '_type': 'multi_video',
+                'id': video_id,
+                'title': title,
+                'entries': entries,
+            }
+        else:
+            info = entries[0]
+            info['id'] = video_id
+            info['title'] = title
+
+        return info
index 99a1361f844c15520c842cd9fffa1e5c2e9b6974..bc226fa67c064b991674a510b1eba54d40dc67e0 100644 (file)
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
 from ..utils import (
     determine_ext,
     float_or_none,
@@ -30,7 +31,7 @@ class IzleseneIE(InfoExtractor):
                 'description': 'md5:253753e2655dde93f59f74b572454f6d',
                 'thumbnail': 're:^http://.*\.jpg',
                 'uploader_id': 'pelikzzle',
-                'timestamp': 1404302298,
+                'timestamp': int,
                 'upload_date': '20140702',
                 'duration': 95.395,
                 'age_limit': 0,
@@ -46,7 +47,7 @@ class IzleseneIE(InfoExtractor):
                 'description': 'Tarkan Dortmund 2006 Konseri',
                 'thumbnail': 're:^http://.*\.jpg',
                 'uploader_id': 'parlayankiz',
-                'timestamp': 1163322193,
+                'timestamp': int,
                 'upload_date': '20061112',
                 'duration': 253.666,
                 'age_limit': 0,
@@ -67,9 +68,9 @@ class IzleseneIE(InfoExtractor):
 
         uploader = self._html_search_regex(
             r"adduserUsername\s*=\s*'([^']+)';",
-            webpage, 'uploader', fatal=False, default='')
+            webpage, 'uploader', fatal=False)
         timestamp = parse_iso8601(self._html_search_meta(
-            'uploadDate', webpage, 'upload date', fatal=False))
+            'uploadDate', webpage, 'upload date'))
 
         duration = float_or_none(self._html_search_regex(
             r'"videoduration"\s*:\s*"([^"]+)"',
@@ -86,8 +87,7 @@ class IzleseneIE(InfoExtractor):
 
         # Might be empty for some videos.
         streams = self._html_search_regex(
-            r'"qualitylevel"\s*:\s*"([^"]+)"',
-            webpage, 'streams', fatal=False, default='')
+            r'"qualitylevel"\s*:\s*"([^"]+)"', webpage, 'streams', default='')
 
         formats = []
         if streams:
@@ -95,15 +95,15 @@ class IzleseneIE(InfoExtractor):
                 quality, url = re.search(r'\[(\w+)\](.+)', stream).groups()
                 formats.append({
                     'format_id': '%sp' % quality if quality else 'sd',
-                    'url': url,
+                    'url': compat_urllib_parse_unquote(url),
                     'ext': ext,
                 })
         else:
             stream_url = self._search_regex(
-                r'"streamurl"\s?:\s?"([^"]+)"', webpage, 'stream URL')
+                r'"streamurl"\s*:\s*"([^"]+)"', webpage, 'stream URL')
             formats.append({
                 'format_id': 'sd',
-                'url': stream_url,
+                'url': compat_urllib_parse_unquote(stream_url),
                 'ext': ext,
             })
 
diff --git a/youtube_dl/extractor/karrierevideos.py b/youtube_dl/extractor/karrierevideos.py
new file mode 100644 (file)
index 0000000..bed94bc
--- /dev/null
@@ -0,0 +1,96 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+    fix_xml_ampersands,
+    float_or_none,
+    xpath_with_ns,
+    xpath_text,
+)
+
+
+class KarriereVideosIE(InfoExtractor):
+    _VALID_URL = r'http://(?:www\.)?karrierevideos\.at(?:/[^/]+)+/(?P<id>[^/]+)'
+    _TESTS = [{
+        'url': 'http://www.karrierevideos.at/berufsvideos/mittlere-hoehere-schulen/altenpflegerin',
+        'info_dict': {
+            'id': '32c91',
+            'ext': 'flv',
+            'title': 'AltenpflegerIn',
+            'description': 'md5:dbadd1259fde2159a9b28667cb664ae2',
+            'thumbnail': 're:^http://.*\.png',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }, {
+        # broken ampersands
+        'url': 'http://www.karrierevideos.at/orientierung/vaeterkarenz-und-neue-chancen-fuer-muetter-baby-was-nun',
+        'info_dict': {
+            'id': '5sniu',
+            'ext': 'flv',
+            'title': 'Väterkarenz und neue Chancen für Mütter - "Baby - was nun?"',
+            'description': 'md5:97092c6ad1fd7d38e9d6a5fdeb2bcc33',
+            'thumbnail': 're:^http://.*\.png',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        title = (self._html_search_meta('title', webpage, default=None) or
+                 self._search_regex(r'<h1 class="title">([^<]+)</h1>', webpage, 'title'))
+
+        video_id = self._search_regex(
+            r'/config/video/(.+?)\.xml', webpage, 'video id')
+        playlist = self._download_xml(
+            'http://www.karrierevideos.at/player-playlist.xml.php?p=%s' % video_id,
+            video_id, transform_source=fix_xml_ampersands)
+
+        NS_MAP = {
+            'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats'
+        }
+
+        def ns(path):
+            return xpath_with_ns(path, NS_MAP)
+
+        item = playlist.find('./tracklist/item')
+        video_file = xpath_text(
+            item, ns('./jwplayer:file'), 'video url', fatal=True)
+        streamer = xpath_text(
+            item, ns('./jwplayer:streamer'), 'streamer', fatal=True)
+
+        uploader = xpath_text(
+            item, ns('./jwplayer:author'), 'uploader')
+        duration = float_or_none(
+            xpath_text(item, ns('./jwplayer:duration'), 'duration'))
+
+        description = self._html_search_regex(
+            r'(?s)<div class="leadtext">(.+?)</div>',
+            webpage, 'description')
+
+        thumbnail = self._html_search_meta(
+            'thumbnail', webpage, 'thumbnail')
+        if thumbnail:
+            thumbnail = compat_urlparse.urljoin(url, thumbnail)
+
+        return {
+            'id': video_id,
+            'url': streamer.replace('rtmpt', 'rtmp'),
+            'play_path': 'mp4:%s' % video_file,
+            'ext': 'flv',
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'duration': duration,
+        }
index 7d4b57056509383fdc082a68c1650f38dc258763..1d391e69ff7e0aba1b78ae5e32792b2dca839943 100644 (file)
@@ -28,6 +28,14 @@ class KickStarterIE(InfoExtractor):
             'uploader': 'Pebble Technology',
             'title': 'Pebble iOS Notifications',
         }
+    }, {
+        'url': 'https://www.kickstarter.com/projects/1420158244/power-drive-2000/widget/video.html',
+        'info_dict': {
+            'id': '1420158244',
+            'ext': 'mp4',
+            'title': 'Power Drive 2000',
+        },
+        'expected_warnings': ['OpenGraph description'],
     }]
 
     def _real_extract(self, url):
@@ -48,10 +56,15 @@ class KickStarterIE(InfoExtractor):
                 'title': title,
             }
 
+        thumbnail = self._og_search_thumbnail(webpage, default=None)
+        if thumbnail is None:
+            thumbnail = self._html_search_regex(
+                r'<img[^>]+class="[^"]+\s*poster\s*[^"]+"[^>]+src="([^"]+)"',
+                webpage, 'thumbnail image', fatal=False)
         return {
             'id': video_id,
             'url': video_url,
             'title': title,
             'description': self._og_search_description(webpage),
-            'thumbnail': self._og_search_thumbnail(webpage),
+            'thumbnail': thumbnail,
         }
index 1484ac0d267697dceb34c9e406e3a26b26a37f54..da896caf160f6f0a0ae49167ae546cd9da4d45fa 100644 (file)
@@ -50,9 +50,7 @@ class LetvIE(InfoExtractor):
             'title': '与龙共舞 完整版',
             'description': 'md5:7506a5eeb1722bb9d4068f85024e3986',
         },
-        'params': {
-            'cn_verification_proxy': 'http://proxy.uku.im:8888'
-        },
+        'skip': 'Only available in China',
     }]
 
     @staticmethod
index d8897eb90d526b7b7d2e5a5ace5bec84ebb40031..7091f3335e8223ea0a089ba7cff127a983b12d7d 100644 (file)
@@ -20,7 +20,6 @@ class MiTeleIE(InfoExtractor):
 
     _TESTS = [{
         'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/',
-        'md5': '6a75fe9d0d3275bead0cb683c616fddb',
         'info_dict': {
             'id': '0fce117d',
             'ext': 'mp4',
@@ -29,6 +28,10 @@ class MiTeleIE(InfoExtractor):
             'display_id': 'programa-144',
             'duration': 2913,
         },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
     }]
 
     def _real_extract(self, url):
@@ -56,12 +59,14 @@ class MiTeleIE(InfoExtractor):
             episode,
             transform_source=strip_jsonp
         )
+        formats = self._extract_m3u8_formats(
+            token_info['tokenizedUrl'], episode, ext='mp4')
 
         return {
             'id': embed_data['videoId'],
             'display_id': episode,
             'title': info_el.find('title').text,
-            'url': token_info['tokenizedUrl'],
+            'formats': formats,
             'description': get_element_by_attribute('class', 'text', webpage),
             'thumbnail': info_el.find('thumb').text,
             'duration': parse_duration(info_el.find('duration').text),
index c10405f04d3cc1b3e89004029b7502112e9baa29..925967753bd12816005b5ed8929f0438e9ec0214 100644 (file)
@@ -6,6 +6,7 @@ import re
 from .common import InfoExtractor
 from ..compat import (
     compat_urllib_parse,
+    compat_urlparse,
 )
 from ..utils import (
     ExtractorError,
@@ -16,7 +17,7 @@ from ..utils import (
 class NaverIE(InfoExtractor):
     _VALID_URL = r'https?://(?:m\.)?tvcast\.naver\.com/v/(?P<id>\d+)'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://tvcast.naver.com/v/81652',
         'info_dict': {
             'id': '81652',
@@ -25,7 +26,18 @@ class NaverIE(InfoExtractor):
             'description': '합격불변의 법칙 메가스터디 | 메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.',
             'upload_date': '20130903',
         },
-    }
+    }, {
+        'url': 'http://tvcast.naver.com/v/395837',
+        'md5': '638ed4c12012c458fefcddfd01f173cd',
+        'info_dict': {
+            'id': '395837',
+            'ext': 'mp4',
+            'title': '9년이 지나도 아픈 기억, 전효성의 아버지',
+            'description': 'md5:5bf200dcbf4b66eb1b350d1eb9c753f7',
+            'upload_date': '20150519',
+        },
+        'skip': 'Georestricted',
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -35,7 +47,7 @@ class NaverIE(InfoExtractor):
                          webpage)
         if m_id is None:
             m_error = re.search(
-                r'(?s)<div class="nation_error">\s*(?:<!--.*?-->)?\s*<p class="[^"]+">(?P<msg>.+?)</p>\s*</div>',
+                r'(?s)<div class="(?:nation_error|nation_box)">\s*(?:<!--.*?-->)?\s*<p class="[^"]+">(?P<msg>.+?)</p>\s*</div>',
                 webpage)
             if m_error:
                 raise ExtractorError(clean_html(m_error.group('msg')), expected=True)
@@ -58,14 +70,18 @@ class NaverIE(InfoExtractor):
         formats = []
         for format_el in urls.findall('EncodingOptions/EncodingOption'):
             domain = format_el.find('Domain').text
+            uri = format_el.find('uri').text
             f = {
-                'url': domain + format_el.find('uri').text,
+                'url': compat_urlparse.urljoin(domain, uri),
                 'ext': 'mp4',
                 'width': int(format_el.find('width').text),
                 'height': int(format_el.find('height').text),
             }
             if domain.startswith('rtmp'):
+                # urlparse does not support custom schemes
+                # https://bugs.python.org/issue18828
                 f.update({
+                    'url': domain + uri,
                     'ext': 'flv',
                     'rtmp_protocol': '1',  # rtmpt
                 })
index 862b706bf96719aa071f1f89c73f2a4ef45a20b1..944096e1ca15de964fcdf896adf988c9aa2264bd 100644 (file)
@@ -22,6 +22,18 @@ class NBAIE(InfoExtractor):
     }, {
         'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/',
         'only_matching': True,
+    }, {
+        'url': 'http://watch.nba.com/nba/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba',
+        'info_dict': {
+            'id': '0041400301-cle-atl-recap.nba',
+            'ext': 'mp4',
+            'title': 'NBA GAME TIME | Video: Hawks vs. Cavaliers Game 1',
+            'description': 'md5:8094c3498d35a9bd6b1a8c396a071b4d',
+            'duration': 228,
+        },
+        'params': {
+            'skip_download': True,
+        }
     }]
 
     def _real_extract(self, url):
@@ -35,8 +47,12 @@ class NBAIE(InfoExtractor):
             self._og_search_title(webpage, default=shortened_video_id), ' : NBA.com')
 
         description = self._og_search_description(webpage)
-        duration = parse_duration(
-            self._html_search_meta('duration', webpage, 'duration'))
+        duration_str = self._html_search_meta(
+            'duration', webpage, 'duration', default=None)
+        if not duration_str:
+            duration_str = self._html_search_regex(
+                r'Duration:</b>\s*(\d+:\d+)', webpage, 'duration', fatal=False)
+        duration = parse_duration(duration_str)
 
         return {
             'id': shortened_video_id,
index 02dba4ef639e64deff790f94bd5cd0cd6da2a4cb..d1b7cff4cfbf30c76c52ae98ad247e8907be6abd 100644 (file)
@@ -89,8 +89,8 @@ class NextMediaActionNewsIE(NextMediaIE):
         return self._extract_from_nextmedia_page(news_id, url, article_page)
 
 
-class AppleDailyRealtimeNewsIE(NextMediaIE):
-    _VALID_URL = r'http://(www|ent).appledaily.com.tw/(realtimenews|enews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?'
+class AppleDailyIE(NextMediaIE):
+    _VALID_URL = r'http://(www|ent).appledaily.com.tw/(?:animation|appledaily|enews|realtimenews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?'
     _TESTS = [{
         'url': 'http://ent.appledaily.com.tw/enews/article/entertainment/20150128/36354694',
         'md5': 'a843ab23d150977cc55ef94f1e2c1e4d',
@@ -99,7 +99,7 @@ class AppleDailyRealtimeNewsIE(NextMediaIE):
             'ext': 'mp4',
             'title': '周亭羽走過摩鐵陰霾2男陪吃 九把刀孤寒看醫生',
             'thumbnail': 're:^https?://.*\.jpg$',
-            'description': 'md5:b23787119933404ce515c6356a8c355c',
+            'description': 'md5:2acd430e59956dc47cd7f67cb3c003f4',
             'upload_date': '20150128',
         }
     }, {
@@ -110,26 +110,10 @@ class AppleDailyRealtimeNewsIE(NextMediaIE):
             'ext': 'mp4',
             'title': '不滿被踩腳 山東兩大媽一路打下車',
             'thumbnail': 're:^https?://.*\.jpg$',
-            'description': 'md5:2648aaf6fc4f401f6de35a91d111aa1d',
+            'description': 'md5:175b4260c1d7c085993474217e4ab1b4',
             'upload_date': '20150128',
         }
-    }]
-
-    _URL_PATTERN = r'\{url: \'(.+)\'\}'
-
-    def _fetch_title(self, page):
-        return self._html_search_regex(r'<h1 id="h1">([^<>]+)</h1>', page, 'news title')
-
-    def _fetch_thumbnail(self, page):
-        return self._html_search_regex(r"setInitialImage\(\'([^']+)'\)", page, 'video thumbnail', fatal=False)
-
-    def _fetch_timestamp(self, page):
-        return None
-
-
-class AppleDailyAnimationNewsIE(AppleDailyRealtimeNewsIE):
-    _VALID_URL = 'http://www.appledaily.com.tw/animation/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?'
-    _TESTS = [{
+    }, {
         'url': 'http://www.appledaily.com.tw/animation/realtimenews/new/20150128/5003671',
         'md5': '03df296d95dedc2d5886debbb80cb43f',
         'info_dict': {
@@ -154,10 +138,22 @@ class AppleDailyAnimationNewsIE(AppleDailyRealtimeNewsIE):
         'expected_warnings': [
             'video thumbnail',
         ]
+    }, {
+        'url': 'http://www.appledaily.com.tw/appledaily/article/supplement/20140417/35770334/',
+        'only_matching': True,
     }]
 
+    _URL_PATTERN = r'\{url: \'(.+)\'\}'
+
     def _fetch_title(self, page):
-        return self._html_search_meta('description', page, 'news title')
+        return (self._html_search_regex(r'<h1 id="h1">([^<>]+)</h1>', page, 'news title', default=None) or
+                self._html_search_meta('description', page, 'news title'))
+
+    def _fetch_thumbnail(self, page):
+        return self._html_search_regex(r"setInitialImage\(\'([^']+)'\)", page, 'video thumbnail', fatal=False)
+
+    def _fetch_timestamp(self, page):
+        return None
 
     def _fetch_description(self, page):
         return self._html_search_meta('description', page, 'news description')
index 2684dd250aa65e22903612f4a1780fc8f701296a..dc54634a58e440fc70ae9bcb3e7d5781981b2b1e 100644 (file)
@@ -19,7 +19,7 @@ class NFLIE(InfoExtractor):
     _VALID_URL = r'''(?x)https?://
         (?P<host>(?:www\.)?(?:nfl\.com|.*?\.clubs\.nfl\.com))/
         (?:.+?/)*
-        (?P<id>(?:\d[a-z]{2}\d{13}|\w{8}\-(?:\w{4}\-){3}\w{12}))'''
+        (?P<id>(?:[a-z0-9]{16}|\w{8}\-(?:\w{4}\-){3}\w{12}))'''
     _TESTS = [
         {
             'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights',
@@ -58,6 +58,10 @@ class NFLIE(InfoExtractor):
                 'upload_date': '20150202',
             },
         },
+        {
+            'url': 'http://www.nfl.com/videos/nfl-network-top-ten/09000d5d810a6bd4/Top-10-Gutsiest-Performances-Jack-Youngblood',
+            'only_matching': True,
+        }
     ]
 
     @staticmethod
index 664dc81d47ce7af613636022f4e540dffd67f8b6..5bbd2dcf66294f5f0e21b6aae000f9ddecd5c051 100644 (file)
@@ -166,6 +166,10 @@ class NocoIE(InfoExtractor):
         self._sort_formats(formats)
 
         timestamp = parse_iso8601(show.get('online_date_start_utc'), ' ')
+
+        if timestamp is not None and timestamp < 0:
+            timestamp = None
+
         uploader = show.get('partner_name')
         uploader_id = show.get('partner_key')
         duration = float_or_none(show.get('duration_ms'), 1000)
diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py
new file mode 100644 (file)
index 0000000..3f9c776
--- /dev/null
@@ -0,0 +1,179 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    unified_strdate,
+)
+
+
+class NovaIE(InfoExtractor):
+    IE_DESC = 'TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz'
+    _VALID_URL = 'http://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+?)(?:\.html|/|$)'
+    _TESTS = [{
+        'url': 'http://tvnoviny.nova.cz/clanek/novinky/co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou.html?utm_source=tvnoviny&utm_medium=cpfooter&utm_campaign=novaplus',
+        'info_dict': {
+            'id': '1608920',
+            'display_id': 'co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou',
+            'ext': 'flv',
+            'title': 'Duel: Michal Hrdlička a Petr Suchoň',
+            'description': 'md5:d0cc509858eee1b1374111c588c6f5d5',
+            'thumbnail': 're:^https?://.*\.(?:jpg)',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }, {
+        'url': 'http://tn.nova.cz/clanek/tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci.html#player_13260',
+        'md5': '1dd7b9d5ea27bc361f110cd855a19bd3',
+        'info_dict': {
+            'id': '1757139',
+            'display_id': 'tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci',
+            'ext': 'mp4',
+            'title': 'Podzemní nemocnice v pražské Krči',
+            'description': 'md5:f0a42dd239c26f61c28f19e62d20ef53',
+            'thumbnail': 're:^https?://.*\.(?:jpg)',
+        }
+    }, {
+        'url': 'http://novaplus.nova.cz/porad/policie-modrava/video/5591-policie-modrava-15-dil-blondynka-na-hrbitove',
+        'info_dict': {
+            'id': '1756825',
+            'display_id': '5591-policie-modrava-15-dil-blondynka-na-hrbitove',
+            'ext': 'flv',
+            'title': 'Policie Modrava - 15. díl - Blondýnka na hřbitově',
+            'description': 'md5:dc24e50be5908df83348e50d1431295e',  # Make sure this description is clean of HTML tags
+            'thumbnail': 're:^https?://.*\.(?:jpg)',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }, {
+        'url': 'http://novaplus.nova.cz/porad/televizni-noviny/video/5585-televizni-noviny-30-5-2015/',
+        'info_dict': {
+            'id': '1756858',
+            'ext': 'flv',
+            'title': 'Televizní noviny - 30. 5. 2015',
+            'thumbnail': 're:^https?://.*\.(?:jpg)',
+            'upload_date': '20150530',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }, {
+        'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html',
+        'info_dict': {
+            'id': '1753621',
+            'ext': 'mp4',
+            'title': 'Zaklínač 3: Divoký hon',
+            'description': 're:.*Pokud se stejně jako my nemůžete.*',
+            'thumbnail': 're:https?://.*\.jpg(\?.*)?',
+            'upload_date': '20150521',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }, {
+        'url': 'http://sport.tn.nova.cz/clanek/sport/hokej/nhl/zivot-jde-dal-hodnotil-po-vyrazeni-z-playoff-jiri-sekac.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://doma.nova.cz/clanek/zdravi/prijdte-se-zapsat-do-registru-kostni-drene-jiz-ve-stredu-3-cervna.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://prask.nova.cz/clanek/novinky/co-si-na-sobe-nase-hvezdy-nechaly-pojistit.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://tv.nova.cz/clanek/novinky/zivot-je-zivot-bondovsky-trailer.html',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('id')
+        site = mobj.group('site')
+
+        webpage = self._download_webpage(url, display_id)
+
+        video_id = self._search_regex(
+            [r"(?:media|video_id)\s*:\s*'(\d+)'",
+             r'media=(\d+)',
+             r'id="article_video_(\d+)"',
+             r'id="player_(\d+)"'],
+            webpage, 'video id')
+
+        config_url = self._search_regex(
+            r'src="(http://tn\.nova\.cz/bin/player/videojs/config\.php\?[^"]+)"',
+            webpage, 'config url', default=None)
+
+        if not config_url:
+            DEFAULT_SITE_ID = '23000'
+            SITES = {
+                'tvnoviny': DEFAULT_SITE_ID,
+                'novaplus': DEFAULT_SITE_ID,
+                'vymena': DEFAULT_SITE_ID,
+                'krasna': DEFAULT_SITE_ID,
+                'fanda': '30',
+                'tn': '30',
+                'doma': '30',
+            }
+
+            site_id = self._search_regex(
+                r'site=(\d+)', webpage, 'site id', default=None) or SITES.get(site, DEFAULT_SITE_ID)
+
+            config_url = ('http://tn.nova.cz/bin/player/videojs/config.php?site=%s&media=%s&jsVar=vjsconfig'
+                          % (site_id, video_id))
+
+        config = self._download_json(
+            config_url, display_id,
+            'Downloading config JSON',
+            transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1])
+
+        mediafile = config['mediafile']
+        video_url = mediafile['src']
+
+        m = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>[^/]+?))/&*(?P<playpath>.+)$', video_url)
+        if m:
+            formats = [{
+                'url': m.group('url'),
+                'app': m.group('app'),
+                'play_path': m.group('playpath'),
+                'player_path': 'http://tvnoviny.nova.cz/static/shared/app/videojs/video-js.swf',
+                'ext': 'flv',
+            }]
+        else:
+            formats = [{
+                'url': video_url,
+            }]
+        self._sort_formats(formats)
+
+        title = mediafile.get('meta', {}).get('title') or self._og_search_title(webpage)
+        description = clean_html(self._og_search_description(webpage, default=None))
+        thumbnail = config.get('poster')
+
+        if site == 'novaplus':
+            upload_date = unified_strdate(self._search_regex(
+                r'(\d{1,2}-\d{1,2}-\d{4})$', display_id, 'upload date', default=None))
+        elif site == 'fanda':
+            upload_date = unified_strdate(self._search_regex(
+                r'<span class="date_time">(\d{1,2}\.\d{1,2}\.\d{4})', webpage, 'upload date', default=None))
+        else:
+            upload_date = None
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'upload_date': upload_date,
+            'thumbnail': thumbnail,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py
new file mode 100644 (file)
index 0000000..173e46c
--- /dev/null
@@ -0,0 +1,192 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    parse_iso8601,
+    parse_duration,
+    remove_start,
+)
+
+
+class NowTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?nowtv\.de/(?P<station>rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/player'
+
+    _TESTS = [{
+        # rtl
+        'url': 'http://www.nowtv.de/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/player',
+        'info_dict': {
+            'id': '203519',
+            'display_id': 'bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit',
+            'ext': 'mp4',
+            'title': 'Die neuen Bauern und eine Hochzeit',
+            'description': 'md5:e234e1ed6d63cf06be5c070442612e7e',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'timestamp': 1432580700,
+            'upload_date': '20150525',
+            'duration': 2786,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # rtl2
+        'url': 'http://www.nowtv.de/rtl2/berlin-tag-nacht/berlin-tag-nacht-folge-934/player',
+        'info_dict': {
+            'id': '203481',
+            'display_id': 'berlin-tag-nacht/berlin-tag-nacht-folge-934',
+            'ext': 'mp4',
+            'title': 'Berlin - Tag & Nacht (Folge 934)',
+            'description': 'md5:c85e88c2e36c552dfe63433bc9506dd0',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'timestamp': 1432666800,
+            'upload_date': '20150526',
+            'duration': 2641,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # rtlnitro
+        'url': 'http://www.nowtv.de/rtlnitro/alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00/player',
+        'info_dict': {
+            'id': '165780',
+            'display_id': 'alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00',
+            'ext': 'mp4',
+            'title': 'Hals- und Beinbruch',
+            'description': 'md5:b50d248efffe244e6f56737f0911ca57',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'timestamp': 1432415400,
+            'upload_date': '20150523',
+            'duration': 2742,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # superrtl
+        'url': 'http://www.nowtv.de/superrtl/medicopter-117/angst/player',
+        'info_dict': {
+            'id': '99205',
+            'display_id': 'medicopter-117/angst',
+            'ext': 'mp4',
+            'title': 'Angst!',
+            'description': 'md5:30cbc4c0b73ec98bcd73c9f2a8c17c4e',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'timestamp': 1222632900,
+            'upload_date': '20080928',
+            'duration': 3025,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # ntv
+        'url': 'http://www.nowtv.de/ntv/ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch/player',
+        'info_dict': {
+            'id': '203521',
+            'display_id': 'ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch',
+            'ext': 'mp4',
+            'title': 'Thema u.a.: Der erste Blick: Die Apple Watch',
+            'description': 'md5:4312b6c9d839ffe7d8caf03865a531af',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'timestamp': 1432751700,
+            'upload_date': '20150527',
+            'duration': 1083,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # vox
+        'url': 'http://www.nowtv.de/vox/der-hundeprofi/buero-fall-chihuahua-joel/player',
+        'info_dict': {
+            'id': '128953',
+            'display_id': 'der-hundeprofi/buero-fall-chihuahua-joel',
+            'ext': 'mp4',
+            'title': "Büro-Fall / Chihuahua 'Joel'",
+            'description': 'md5:e62cb6bf7c3cc669179d4f1eb279ad8d',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'timestamp': 1432408200,
+            'upload_date': '20150523',
+            'duration': 3092,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('id')
+        station = mobj.group('station')
+
+        info = self._download_json(
+            'https://api.nowtv.de/v3/movies/%s?fields=*,format,files' % display_id,
+            display_id)
+
+        video_id = compat_str(info['id'])
+
+        files = info['files']
+        if not files:
+            if info.get('geoblocked', False):
+                raise ExtractorError(
+                    'Video %s is not available from your location due to geo restriction' % video_id,
+                    expected=True)
+            if not info.get('free', True):
+                raise ExtractorError(
+                    'Video %s is not available for free' % video_id, expected=True)
+
+        f = info.get('format', {})
+        station = f.get('station') or station
+
+        STATIONS = {
+            'rtl': 'rtlnow',
+            'rtl2': 'rtl2now',
+            'vox': 'voxnow',
+            'nitro': 'rtlnitronow',
+            'ntv': 'n-tvnow',
+            'superrtl': 'superrtlnow'
+        }
+
+        formats = []
+        for item in files['items']:
+            item_path = remove_start(item['path'], '/')
+            tbr = int_or_none(item['bitrate'])
+            m3u8_url = 'http://hls.fra.%s.de/hls-vod-enc/%s.m3u8' % (STATIONS[station], item_path)
+            m3u8_url = m3u8_url.replace('now/', 'now/videos/')
+            formats.append({
+                'url': m3u8_url,
+                'format_id': '%s-%sk' % (item['id'], tbr),
+                'ext': 'mp4',
+                'tbr': tbr,
+            })
+        self._sort_formats(formats)
+
+        title = info['title']
+        description = info.get('articleLong') or info.get('articleShort')
+        timestamp = parse_iso8601(info.get('broadcastStartDate'), ' ')
+        duration = parse_duration(info.get('duration'))
+        thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo')
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'duration': duration,
+            'formats': formats,
+        }
index fbc521d1aae02077ae62c5cd0a6c5f9cdcff014a..6c7149fe3859732978d34356f8c18aca14e74f63 100644 (file)
@@ -2,6 +2,7 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
+from ..compat import compat_urllib_parse
 from ..utils import (
     unified_strdate,
     int_or_none,
@@ -11,8 +12,9 @@ from ..utils import (
 
 
 class OdnoklassnikiIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:odnoklassniki|ok)\.ru/(?:video|web-api/video/moviePlayer)/(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:odnoklassniki|ok)\.ru/(?:video|web-api/video/moviePlayer)/(?P<id>[\d-]+)'
     _TESTS = [{
+        # metadata in JSON
         'url': 'http://ok.ru/video/20079905452',
         'md5': '8e24ad2da6f387948e7a7d44eb8668fe',
         'info_dict': {
@@ -20,11 +22,22 @@ class OdnoklassnikiIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'Культура меняет нас (прекрасный ролик!))',
             'duration': 100,
-            'upload_date': '20141207',
             'uploader_id': '330537914540',
             'uploader': 'Виталий Добровольский',
             'like_count': int,
-            'age_limit': 0,
+        },
+    }, {
+        # metadataUrl
+        'url': 'http://ok.ru/video/63567059965189-0',
+        'md5': '9676cf86eff5391d35dea675d224e131',
+        'info_dict': {
+            'id': '63567059965189-0',
+            'ext': 'mp4',
+            'title': 'Девушка без комплексов ...',
+            'duration': 191,
+            'uploader_id': '534380003155',
+            'uploader': 'Андрей Мещанинов',
+            'like_count': int,
         },
     }, {
         'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452',
@@ -34,14 +47,23 @@ class OdnoklassnikiIE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        webpage = self._download_webpage(url, video_id)
+        webpage = self._download_webpage(
+            'http://ok.ru/video/%s' % video_id, video_id)
 
         player = self._parse_json(
             unescapeHTML(self._search_regex(
                 r'data-attributes="([^"]+)"', webpage, 'player')),
             video_id)
 
-        metadata = self._parse_json(player['flashvars']['metadata'], video_id)
+        flashvars = player['flashvars']
+
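+        # Metadata is either embedded directly in flashvars or fetched from metadataUrl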
+        metadata = flashvars.get('metadata')
+        if metadata:
+            metadata = self._parse_json(metadata, video_id)
+        else:
+            metadata = self._download_json(
+                compat_urllib_parse.unquote(flashvars['metadataUrl']),
+                video_id, 'Downloading metadata JSON')
 
         movie = metadata['movie']
         title = movie['title']
@@ -53,11 +75,11 @@ class OdnoklassnikiIE(InfoExtractor):
         uploader = author.get('name')
 
         upload_date = unified_strdate(self._html_search_meta(
-            'ya:ovs:upload_date', webpage, 'upload date'))
+            'ya:ovs:upload_date', webpage, 'upload date', default=None))
 
         age_limit = None
         adult = self._html_search_meta(
-            'ya:ovs:adult', webpage, 'age limit')
+            'ya:ovs:adult', webpage, 'age limit', default=None)
         if adult:
             age_limit = 18 if adult == 'true' else 0
 
index f179ea2008636f061c6a4cdad6fc69841a291076..6cdc2638b4930dc92835d71f673b560dea99022d 100644 (file)
@@ -87,7 +87,7 @@ class PatreonIE(InfoExtractor):
             r'<div class="attach"><a target="_blank" href="([^"]+)">',
             webpage, 'attachment URL', default=None)
         embed = self._html_search_regex(
-            r'<div id="watchCreation">\s*<iframe class="embedly-embed" src="([^"]+)"',
+            r'<div[^>]+id="watchCreation"[^>]*>\s*<iframe[^>]+src="([^"]+)"',
             webpage, 'embedded URL', default=None)
 
         if attach_fn is not None:
diff --git a/youtube_dl/extractor/porn91.py b/youtube_dl/extractor/porn91.py
new file mode 100644 (file)
index 0000000..72d1b27
--- /dev/null
@@ -0,0 +1,71 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+from ..compat import compat_urllib_parse
+from .common import InfoExtractor
+from ..utils import (
+    parse_duration,
+    int_or_none,
+    ExtractorError,
+)
+
+
+class Porn91IE(InfoExtractor):
+    IE_NAME = '91porn'
+    _VALID_URL = r'https?://(?:www\.)?91porn\.com/.+?\?viewkey=(?P<id>\w+)'
+
+    _TEST = {
+        'url': 'http://91porn.com/view_video.php?viewkey=7e42283b4f5ab36da134',
+        'md5': '6df8f6d028bc8b14f5dbd73af742fb20',
+        'info_dict': {
+            'id': '7e42283b4f5ab36da134',
+            'title': '18岁大一漂亮学妹,水嫩性感,再爽一次!',
+            'ext': 'mp4',
+            'duration': 431,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        url = 'http://91porn.com/view_video.php?viewkey=%s' % video_id
+        self._set_cookie('91porn.com', 'language', 'cn_CN')
+        webpage = self._download_webpage(url, video_id, 'Downloading webpage')
+
+        if '作为游客,你每天只可观看10个视频' in webpage:
+            raise ExtractorError('91 Porn says: Daily limit of 10 videos exceeded', expected=True)
+
+        title = self._search_regex(
+            r'<div id="viewvideo-title">([^<]+)</div>', webpage, 'title')
+        title = title.replace('\n', '')
+
+        # get real url
+        file_id = self._search_regex(
+            r'so.addVariable\(\'file\',\'(\d+)\'', webpage, 'file id')
+        sec_code = self._search_regex(
+            r'so.addVariable\(\'seccode\',\'([^\']+)\'', webpage, 'sec code')
+        max_vid = self._search_regex(
+            r'so.addVariable\(\'max_vid\',\'(\d+)\'', webpage, 'max vid')
+        url_params = compat_urllib_parse.urlencode({
+            'VID': file_id,
+            'mp4': '1',
+            'seccode': sec_code,
+            'max_vid': max_vid,
+        })
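+        # getfile.php resolves these player variables to the real video URL (returned as a file= parameter)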
+        info_cn = self._download_webpage(
+            'http://91porn.com/getfile.php?' + url_params, video_id,
+            'Downloading real video URL')
+        video_url = self._search_regex(r'file=([^&]+)&', info_cn, 'url')
+
+        duration = parse_duration(self._search_regex(
+            r'时长:\s*</span>\s*(\d+:\d+)', webpage, 'duration', fatal=False))
+
+        comment_count = int_or_none(self._search_regex(
+            r'留言:\s*</span>\s*(\d+)', webpage, 'comment count', fatal=False))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': video_url,
+            'duration': duration,
+            'comment_count': comment_count,
+        }
index 0c8b731cf47267568e43ccd09ff21f1683b4d992..8565d7551df4c6453e400abb09316433d0b97fcd 100644 (file)
@@ -19,7 +19,7 @@ from ..aes import (
 
 
 class PornHubIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?pornhub\.com/view_video\.php\?viewkey=(?P<id>[0-9a-f]+)'
+    _VALID_URL = r'https?://(?:www\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P<id>[0-9a-f]+)'
     _TEST = {
         'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
         'md5': '882f488fa1f0026f023f33576004a2ed',
@@ -32,6 +32,13 @@ class PornHubIE(InfoExtractor):
         }
     }
 
+    @classmethod
+    def _extract_url(cls, webpage):
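+        # Return the URL of the first embedded PornHub player iframe found in the page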
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?pornhub\.com/embed/\d+)\1', webpage)
+        if mobj:
+            return mobj.group('url')
+
     def _extract_count(self, pattern, webpage, name):
         return str_to_int(self._search_regex(
             pattern, webpage, '%s count' % name, fatal=False))
@@ -39,7 +46,8 @@ class PornHubIE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        req = compat_urllib_request.Request(url)
+        req = compat_urllib_request.Request(
+            'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id)
         req.add_header('Cookie', 'age_verified=1')
         webpage = self._download_webpage(req, video_id)
 
@@ -71,7 +79,8 @@ class PornHubIE(InfoExtractor):
 
         video_urls = list(map(compat_urllib_parse.unquote, re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage)))
         if webpage.find('"encrypted":true') != -1:
-            password = compat_urllib_parse.unquote_plus(self._html_search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
+            password = compat_urllib_parse.unquote_plus(
+                self._search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
             video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls))
 
         formats = []
index 9688ed94898de231e6c7f1c9dc28d3779da10311..eba4dfbb39576bff355b722c997dd31e07ce370f 100644 (file)
@@ -34,7 +34,7 @@ class PornoVoisinesIE(InfoExtractor):
             'duration': 120,
             'view_count': int,
             'average_rating': float,
-            'categories': ['Débutante', 'Scénario', 'Sodomie'],
+            'categories': ['Débutantes', 'Scénario', 'Sodomie'],
             'age_limit': 18,
         }
     }
@@ -71,7 +71,7 @@ class PornoVoisinesIE(InfoExtractor):
         view_count = int_or_none(self._search_regex(
             r'(\d+) vues', webpage, 'view count', fatal=False))
         average_rating = self._search_regex(
-            r'Note : (\d+,\d+)', webpage, 'average rating', fatal=False)
+            r'Note\s*:\s*(\d+(?:,\d+)?)', webpage, 'average rating', fatal=False)
         if average_rating:
             average_rating = float_or_none(average_rating.replace(',', '.'))
 
index 7cc7996642cae1de1ca2a585391d167025b92162..536a42dc88a4e17bbd039289508521d1ea13e282 100644 (file)
@@ -17,7 +17,7 @@ from ..utils import (
 class ProSiebenSat1IE(InfoExtractor):
     IE_NAME = 'prosiebensat1'
     IE_DESC = 'ProSiebenSat.1 Digital'
-    _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|ran|the-voice-of-germany)\.de|fem\.com)/(?P<id>.+)'
+    _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|the-voice-of-germany)\.(?:de|at)|ran\.de|fem\.com)/(?P<id>.+)'
 
     _TESTS = [
         {
@@ -177,6 +177,7 @@ class ProSiebenSat1IE(InfoExtractor):
         r'<header class="clearfix">\s*<h3>(.+?)</h3>',
         r'<!-- start video -->\s*<h1>(.+?)</h1>',
         r'<h1 class="att-name">\s*(.+?)</h1>',
+        r'<header class="module_header">\s*<h2>([^<]+)</h2>\s*</header>',
     ]
     _DESCRIPTION_REGEXES = [
         r'<p itemprop="description">\s*(.+?)</p>',
@@ -206,8 +207,8 @@ class ProSiebenSat1IE(InfoExtractor):
     def _extract_clip(self, url, webpage):
         clip_id = self._html_search_regex(self._CLIPID_REGEXES, webpage, 'clip id')
 
-        access_token = 'testclient'
-        client_name = 'kolibri-1.2.5'
+        access_token = 'prosieben'
+        client_name = 'kolibri-1.12.6'
         client_location = url
 
         videos_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos?%s' % compat_urllib_parse.urlencode({
@@ -275,13 +276,17 @@ class ProSiebenSat1IE(InfoExtractor):
         for source in urls_sources:
             protocol = source['protocol']
             if protocol == 'rtmp' or protocol == 'rtmpe':
-                mobj = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', source['url'])
+                mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source['url'])
                 if not mobj:
                     continue
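+                # Split the RTMP path at the last 'mp4:' marker into app and play_path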
+                path = mobj.group('path')
+                mp4colon_index = path.rfind('mp4:')
+                app = path[:mp4colon_index]
+                play_path = path[mp4colon_index:]
                 formats.append({
-                    'url': mobj.group('url'),
-                    'app': mobj.group('app'),
-                    'play_path': mobj.group('playpath'),
+                    'url': '%s/%s' % (mobj.group('url'), app),
+                    'app': app,
+                    'play_path': play_path,
                     'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf',
                     'page_url': 'http://www.prosieben.de',
                     'vbr': fix_bitrate(source['bitrate']),
index b540033e25a8c8e033f9d7f7b49d6b6ae46f755b..bafa81c21c12f15ac8162f8303f56615124b260b 100644 (file)
@@ -9,7 +9,6 @@ from .common import InfoExtractor
 from ..utils import (
     strip_jsonp,
     unescapeHTML,
-    js_to_json,
 )
 from ..compat import compat_urllib_request
 
@@ -19,10 +18,10 @@ class QQMusicIE(InfoExtractor):
     _VALID_URL = r'http://y.qq.com/#type=song&mid=(?P<id>[0-9A-Za-z]+)'
     _TESTS = [{
         'url': 'http://y.qq.com/#type=song&mid=004295Et37taLD',
-        'md5': 'bed90b6db2a7a7a7e11bc585f471f63a',
+        'md5': '9ce1c1c8445f561506d2e3cfb0255705',
         'info_dict': {
             'id': '004295Et37taLD',
-            'ext': 'm4a',
+            'ext': 'mp3',
             'title': '可惜没如果',
             'upload_date': '20141227',
             'creator': '林俊杰',
@@ -30,6 +29,12 @@ class QQMusicIE(InfoExtractor):
         }
     }]
 
+    _FORMATS = {
+        'mp3-320': {'prefix': 'M800', 'ext': 'mp3', 'preference': 40, 'abr': 320},
+        'mp3-128': {'prefix': 'M500', 'ext': 'mp3', 'preference': 30, 'abr': 128},
+        'm4a': {'prefix': 'C200', 'ext': 'm4a', 'preference': 10}
+    }
+
     # Reference: m_r_GetRUin() in top_player.js
     # http://imgcache.gtimg.cn/music/portal_v3/y/top_player.js
     @staticmethod
@@ -69,11 +74,22 @@ class QQMusicIE(InfoExtractor):
             'http://base.music.qq.com/fcgi-bin/fcg_musicexpress.fcg?json=3&guid=%s' % guid,
             mid, note='Retrieve vkey', errnote='Unable to get vkey',
             transform_source=strip_jsonp)['key']
-        song_url = 'http://cc.stream.qqmusic.qq.com/C200%s.m4a?vkey=%s&guid=%s&fromtag=0' % (mid, vkey, guid)
+
+        formats = []
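+        # Build one URL per known format; the same vkey and guid are appended to each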
+        for format_id, details in self._FORMATS.items():
+            formats.append({
+                'url': 'http://cc.stream.qqmusic.qq.com/%s%s.%s?vkey=%s&guid=%s&fromtag=0'
+                       % (details['prefix'], mid, details['ext'], vkey, guid),
+                'format': format_id,
+                'format_id': format_id,
+                'preference': details['preference'],
+                'abr': details.get('abr'),
+            })
+        self._sort_formats(formats)
 
         return {
             'id': mid,
-            'url': song_url,
+            'formats': formats,
             'title': song_name,
             'upload_date': publish_time,
             'creator': singer,
@@ -181,60 +197,49 @@ class QQMusicToplistIE(QQPlaylistBaseIE):
     _VALID_URL = r'http://y\.qq\.com/#type=toplist&p=(?P<id>(top|global)_[0-9]+)'
 
     _TESTS = [{
-        'url': 'http://y.qq.com/#type=toplist&p=global_12',
+        'url': 'http://y.qq.com/#type=toplist&p=global_123',
         'info_dict': {
-            'id': 'global_12',
-            'title': 'itunes榜',
+            'id': 'global_123',
+            'title': '美国iTunes榜',
         },
         'playlist_count': 10,
     }, {
-        'url': 'http://y.qq.com/#type=toplist&p=top_6',
+        'url': 'http://y.qq.com/#type=toplist&p=top_3',
         'info_dict': {
-            'id': 'top_6',
+            'id': 'top_3',
             'title': 'QQ音乐巅峰榜·欧美',
+            'description': 'QQ音乐巅峰榜·欧美根据用户收听行为自动生成,集结当下最流行的欧美新歌!:更新时间:每周四22点|统'
+                           '计周期:一周(上周四至本周三)|统计对象:三个月内发行的欧美歌曲|统计数量:100首|统计算法:根据'
+                           '歌曲在一周内的有效播放次数,由高到低取前100名(同一歌手最多允许5首歌曲同时上榜)|有效播放次数:'
+                           '登录用户完整播放一首歌曲,记为一次有效播放;同一用户收听同一首歌曲,每天记录为1次有效播放'
         },
         'playlist_count': 100,
     }, {
-        'url': 'http://y.qq.com/#type=toplist&p=global_5',
+        'url': 'http://y.qq.com/#type=toplist&p=global_106',
         'info_dict': {
-            'id': 'global_5',
-            'title': '韩国mnet排行榜',
+            'id': 'global_106',
+            'title': '韩国Mnet榜',
         },
         'playlist_count': 50,
     }]
 
-    @staticmethod
-    def strip_qq_jsonp(code):
-        return js_to_json(re.sub(r'^MusicJsonCallback\((.*?)\)/\*.+?\*/$', r'\1', code))
-
     def _real_extract(self, url):
         list_id = self._match_id(url)
 
         list_type, num_id = list_id.split("_")
 
-        list_page = self._download_webpage(
-            "http://y.qq.com/y/static/toplist/index/%s.html" % list_id,
-            list_id, 'Download toplist page')
-
-        entries = []
-        if list_type == 'top':
-            jsonp_url = "http://y.qq.com/y/static/toplist/json/top/%s/1.js" % num_id
-        else:
-            jsonp_url = "http://y.qq.com/y/static/toplist/json/global/%s/1_1.js" % num_id
-
         toplist_json = self._download_json(
-            jsonp_url, list_id, note='Retrieve toplist json',
-            errnote='Unable to get toplist json', transform_source=self.strip_qq_jsonp)
-
-        for song in toplist_json['l']:
-            s = song['s']
-            song_mid = s.split("|")[20]
-            entries.append(self.url_result(
-                'http://y.qq.com/#type=song&mid=' + song_mid, 'QQMusic',
-                song_mid))
+            'http://i.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg?type=%s&topid=%s&format=json'
+            % (list_type, num_id),
+            list_id, 'Download toplist page')
 
-        list_name = self._html_search_regex(
-            r'<h2 id="top_name">([^\']+)</h2>', list_page, 'top list name',
-            default=None)
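+        # The JSON API returns both the song list and the toplist metadata in one response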
+        entries = [
+            self.url_result(
+                'http://y.qq.com/#type=song&mid=' + song['data']['songmid'], 'QQMusic', song['data']['songmid']
+            ) for song in toplist_json['songlist']
+        ]
 
-        return self.playlist_result(entries, list_id, list_name)
+        topinfo = toplist_json.get('topinfo', {})
+        list_name = topinfo.get('ListName')
+        list_description = topinfo.get('info')
+        return self.playlist_result(entries, list_id, list_name, list_description)
index dce64e1517003015722db1097ac83b106cc91136..e4215d546219bb95fe79abfb184da149148962db 100644 (file)
@@ -1,10 +1,11 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import re
-import json
-
 from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    unescapeHTML,
+)
 
 
 class RTBFIE(InfoExtractor):
@@ -16,34 +17,47 @@ class RTBFIE(InfoExtractor):
             'id': '1921274',
             'ext': 'mp4',
             'title': 'Les Diables au coeur (épisode 2)',
-            'description': 'Football - Diables Rouges',
             'duration': 3099,
-            'timestamp': 1398456336,
-            'upload_date': '20140425',
         }
     }
 
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
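+    # Map of API source keys to format ids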
+    _QUALITIES = [
+        ('mobile', 'mobile'),
+        ('web', 'SD'),
+        ('url', 'MD'),
+        ('high', 'HD'),
+    ]
 
-        page = self._download_webpage('https://www.rtbf.be/video/embed?id=%s' % video_id, video_id)
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
 
-        data = json.loads(self._html_search_regex(
-            r'<div class="js-player-embed(?: player-embed)?" data-video="([^"]+)"', page, 'data video'))['data']
+        webpage = self._download_webpage(
+            'http://www.rtbf.be/video/embed?id=%s' % video_id, video_id)
 
-        video_url = data.get('downloadUrl') or data.get('url')
+        data = self._parse_json(
+            unescapeHTML(self._search_regex(
+                r'data-video="([^"]+)"', webpage, 'data video')),
+            video_id)
 
-        if data['provider'].lower() == 'youtube':
+        if data.get('provider').lower() == 'youtube':
+            video_url = data.get('downloadUrl') or data.get('url')
             return self.url_result(video_url, 'Youtube')
+        formats = []
+        for key, format_id in self._QUALITIES:
+            format_url = data['sources'].get(key)
+            if format_url:
+                formats.append({
+                    'format_id': format_id,
+                    'url': format_url,
+                })
 
         return {
             'id': video_id,
-            'url': video_url,
+            'formats': formats,
             'title': data['title'],
             'description': data.get('description') or data.get('subtitle'),
-            'thumbnail': data['thumbnail']['large'],
+            'thumbnail': data.get('thumbnail'),
             'duration': data.get('duration') or data.get('realDuration'),
-            'timestamp': data['created'],
-            'view_count': data['viewCount'],
+            'timestamp': int_or_none(data.get('created')),
+            'view_count': int_or_none(data.get('viewCount')),
         }
index cfce4550ada568cfe13fae859a2bb745671074b5..41d202c289839d00de98eb0a3b1e5770eb0990f5 100644 (file)
@@ -12,10 +12,10 @@ class RtlNlIE(InfoExtractor):
     IE_NAME = 'rtl.nl'
     IE_DESC = 'rtl.nl and rtlxl.nl'
     _VALID_URL = r'''(?x)
-        https?://(www\.)?
+        https?://(?:www\.)?
         (?:
             rtlxl\.nl/\#!/[^/]+/|
-            rtl\.nl/system/videoplayer/[^?#]+?/video_embed\.html\#uuid=
+            rtl\.nl/system/videoplayer/(?:[^/]+/)+(?:video_)?embed\.html\b.+?\buuid=
         )
         (?P<id>[0-9a-f-]+)'''
 
@@ -43,6 +43,9 @@ class RtlNlIE(InfoExtractor):
             'upload_date': '20150215',
             'description': 'Er zijn nieuwe beelden vrijgegeven die vlak na de aanslag in Kopenhagen zijn gemaakt. Op de video is goed te zien hoe omstanders zich bekommeren om één van de slachtoffers, terwijl de eerste agenten ter plaatse komen.',
         }
+    }, {
+        'url': 'http://www.rtl.nl/system/videoplayer/derden/embed.html#!/uuid=bb0353b0-d6a4-1dad-90e9-18fe75b8d1f0',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py
deleted file mode 100644 (file)
index 785a804..0000000
+++ /dev/null
@@ -1,174 +0,0 @@
-# encoding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
-    clean_html,
-    unified_strdate,
-    int_or_none,
-)
-
-
-class RTLnowIE(InfoExtractor):
-    """Information Extractor for RTL NOW, RTL2 NOW, RTL NITRO, SUPER RTL NOW, VOX NOW and n-tv NOW"""
-    _VALID_URL = r'''(?x)
-                        (?:https?://)?
-                        (?P<url>
-                            (?P<domain>
-                                rtl-now\.rtl\.de|
-                                rtl2now\.rtl2\.de|
-                                (?:www\.)?voxnow\.de|
-                                (?:www\.)?rtlnitronow\.de|
-                                (?:www\.)?superrtlnow\.de|
-                                (?:www\.)?n-tvnow\.de)
-                            /+[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?
-                            (?:container_id|film_id)=(?P<video_id>[0-9]+)&
-                            player=1(?:&season=[0-9]+)?(?:&.*)?
-                        )'''
-
-    _TESTS = [
-        {
-            'url': 'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1',
-            'info_dict': {
-                'id': '90419',
-                'ext': 'flv',
-                'title': 'Ahornallee - Folge 1 - Der Einzug',
-                'description': 'md5:ce843b6b5901d9a7f7d04d1bbcdb12de',
-                'upload_date': '20070416',
-                'duration': 1685,
-            },
-            'params': {
-                'skip_download': True,
-            },
-            'skip': 'Only works from Germany',
-        },
-        {
-            'url': 'http://rtl2now.rtl2.de/aerger-im-revier/episode-15-teil-1.php?film_id=69756&player=1&season=2&index=5',
-            'info_dict': {
-                'id': '69756',
-                'ext': 'flv',
-                'title': 'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit u.a.',
-                'description': 'md5:3fb247005ed21a935ffc82b7dfa70cf0',
-                'thumbnail': 'http://autoimg.static-fra.de/rtl2now/219850/1500x1500/image2.jpg',
-                'upload_date': '20120519',
-                'duration': 1245,
-            },
-            'params': {
-                'skip_download': True,
-            },
-            'skip': 'Only works from Germany',
-        },
-        {
-            'url': 'http://www.voxnow.de/voxtours/suedafrika-reporter-ii.php?film_id=13883&player=1&season=17',
-            'info_dict': {
-                'id': '13883',
-                'ext': 'flv',
-                'title': 'Voxtours - Südafrika-Reporter II',
-                'description': 'md5:de7f8d56be6fd4fed10f10f57786db00',
-                'upload_date': '20090627',
-                'duration': 1800,
-            },
-            'params': {
-                'skip_download': True,
-            },
-        },
-        {
-            'url': 'http://superrtlnow.de/medicopter-117/angst.php?film_id=99205&player=1',
-            'info_dict': {
-                'id': '99205',
-                'ext': 'flv',
-                'title': 'Medicopter 117 - Angst!',
-                'description': 're:^Im Therapiezentrum \'Sonnalm\' kommen durch eine Unachtsamkeit die für die B.handlung mit Phobikern gehaltenen Voglespinnen frei\. Eine Ausreißerin',
-                'thumbnail': 'http://autoimg.static-fra.de/superrtlnow/287529/1500x1500/image2.jpg',
-                'upload_date': '20080928',
-                'duration': 2691,
-            },
-            'params': {
-                'skip_download': True,
-            },
-        },
-        {
-            'url': 'http://rtl-now.rtl.de/der-bachelor/folge-4.php?film_id=188729&player=1&season=5',
-            'info_dict': {
-                'id': '188729',
-                'ext': 'flv',
-                'upload_date': '20150204',
-                'description': 'md5:5e1ce23095e61a79c166d134b683cecc',
-                'title': 'Der Bachelor - Folge 4',
-            }
-        }, {
-            'url': 'http://www.n-tvnow.de/deluxe-alles-was-spass-macht/thema-ua-luxushotel-fuer-vierbeiner.php?container_id=153819&player=1&season=0',
-            'only_matching': True,
-        },
-    ]
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_page_url = 'http://%s/' % mobj.group('domain')
-        video_id = mobj.group('video_id')
-
-        webpage = self._download_webpage('http://' + mobj.group('url'), video_id)
-
-        mobj = re.search(r'(?s)<div style="margin-left: 20px; font-size: 13px;">(.*?)<div id="playerteaser">', webpage)
-        if mobj:
-            raise ExtractorError(clean_html(mobj.group(1)), expected=True)
-
-        title = self._og_search_title(webpage)
-        description = self._og_search_description(webpage)
-        thumbnail = self._og_search_thumbnail(webpage, default=None)
-
-        upload_date = unified_strdate(self._html_search_meta('uploadDate', webpage, 'upload date'))
-
-        mobj = re.search(r'<meta itemprop="duration" content="PT(?P<seconds>\d+)S" />', webpage)
-        duration = int(mobj.group('seconds')) if mobj else None
-
-        playerdata_url = self._html_search_regex(
-            r"'playerdata': '(?P<playerdata_url>[^']+)'", webpage, 'playerdata_url')
-
-        playerdata = self._download_xml(playerdata_url, video_id, 'Downloading player data XML')
-
-        videoinfo = playerdata.find('./playlist/videoinfo')
-
-        formats = []
-        for filename in videoinfo.findall('filename'):
-            mobj = re.search(r'(?P<url>rtmpe://(?:[^/]+/){2})(?P<play_path>.+)', filename.text)
-            if mobj:
-                fmt = {
-                    'url': mobj.group('url'),
-                    'play_path': 'mp4:' + mobj.group('play_path'),
-                    'page_url': video_page_url,
-                    'player_url': video_page_url + 'includes/vodplayer.swf',
-                }
-            else:
-                mobj = re.search(r'.*/(?P<hoster>[^/]+)/videos/(?P<play_path>.+)\.f4m', filename.text)
-                if mobj:
-                    fmt = {
-                        'url': 'rtmpe://fms.rtl.de/' + mobj.group('hoster'),
-                        'play_path': 'mp4:' + mobj.group('play_path'),
-                        'page_url': url,
-                        'player_url': video_page_url + 'includes/vodplayer.swf',
-                    }
-                else:
-                    fmt = {
-                        'url': filename.text,
-                    }
-            fmt.update({
-                'width': int_or_none(filename.get('width')),
-                'height': int_or_none(filename.get('height')),
-                'vbr': int_or_none(filename.get('bitrate')),
-                'ext': 'flv',
-            })
-            formats.append(fmt)
-
-        return {
-            'id': video_id,
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'upload_date': upload_date,
-            'duration': duration,
-            'formats': formats,
-        }
index 849300140ecbf598874d22b090262eabec1e7ea5..82cd98ac742bf436b24fbbc77cac9a6fb8a44ff6 100644 (file)
@@ -17,7 +17,7 @@ from ..utils import (
 
 
 def _decrypt_url(png):
-    encrypted_data = base64.b64decode(png)
+    encrypted_data = base64.b64decode(png.encode('utf-8'))
     text_index = encrypted_data.find(b'tEXt')
     text_chunk = encrypted_data[text_index - 4:]
     length = struct_unpack('!I', text_chunk[:4])[0]
index 55604637dca22533cd765529dcb2abfb759fd9c1..d9df0686133a6772deb1e58260069857620afc58 100644 (file)
@@ -104,7 +104,7 @@ class RUTVIE(InfoExtractor):
     @classmethod
     def _extract_url(cls, webpage):
         mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.rutv\.ru/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage)
+            r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage)
         if mobj:
             return mobj.group('url')
 
diff --git a/youtube_dl/extractor/ruutu.py b/youtube_dl/extractor/ruutu.py
new file mode 100644 (file)
index 0000000..4e22628
--- /dev/null
@@ -0,0 +1,119 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_urlparse
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    xpath_text,
+)
+
+
+class RuutuIE(InfoExtractor):
+    _VALID_URL = r'http://(?:www\.)?ruutu\.fi/ohjelmat/(?:[^/?#]+/)*(?P<id>[^/?#]+)'
+    _TESTS = [
+        {
+            'url': 'http://www.ruutu.fi/ohjelmat/oletko-aina-halunnut-tietaa-mita-tapahtuu-vain-hetki-ennen-lahetysta-nyt-se-selvisi',
+            'md5': 'ab2093f39be1ca8581963451b3c0234f',
+            'info_dict': {
+                'id': '2058907',
+                'display_id': 'oletko-aina-halunnut-tietaa-mita-tapahtuu-vain-hetki-ennen-lahetysta-nyt-se-selvisi',
+                'ext': 'mp4',
+                'title': 'Oletko aina halunnut tietää mitä tapahtuu vain hetki ennen lähetystä? - Nyt se selvisi!',
+                'description': 'md5:cfc6ccf0e57a814360df464a91ff67d6',
+                'thumbnail': 're:^https?://.*\.jpg$',
+                'duration': 114,
+                'age_limit': 0,
+            },
+        },
+        {
+            'url': 'http://www.ruutu.fi/ohjelmat/superpesis/superpesis-katso-koko-kausi-ruudussa',
+            'md5': '065a10ae4d5b8cfd9d0c3d332465e3d9',
+            'info_dict': {
+                'id': '2057306',
+                'display_id': 'superpesis-katso-koko-kausi-ruudussa',
+                'ext': 'mp4',
+                'title': 'Superpesis: katso koko kausi Ruudussa',
+                'description': 'md5:44c44a99fdbe5b380ab74ebd75f0af77',
+                'thumbnail': 're:^https?://.*\.jpg$',
+                'duration': 40,
+                'age_limit': 0,
+            },
+        },
+    ]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        video_id = self._search_regex(
+            r'data-media-id="(\d+)"', webpage, 'media id')
+
+        video_xml_url = None
+
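+        # Prefer the player config embedded in the page; fall back to the media-xml-cache endpoint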
+        media_data = self._search_regex(
+            r'jQuery\.extend\([^,]+,\s*(.+?)\);', webpage,
+            'media data', default=None)
+        if media_data:
+            media_json = self._parse_json(media_data, display_id, fatal=False)
+            if media_json:
+                xml_url = media_json.get('ruutuplayer', {}).get('xmlUrl')
+                if xml_url:
+                    video_xml_url = xml_url.replace('{ID}', video_id)
+
+        if not video_xml_url:
+            video_xml_url = 'http://gatling.ruutu.fi/media-xml-cache?id=%s' % video_id
+
+        video_xml = self._download_xml(video_xml_url, video_id)
+
+        formats = []
+        processed_urls = []
+
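+        # Recursively walk the clip XML: *Files nodes nest *File nodes carrying the media URLs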
+        def extract_formats(node):
+            for child in node:
+                if child.tag.endswith('Files'):
+                    extract_formats(child)
+                elif child.tag.endswith('File'):
+                    video_url = child.text
+                    if not video_url or video_url in processed_urls or 'NOT_USED' in video_url:
+                        return
+                    processed_urls.append(video_url)
+                    ext = determine_ext(video_url)
+                    if ext == 'm3u8':
+                        formats.extend(self._extract_m3u8_formats(
+                            video_url, video_id, 'mp4', m3u8_id='hls'))
+                    elif ext == 'f4m':
+                        formats.extend(self._extract_f4m_formats(
+                            video_url, video_id, f4m_id='hds'))
+                    else:
+                        proto = compat_urllib_parse_urlparse(video_url).scheme
+                        if not child.tag.startswith('HTTP') and proto != 'rtmp':
+                            continue
+                        preference = -1 if proto == 'rtmp' else 1
+                        label = child.get('label')
+                        tbr = int_or_none(child.get('bitrate'))
+                        resolution = child.get('resolution')
+                        if resolution:
+                            width, height = [int_or_none(x) for x in resolution.split('x')]
+                        else:
+                            width = height = None
+                        formats.append({
+                            'format_id': '%s-%s' % (proto, label if label else tbr),
+                            'url': video_url,
+                            'width': width,
+                            'height': height,
+                            'tbr': tbr,
+                            'preference': preference,
+                        })
+
+        extract_formats(video_xml.find('./Clip'))
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': self._og_search_title(webpage),
+            'description': self._og_search_description(webpage),
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'duration': int_or_none(xpath_text(video_xml, './/Runtime', 'duration')),
+            'age_limit': int_or_none(xpath_text(video_xml, './/AgeLimit', 'age limit')),
+            'formats': formats,
+        }
index d3b8a1be49702f71a1a8c4eb7bd01d17cf103071..9c53704ea383b1af34e8f8157e327b71c2c3865a 100644 (file)
@@ -48,7 +48,7 @@ class SenateISVPIE(InfoExtractor):
         ["arch", "", "http://ussenate-f.akamaihd.net/"]
     ]
     _IE_NAME = 'senate.gov'
-    _VALID_URL = r'http://www\.senate\.gov/isvp/\?(?P<qs>.+)'
+    _VALID_URL = r'http://www\.senate\.gov/isvp/?\?(?P<qs>.+)'
     _TESTS = [{
         'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png',
         'info_dict': {
@@ -72,12 +72,16 @@ class SenateISVPIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'Integrated Senate Video Player'
         }
+    }, {
+        # From http://www.c-span.org/video/?96791-1
+        'url': 'http://www.senate.gov/isvp?type=live&comm=banking&filename=banking012715',
+        'only_matching': True,
     }]
 
     @staticmethod
     def _search_iframe_url(webpage):
         mobj = re.search(
-            r"<iframe[^>]+src=['\"](?P<url>http://www\.senate\.gov/isvp/\?[^'\"]+)['\"]",
+            r"<iframe[^>]+src=['\"](?P<url>http://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]",
             webpage)
         if mobj:
             return mobj.group('url')
index 26ced716e8a875f1c4c5c9527b856475dce83f9e..9f3e944e73532a92c5213ce95e7633fe6bc4c212 100644 (file)
@@ -47,7 +47,7 @@ class SharedIE(InfoExtractor):
         video_url = self._html_search_regex(
             r'data-url="([^"]+)"', video_page, 'video URL')
         title = base64.b64decode(self._html_search_meta(
-            'full:title', webpage, 'title')).decode('utf-8')
+            'full:title', webpage, 'title').encode('utf-8')).decode('utf-8')
         filesize = int_or_none(self._html_search_meta(
             'full:size', webpage, 'file size', fatal=False))
         thumbnail = self._html_search_regex(
diff --git a/youtube_dl/extractor/sockshare.py b/youtube_dl/extractor/sockshare.py
deleted file mode 100644 (file)
index b5fa6f1..0000000
+++ /dev/null
@@ -1,83 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from ..compat import (
-    compat_urllib_parse,
-    compat_urllib_request,
-)
-from ..utils import (
-    determine_ext,
-    ExtractorError,
-)
-
-from .common import InfoExtractor
-
-
-class SockshareIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?sockshare\.com/file/(?P<id>[0-9A-Za-z]+)'
-    _FILE_DELETED_REGEX = r'This file doesn\'t exist, or has been removed\.</div>'
-    _TEST = {
-        'url': 'http://www.sockshare.com/file/437BE28B89D799D7',
-        'md5': '9d0bf1cfb6dbeaa8d562f6c97506c5bd',
-        'info_dict': {
-            'id': '437BE28B89D799D7',
-            'title': 'big_buck_bunny_720p_surround.avi',
-            'ext': 'avi',
-        }
-    }
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        url = 'http://sockshare.com/file/%s' % video_id
-        webpage = self._download_webpage(url, video_id)
-
-        if re.search(self._FILE_DELETED_REGEX, webpage) is not None:
-            raise ExtractorError('Video %s does not exist' % video_id,
-                                 expected=True)
-
-        confirm_hash = self._html_search_regex(r'''(?x)<input\s+
-            type="hidden"\s+
-            value="([^"]*)"\s+
-            name="hash"
-            ''', webpage, 'hash')
-
-        fields = {
-            "hash": confirm_hash.encode('utf-8'),
-            "confirm": "Continue as Free User"
-        }
-
-        post = compat_urllib_parse.urlencode(fields)
-        req = compat_urllib_request.Request(url, post)
-        # Apparently, this header is required for confirmation to work.
-        req.add_header('Host', 'www.sockshare.com')
-        req.add_header('Content-type', 'application/x-www-form-urlencoded')
-
-        webpage = self._download_webpage(
-            req, video_id, 'Downloading video page')
-
-        video_url = self._html_search_regex(
-            r'<a href="([^"]*)".+class="download_file_link"',
-            webpage, 'file url')
-        video_url = "http://www.sockshare.com" + video_url
-        title = self._html_search_regex((
-            r'<h1>(.+)<strong>',
-            r'var name = "([^"]+)";'),
-            webpage, 'title', default=None)
-        thumbnail = self._html_search_regex(
-            r'<img\s+src="([^"]*)".+?name="bg"',
-            webpage, 'thumbnail', default=None)
-
-        formats = [{
-            'format_id': 'sd',
-            'url': video_url,
-            'ext': determine_ext(title),
-        }]
-
-        return {
-            'id': video_id,
-            'title': title,
-            'thumbnail': thumbnail,
-            'formats': formats,
-        }
index eab4adfca46f3686ac7d23ca581681e56d5066f1..29bd9ce6f76247b5ac997050075c7e720d8a3b2b 100644 (file)
@@ -23,9 +23,7 @@ class SohuIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'MV:Far East Movement《The Illest》',
         },
-        'params': {
-            'cn_verification_proxy': 'proxy.uku.im:8888'
-        }
+        'skip': 'Only available in China',
     }, {
         'url': 'http://tv.sohu.com/20150305/n409385080.shtml',
         'md5': 'ac9a5d322b4bf9ae184d53e4711e4f1a',
diff --git a/youtube_dl/extractor/soompi.py b/youtube_dl/extractor/soompi.py
new file mode 100644 (file)
index 0000000..5da66ca
--- /dev/null
@@ -0,0 +1,146 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .crunchyroll import CrunchyrollIE
+
+from .common import InfoExtractor
+from ..compat import compat_HTTPError
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    remove_start,
+    xpath_text,
+)
+
+
+class SoompiBaseIE(InfoExtractor):
+    def _get_episodes(self, webpage, episode_filter=None):
+        episodes = self._parse_json(
+            self._search_regex(
+                r'VIDEOS\s*=\s*(\[.+?\]);', webpage, 'episodes JSON'),
+            None)
+        return list(filter(episode_filter, episodes))
+
+
+class SoompiIE(SoompiBaseIE, CrunchyrollIE):
+    IE_NAME = 'soompi'
+    _VALID_URL = r'https?://tv\.soompi\.com/(?:en/)?watch/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://tv.soompi.com/en/watch/29235',
+        'info_dict': {
+            'id': '29235',
+            'ext': 'mp4',
+            'title': 'Episode 1096',
+            'description': '2015-05-20'
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    def _get_episode(self, webpage, video_id):
+        return self._get_episodes(webpage, lambda x: x['id'] == video_id)[0]
+
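+    # Subtitles use the same encrypted format as Crunchyroll, hence the CrunchyrollIE mixin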
+    def _get_subtitles(self, config, video_id):
+        sub_langs = {}
+        for subtitle in config.findall('./{default}preload/subtitles/subtitle'):
+            sub_langs[subtitle.attrib['id']] = subtitle.attrib['title']
+
+        subtitles = {}
+        for s in config.findall('./{default}preload/subtitle'):
+            lang_code = sub_langs.get(s.attrib['id'])
+            if not lang_code:
+                continue
+            sub_id = s.get('id')
+            data = xpath_text(s, './data', 'data')
+            iv = xpath_text(s, './iv', 'iv')
+            if not sub_id or not iv or not data:
+                continue
+            subtitle = self._decrypt_subtitles(data, iv, sub_id).decode('utf-8')
+            subtitles[lang_code] = self._extract_subtitles(subtitle)
+        return subtitles
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        try:
+            webpage = self._download_webpage(
+                url, video_id, 'Downloading episode page')
+        except ExtractorError as ee:
+            if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
+                webpage = ee.cause.read()
+                block_message = self._html_search_regex(
+                    r'(?s)<div class="block-message">(.+?)</div>', webpage,
+                    'block message', default=None)
+                if block_message:
+                    raise ExtractorError(block_message, expected=True)
+            raise
+
+        formats = []
+        config = None
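+        # Each quality offered on the page has its own config XML pointing to an HLS playlist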
+        for format_id in re.findall(r'\?quality=([0-9a-zA-Z]+)', webpage):
+            config = self._download_xml(
+                'http://tv.soompi.com/en/show/_/%s-config.xml?mode=hls&quality=%s' % (video_id, format_id),
+                video_id, 'Downloading %s XML' % format_id)
+            m3u8_url = xpath_text(
+                config, './{default}preload/stream_info/file',
+                '%s m3u8 URL' % format_id)
+            if not m3u8_url:
+                continue
+            formats.extend(self._extract_m3u8_formats(
+                m3u8_url, video_id, 'mp4', m3u8_id=format_id))
+        self._sort_formats(formats)
+
+        episode = self._get_episode(webpage, video_id)
+
+        title = episode['name']
+        description = episode.get('description')
+        duration = int_or_none(episode.get('duration'))
+
+        thumbnails = [{
+            'id': thumbnail_id,
+            'url': thumbnail_url,
+        } for thumbnail_id, thumbnail_url in episode.get('img_url', {}).items()]
+
+        subtitles = self.extract_subtitles(config, video_id)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnails': thumbnails,
+            'duration': duration,
+            'formats': formats,
+            'subtitles': subtitles
+        }
+
+
+class SoompiShowIE(SoompiBaseIE):
+    IE_NAME = 'soompi:show'
+    _VALID_URL = r'https?://tv\.soompi\.com/en/shows/(?P<id>[0-9a-zA-Z\-_]+)'
+    _TESTS = [{
+        'url': 'http://tv.soompi.com/en/shows/liar-game',
+        'info_dict': {
+            'id': 'liar-game',
+            'title': 'Liar Game',
+            'description': 'md5:52c02bce0c1a622a95823591d0589b66',
+        },
+        'playlist_count': 14,
+    }]
+
+    def _real_extract(self, url):
+        show_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            url, show_id, 'Downloading show page')
+
+        title = remove_start(self._og_search_title(webpage), 'SoompiTV | ')
+        description = self._og_search_description(webpage)
+
+        entries = [
+            self.url_result('http://tv.soompi.com/en/watch/%s' % episode['id'], 'Soompi')
+            for episode in self._get_episodes(webpage)]
+
+        return self.playlist_result(entries, show_id, title, description)
index b936202f6f3005fe9ae085724566d709c6a484cc..06d6e6640637f2ff3507c3fa1c68339fbb8d98b4 100644 (file)
@@ -71,7 +71,7 @@ class SpankwireIE(InfoExtractor):
             compat_urllib_parse.unquote,
             re.findall(r'playerData\.cdnPath[0-9]{3,}\s*=\s*["\']([^"\']+)["\']', webpage)))
         if webpage.find('flashvars\.encrypted = "true"') != -1:
-            password = self._html_search_regex(
+            password = self._search_regex(
                 r'flashvars\.video_title = "([^"]+)',
                 webpage, 'password').replace('+', ' ')
             video_urls = list(map(
index 98cf92d89a1151edfd11b8f15a86eeaa6a83178d..08a5c4314959409cd93d71fcd8f8160ee541b7c4 100644 (file)
@@ -2,7 +2,11 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..utils import float_or_none
+from ..compat import compat_urllib_parse_urlparse
+from ..utils import (
+    determine_ext,
+    float_or_none,
+)
 
 
 class SpiegeltvIE(InfoExtractor):
@@ -17,7 +21,7 @@ class SpiegeltvIE(InfoExtractor):
             'thumbnail': 're:http://.*\.jpg$',
         },
         'params': {
-            # rtmp download
+            # m3u8 download
             'skip_download': True,
         }
     }, {
@@ -51,9 +55,37 @@ class SpiegeltvIE(InfoExtractor):
         is_wide = media_json['is_wide']
 
         server_json = self._download_json(
-            'http://www.spiegel.tv/streaming_servers/', video_id,
-            note='Downloading server information')
-        server = server_json[0]['endpoint']
+            'http://spiegeltv-prod-static.s3.amazonaws.com/projectConfigs/projectConfig.json',
+            video_id, note='Downloading server information')
+
+        format = '16x9' if is_wide else '4x3'
+
+        formats = []
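+        # The project config lists several streaming servers; collect RTMP and HLS variants from each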
+        for streamingserver in server_json['streamingserver']:
+            endpoint = streamingserver.get('endpoint')
+            if not endpoint:
+                continue
+            play_path = 'mp4:%s_spiegeltv_0500_%s.m4v' % (uuid, format)
+            if endpoint.startswith('rtmp'):
+                formats.append({
+                    'url': endpoint,
+                    'format_id': 'rtmp',
+                    'app': compat_urllib_parse_urlparse(endpoint).path[1:],
+                    'play_path': play_path,
+                    'player_path': 'http://prod-static.spiegel.tv/frontend-076.swf',
+                    'ext': 'flv',
+                    'rtmp_live': True,
+                })
+            elif determine_ext(endpoint) == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    endpoint.replace('[video]', play_path),
+                    video_id, 'm4v',
+                    preference=1,  # Prefer hls since it allows to workaround georestriction
+                    m3u8_id='hls'))
+            else:
+                formats.append({
+                    'url': endpoint,
+                })
 
         thumbnails = []
         for image in media_json['images']:
@@ -65,16 +97,12 @@ class SpiegeltvIE(InfoExtractor):
 
         description = media_json['subtitle']
         duration = float_or_none(media_json.get('duration_in_ms'), scale=1000)
-        format = '16x9' if is_wide else '4x3'
-
-        url = server + 'mp4:' + uuid + '_spiegeltv_0500_' + format + '.m4v'
 
         return {
             'id': video_id,
             'title': title,
-            'url': url,
-            'ext': 'm4v',
             'description': description,
             'duration': duration,
-            'thumbnails': thumbnails
+            'thumbnails': thumbnails,
+            'formats': formats,
         }
index 8686f9d11fa3178eefe3eb71fe5a1413e729beeb..86d509ae5351a3cc15be66dda9f485d63ec166ba 100644 (file)
@@ -6,8 +6,7 @@ import re
 from .common import InfoExtractor
 from ..compat import compat_urlparse
 from ..utils import (
-    parse_duration,
-    parse_iso8601,
+    unified_strdate,
 )
 
 
@@ -20,11 +19,9 @@ class SportBoxIE(InfoExtractor):
             'id': '80822',
             'ext': 'mp4',
             'title': 'Гонка 2  заезд ««Объединенный 2000»: классы Туринг и Супер-продакшн',
-            'description': 'md5:81715fa9c4ea3d9e7915dc8180c778ed',
+            'description': 'md5:3d72dc4a006ab6805d82f037fdc637ad',
             'thumbnail': 're:^https?://.*\.jpg$',
-            'timestamp': 1411896237,
             'upload_date': '20140928',
-            'duration': 4846,
         },
         'params': {
             # m3u8 download
@@ -48,17 +45,13 @@ class SportBoxIE(InfoExtractor):
             r'src="/?(vdl/player/[^"]+)"', webpage, 'player')
 
         title = self._html_search_regex(
-            r'<h1 itemprop="name">([^<]+)</h1>', webpage, 'title')
-        description = self._html_search_regex(
-            r'(?s)<div itemprop="description">(.+?)</div>',
-            webpage, 'description', fatal=False)
+            [r'"nodetitle"\s*:\s*"([^"]+)"', r'class="node-header_{1,2}title">([^<]+)'],
+            webpage, 'title')
+        description = self._og_search_description(webpage) or self._html_search_meta(
+            'description', webpage, 'description')
         thumbnail = self._og_search_thumbnail(webpage)
-        timestamp = parse_iso8601(self._search_regex(
-            r'<span itemprop="uploadDate">([^<]+)</span>',
-            webpage, 'timestamp', fatal=False))
-        duration = parse_duration(self._html_search_regex(
-            r'<meta itemprop="duration" content="PT([^"]+)">',
-            webpage, 'duration', fatal=False))
+        upload_date = unified_strdate(self._html_search_meta(
+            'dateCreated', webpage, 'upload date'))
 
         return {
             '_type': 'url_transparent',
@@ -67,8 +60,7 @@ class SportBoxIE(InfoExtractor):
             'title': title,
             'description': description,
             'thumbnail': thumbnail,
-            'timestamp': timestamp,
-            'duration': duration,
+            'upload_date': upload_date,
         }
 
 
index 854d01beeb5cefd1f82d7991ee2c0ce75ad33dfa..e527aa97188b1860e054f8af7c7bd7a33301729e 100644 (file)
@@ -44,7 +44,7 @@ class SunPornoIE(InfoExtractor):
             webpage, 'duration', fatal=False))
 
         view_count = int_or_none(self._html_search_regex(
-            r'class="views">\s*(\d+)\s*<',
+            r'class="views">(?:<noscript>)?\s*(\d+)\s*<',
             webpage, 'view count', fatal=False))
         comment_count = int_or_none(self._html_search_regex(
             r'(\d+)</b> Comments?',
index 56be526383b590d9b00759400a269d5164beef2f..d1b7264b4ca4a0cb72e491da26d7f5bbc1cc66b7 100644 (file)
@@ -10,6 +10,7 @@ from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
     qualities,
+    determine_ext,
 )
 from ..compat import compat_ord
 
@@ -50,6 +51,17 @@ class TeamcocoIE(InfoExtractor):
             'params': {
                 'skip_download': True,  # m3u8 downloads
             }
+        }, {
+            'url': 'http://teamcoco.com/video/full-episode-mon-6-1-joel-mchale-jake-tapper-and-musical-guest-courtney-barnett?playlist=x;eyJ0eXBlIjoidGFnIiwiaWQiOjl9',
+            'info_dict': {
+                'id': '89341',
+                'ext': 'mp4',
+                'title': 'Full Episode - Mon. 6/1 - Joel McHale, Jake Tapper, And Musical Guest Courtney Barnett',
+                'description': 'Guests: Joel McHale, Jake Tapper, And Musical Guest Courtney Barnett',
+            },
+            'params': {
+                'skip_download': True,  # m3u8 downloads
+            }
         }
     ]
     _VIDEO_ID_REGEXES = (
@@ -108,10 +120,24 @@ class TeamcocoIE(InfoExtractor):
         formats = []
         get_quality = qualities(['500k', '480p', '1000k', '720p', '1080p'])
         for filed in data['files']:
-            if filed['type'] == 'hls':
-                formats.extend(self._extract_m3u8_formats(
-                    filed['url'], video_id, ext='mp4'))
+            if determine_ext(filed['url']) == 'm3u8':
+                # compat_urllib_parse.urljoin does not work here
+                if filed['url'].startswith('/'):
+                    m3u8_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + filed['url']
+                else:
+                    m3u8_url = filed['url']
+                m3u8_formats = self._extract_m3u8_formats(
+                    m3u8_url, video_id, ext='mp4')
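+                # Several file entries may reference the same playlist, so skip duplicates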
+                for m3u8_format in m3u8_formats:
+                    if m3u8_format not in formats:
+                        formats.append(m3u8_format)
+            elif determine_ext(filed['url']) == 'f4m':
+                # TODO Correct f4m extraction
+                continue
             else:
+                if filed['url'].startswith('/mp4:protected/'):
+                    # TODO Correct extraction for these files
+                    continue
                 m_format = re.search(r'(\d+(k|p))\.mp4', filed['url'])
                 if m_format is not None:
                     format_id = m_format.group(1)
index 251a686804b6f26915c3fa25d9f6b2cc1f98ed4b..a0c744fd16b633b08e7bbb632b77cf40e8410710 100644 (file)
@@ -16,6 +16,10 @@ class TelecincoIE(MiTeleIE):
             'title': 'Con Martín Berasategui, hacer un bacalao al ...',
             'duration': 662,
         },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
     }, {
         'url': 'http://www.telecinco.es/informativos/nacional/Pablo_Iglesias-Informativos_Telecinco-entrevista-Pedro_Piqueras_2_1945155182.html',
         'only_matching': True,
index 466155ef800fbc540292eb343bc9092ab9da6416..f6694149b8e3509b4446458300824a3f3d5fc5de 100644 (file)
@@ -2,6 +2,10 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    float_or_none,
+)
 
 
 class TenPlayIE(InfoExtractor):
@@ -49,18 +53,23 @@ class TenPlayIE(InfoExtractor):
             if protocol == 'rtmp':
                 url = url.replace('&mp4:', '')
 
+            tbr = int_or_none(rendition.get('encodingRate'), 1000)
+
             formats.append({
-                'format_id': '_'.join(['rtmp', rendition['videoContainer'].lower(), rendition['videoCodec'].lower()]),
-                'width': rendition['frameWidth'],
-                'height': rendition['frameHeight'],
-                'tbr': rendition['encodingRate'] / 1024,
-                'filesize': rendition['size'],
+                'format_id': '_'.join(
+                    ['rtmp', rendition['videoContainer'].lower(),
+                     rendition['videoCodec'].lower(), '%sk' % tbr]),
+                'width': int_or_none(rendition['frameWidth']),
+                'height': int_or_none(rendition['frameHeight']),
+                'tbr': tbr,
+                'filesize': int_or_none(rendition['size']),
                 'protocol': protocol,
                 'ext': ext,
                 'vcodec': rendition['videoCodec'].lower(),
                 'container': rendition['videoContainer'].lower(),
                 'url': url,
             })
+        self._sort_formats(formats)
 
         return {
             'id': video_id,
@@ -74,8 +83,8 @@ class TenPlayIE(InfoExtractor):
                 'url': json['thumbnailURL']
             }],
             'thumbnail': json['videoStillURL'],
-            'duration': json['length'] / 1000,
-            'timestamp': float(json['creationDate']) / 1000,
-            'uploader': json['customFields']['production_company_distributor'] if 'production_company_distributor' in json['customFields'] else 'TENplay',
-            'view_count': json['playsTotal']
+            'duration': float_or_none(json.get('length'), 1000),
+            'timestamp': float_or_none(json.get('creationDate'), 1000),
+            'uploader': json.get('customFields', {}).get('production_company_distributor') or 'TENplay',
+            'view_count': int_or_none(json.get('playsTotal')),
         }
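
The rewritten fields above lean on the scale argument of int_or_none/float_or_none to turn millisecond and bit-per-second values into seconds and kbit/s. A self-contained sketch with simplified stand-ins for those helpers, assuming the second positional argument divides the raw value (which matches how they are used here):

    def float_or_none(value, scale=1):
        # Simplified stand-in for youtube_dl.utils.float_or_none:
        # None stays None, otherwise the value is divided by scale.
        return None if value is None else float(value) / scale

    def int_or_none(value, scale=1):
        # Simplified stand-in for youtube_dl.utils.int_or_none.
        return None if value is None else int(value) // scale

    assert float_or_none(662000, 1000) == 662.0   # length in ms -> seconds
    assert int_or_none(1500000, 1000) == 1500     # encodingRate in bit/s -> kbit/s
    assert float_or_none(None, 1000) is None      # missing field stays None
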
index 025d0877cb928bb433aff9f6eff19a29d253e006..3a68eaa80ea6867e6806a4f242a8afc910b8ba06 100644 (file)
@@ -6,8 +6,8 @@ from .common import InfoExtractor
 
 class TF1IE(InfoExtractor):
     """TF1 uses the wat.tv player."""
-    _VALID_URL = r'http://(?:videos\.tf1|www\.tfou)\.fr/.*?-(?P<id>\d+)(?:-\d+)?\.html'
-    _TESTS = {
+    _VALID_URL = r'http://(?:(?:videos|www|lci)\.tf1|www\.tfou)\.fr/.*?-(?P<id>\d+)(?:-\d+)?\.html'
+    _TESTS = [{
         'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',
         'info_dict': {
             'id': '10635995',
@@ -32,7 +32,13 @@ class TF1IE(InfoExtractor):
             # Sometimes wat serves the whole file with the --test option
             'skip_download': True,
         },
-    }
+    }, {
+        'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://lci.tf1.fr/sept-a-huit/videos/sept-a-huit-du-24-mai-2015-8611550.html',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
index 92731ad3d7e8dcc3167b50ce1a15e3b035fb7721..83d833e30dbeb60caa43aa272bfd4d35f4507a53 100644 (file)
@@ -26,7 +26,7 @@ _x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language
 class ThePlatformIE(InfoExtractor):
     _VALID_URL = r'''(?x)
         (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/
-           (?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/)?
+           (?:(?P<media>(?:[^/]+/)+select/media/)|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))?
          |theplatform:)(?P<id>[^/\?&]+)'''
 
     _TESTS = [{
@@ -56,6 +56,17 @@ class ThePlatformIE(InfoExtractor):
             # rtmp download
             'skip_download': True,
         }
+    }, {
+        'url': 'https://player.theplatform.com/p/D6x-PC/pulse_preview/embed/select/media/yMBg9E8KFxZD',
+        'info_dict': {
+            'id': 'yMBg9E8KFxZD',
+            'ext': 'mp4',
+            'description': 'md5:644ad9188d655b742f942bf2e06b002d',
+            'title': 'HIGHLIGHTS: USA bag first ever series Cup win',
+        }
+    }, {
+        'url': 'http://player.theplatform.com/p/NnzsPC/widget/select/media/4Y0TlYUr_ZT7',
+        'only_matching': True,
     }]
 
     @staticmethod
@@ -85,6 +96,11 @@ class ThePlatformIE(InfoExtractor):
         if not provider_id:
             provider_id = 'dJ5BDC'
 
+        path = provider_id
+        if mobj.group('media'):
+            path += '/media'
+        path += '/' + video_id
+
         if smuggled_data.get('force_smil_url', False):
             smil_url = url
         elif mobj.group('config'):
@@ -94,8 +110,7 @@ class ThePlatformIE(InfoExtractor):
             config = self._download_json(config_url, video_id, 'Downloading config')
             smil_url = config['releaseUrl'] + '&format=SMIL&formats=MPEG4&manifest=f4m'
         else:
-            smil_url = ('http://link.theplatform.com/s/{0}/{1}/meta.smil?'
-                        'format=smil&mbr=true'.format(provider_id, video_id))
+            smil_url = 'http://link.theplatform.com/s/%s/meta.smil?format=smil&mbr=true' % path
 
         sig = smuggled_data.get('sig')
         if sig:
@@ -112,7 +127,7 @@ class ThePlatformIE(InfoExtractor):
         else:
             raise ExtractorError(error_msg, expected=True)
 
-        info_url = 'http://link.theplatform.com/s/{0}/{1}?format=preview'.format(provider_id, video_id)
+        info_url = 'http://link.theplatform.com/s/%s?format=preview' % path
         info_json = self._download_webpage(info_url, video_id)
         info = json.loads(info_json)
 
index 9f9e388c50948d658d1022f8514122643b623a03..13263614cc06b099d929ee71564899ac3620f76a 100644 (file)
@@ -12,17 +12,22 @@ class TlcIE(DiscoveryIE):
     IE_NAME = 'tlc.com'
     _VALID_URL = r'http://www\.tlc\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9\-]*)(.htm)?'
 
-    _TEST = {
+    # DiscoveryIE has _TESTS
+    _TESTS = [{
         'url': 'http://www.tlc.com/tv-shows/cake-boss/videos/too-big-to-fly.htm',
-        'md5': 'c4038f4a9b44d0b5d74caaa64ed2a01a',
         'info_dict': {
-            'id': '853232',
+            'id': '104493',
             'ext': 'mp4',
-            'title': 'Cake Boss: Too Big to Fly',
+            'title': 'Too Big to Fly',
             'description': 'Buddy has taken on a high flying task.',
             'duration': 119,
+            'timestamp': 1393365060,
+            'upload_date': '20140225',
         },
-    }
+        'params': {
+            'skip_download': True,  # requires ffmpeg
+        },
+    }]
 
 
 class TlcDeIE(InfoExtractor):
index d48cbbf140054e639f7191acfa0909972ef3ab76..c282865b2517d8cbd62df6f2dee0540146baae48 100644 (file)
@@ -10,26 +10,32 @@ from ..utils import (
 
 
 class TNAFlixIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/(?P<cat_id>[\w-]+)/(?P<display_id>[\w-]+)/video(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)'
 
     _TITLE_REGEX = r'<title>(.+?) - TNAFlix Porn Videos</title>'
     _DESCRIPTION_REGEX = r'<h3 itemprop="description">([^<]+)</h3>'
     _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"'
 
-    _TEST = {
-        'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878',
-        'md5': 'ecf3498417d09216374fc5907f9c6ec0',
-        'info_dict': {
-            'id': '553878',
-            'display_id': 'Carmella-Decesare-striptease',
-            'ext': 'mp4',
-            'title': 'Carmella Decesare - striptease',
-            'description': '',
-            'thumbnail': 're:https?://.*\.jpg$',
-            'duration': 91,
-            'age_limit': 18,
+    _TESTS = [
+        {
+            'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878',
+            'md5': 'ecf3498417d09216374fc5907f9c6ec0',
+            'info_dict': {
+                'id': '553878',
+                'display_id': 'Carmella-Decesare-striptease',
+                'ext': 'mp4',
+                'title': 'Carmella Decesare - striptease',
+                'description': '',
+                'thumbnail': 're:https?://.*\.jpg$',
+                'duration': 91,
+                'age_limit': 18,
+            }
+        },
+        {
+            'url': 'https://www.tnaflix.com/amateur-porn/bunzHD-Ms.Donk/video358632',
+            'only_matching': True,
         }
-    }
+    ]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
@@ -45,9 +51,8 @@ class TNAFlixIE(InfoExtractor):
 
         age_limit = self._rta_search(webpage)
 
-        duration = self._html_search_meta('duration', webpage, 'duration', default=None)
-        if duration:
-            duration = parse_duration(duration[1:])
+        duration = parse_duration(self._html_search_meta(
+            'duration', webpage, 'duration', default=None))
 
         cfg_url = self._proto_relative_url(self._html_search_regex(
             self._CONFIG_REGEX, webpage, 'flashvars.config'), 'http:')
@@ -56,14 +61,15 @@ class TNAFlixIE(InfoExtractor):
             cfg_url, display_id, note='Downloading metadata',
             transform_source=fix_xml_ampersands)
 
-        thumbnail = cfg_xml.find('./startThumb').text
+        thumbnail = self._proto_relative_url(
+            cfg_xml.find('./startThumb').text, 'http:')
 
         formats = []
         for item in cfg_xml.findall('./quality/item'):
             video_url = re.sub('speed=\d+', 'speed=', item.find('videoLink').text)
             format_id = item.find('res').text
             fmt = {
-                'url': video_url,
+                'url': self._proto_relative_url(video_url, 'http:'),
                 'format_id': format_id,
             }
             m = re.search(r'^(\d+)', format_id)
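
The thumbnail and video URLs above are passed through _proto_relative_url so that scheme-less '//host/...' values get an explicit 'http:' prefix. A minimal sketch of that behaviour, assuming the helper only touches protocol-relative URLs:

    def proto_relative_url(url, scheme='http:'):
        # Prepend a scheme to protocol-relative URLs; leave everything else alone.
        if url and url.startswith('//'):
            return scheme + url
        return url

    assert proto_relative_url('//img.tnaflix.com/a.jpg') == 'http://img.tnaflix.com/a.jpg'
    assert proto_relative_url('http://img.tnaflix.com/a.jpg') == 'http://img.tnaflix.com/a.jpg'
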
index d73ad3762a1b455cfd4bc384c27e2dd85e776dde..c9cb69333f7da0a9f4fe009e79b06433bca83726 100644 (file)
@@ -47,7 +47,7 @@ class Tube8IE(InfoExtractor):
         webpage = self._download_webpage(req, display_id)
 
         flashvars = json.loads(self._html_search_regex(
-            r'var flashvars\s*=\s*({.+?})', webpage, 'flashvars'))
+            r'flashvars\s*=\s*({.+?});\r?\n', webpage, 'flashvars'))
 
         video_url = flashvars['video_url']
         if flashvars.get('encrypted') is True:
@@ -58,19 +58,19 @@ class Tube8IE(InfoExtractor):
         thumbnail = flashvars.get('image_url')
 
         title = self._html_search_regex(
-            r'videotitle\s*=\s*"([^"]+)', webpage, 'title')
+            r'videoTitle\s*=\s*"([^"]+)', webpage, 'title')
         description = self._html_search_regex(
-            r'>Description:</strong>(.+?)<', webpage, 'description', fatal=False)
+            r'>Description:</strong>\s*(.+?)\s*<', webpage, 'description', fatal=False)
         uploader = self._html_search_regex(
-            r'<strong class="video-username">(?:<a href="[^"]+">)?([^<]+)(?:</a>)?</strong>',
+            r'<span class="username">\s*(.+?)\s*<',
             webpage, 'uploader', fatal=False)
 
         like_count = int_or_none(self._html_search_regex(
-            r"rupVar\s*=\s*'(\d+)'", webpage, 'like count', fatal=False))
+            r'rupVar\s*=\s*"(\d+)"', webpage, 'like count', fatal=False))
         dislike_count = int_or_none(self._html_search_regex(
-            r"rdownVar\s*=\s*'(\d+)'", webpage, 'dislike count', fatal=False))
+            r'rdownVar\s*=\s*"(\d+)"', webpage, 'dislike count', fatal=False))
         view_count = self._html_search_regex(
-            r'<strong>Views: </strong>([\d,\.]+)</li>', webpage, 'view count', fatal=False)
+            r'<strong>Views: </strong>([\d,\.]+)\s*</li>', webpage, 'view count', fatal=False)
         if view_count:
             view_count = str_to_int(view_count)
         comment_count = self._html_search_regex(
diff --git a/youtube_dl/extractor/tubitv.py b/youtube_dl/extractor/tubitv.py
new file mode 100644 (file)
index 0000000..2c4b218
--- /dev/null
@@ -0,0 +1,84 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import codecs
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_parse,
+    compat_urllib_request
+)
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+)
+
+
+class TubiTvIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?tubitv\.com/video\?id=(?P<id>[0-9]+)'
+    _LOGIN_URL = 'http://tubitv.com/login'
+    _NETRC_MACHINE = 'tubitv'
+    _TEST = {
+        'url': 'http://tubitv.com/video?id=54411&title=The_Kitchen_Musical_-_EP01',
+        'info_dict': {
+            'id': '54411',
+            'ext': 'mp4',
+            'title': 'The Kitchen Musical - EP01',
+            'thumbnail': 're:^https?://.*\.png$',
+            'description': 'md5:37532716166069b353e8866e71fefae7',
+            'duration': 2407,
+        },
+        'params': {
+            'skip_download': True,  # HLS download
+        },
+    }
+
+    def _login(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            return
+        self.report_login()
+        form_data = {
+            'username': username,
+            'password': password,
+        }
+        payload = compat_urllib_parse.urlencode(form_data).encode('utf-8')
+        request = compat_urllib_request.Request(self._LOGIN_URL, payload)
+        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+        login_page = self._download_webpage(
+            request, None, False, 'Wrong login info')
+        if not re.search(r'id="tubi-logout"', login_page):
+            raise ExtractorError(
+                'Login failed (invalid username/password)', expected=True)
+
+    def _real_initialize(self):
+        self._login()
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+        if re.search(r"<(?:DIV|div) class='login-required-screen'>", webpage):
+            raise ExtractorError(
+                'This video requires login, use --username and --password '
+                'options to provide account credentials.', expected=True)
+
+        title = self._og_search_title(webpage)
+        description = self._og_search_description(webpage)
+        thumbnail = self._og_search_thumbnail(webpage)
+        duration = int_or_none(self._html_search_meta(
+            'video:duration', webpage, 'duration'))
+
+        apu = self._search_regex(r"apu='([^']+)'", webpage, 'apu')
+        m3u8_url = codecs.decode(apu, 'rot_13')[::-1]
+        formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': thumbnail,
+            'description': description,
+            'duration': duration,
+        }
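
The new TubiTV extractor recovers the HLS playlist URL from the page's apu variable by undoing a simple obfuscation: the string is ROT13-encoded and stored reversed. A standalone sketch of that decode step (the sample URL is made up for illustration):

    import codecs

    def decode_apu(apu):
        # Undo TubiTV's obfuscation: ROT13-decode, then reverse the string.
        return codecs.decode(apu, 'rot_13')[::-1]

    # Round-trip a hypothetical URL the same way to check the transform.
    original = 'http://example.com/stream/master.m3u8'
    obfuscated = codecs.encode(original[::-1], 'rot_13')
    assert decode_apu(obfuscated) == original
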
index 828c808a6456b6b99b134cb7ae9d9017de9ad3aa..63c20310d8e54f98f2c19bd23f9ebca947ae23d3 100644 (file)
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from .pornhub import PornHubIE
 
 
 class TumblrIE(InfoExtractor):
@@ -28,6 +29,17 @@ class TumblrIE(InfoExtractor):
             'description': 'md5:dba62ac8639482759c8eb10ce474586a',
             'thumbnail': 're:http://.*\.jpg',
         }
+    }, {
+        'url': 'http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching',
+        'md5': 'de07e5211d60d4f3a2c3df757ea9f6ab',
+        'info_dict': {
+            'id': 'Wmur',
+            'ext': 'mp4',
+            'title': 'naked smoking & stretching',
+            'upload_date': '20150506',
+            'timestamp': 1430931613,
+        },
+        'add_ie': ['Vidme'],
     }]
 
     def _real_extract(self, url):
@@ -38,6 +50,16 @@ class TumblrIE(InfoExtractor):
         url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
         webpage = self._download_webpage(url, video_id)
 
+        vid_me_embed_url = self._search_regex(
+            r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]',
+            webpage, 'vid.me embed', default=None)
+        if vid_me_embed_url is not None:
+            return self.url_result(vid_me_embed_url, 'Vidme')
+
+        pornhub_url = PornHubIE._extract_url(webpage)
+        if pornhub_url:
+            return self.url_result(pornhub_url, 'PornHub')
+
         iframe_url = self._search_regex(
             r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'',
             webpage, 'iframe url')
index 29703a8a9a6ddf0981642c28cd2f1f68cc07c7b7..7ae63a4992a74368ec8b5f6a266a298cb6776b79 100644 (file)
@@ -23,7 +23,7 @@ class TurboIE(InfoExtractor):
             'ext': 'mp4',
             'duration': 3715,
             'title': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia... ',
-            'description': 'Retrouvez dans cette rubrique toutes les vidéos de l\'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia... ',
+            'description': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia...',
             'thumbnail': 're:^https?://.*\.jpg$',
         }
     }
@@ -42,7 +42,7 @@ class TurboIE(InfoExtractor):
         title = xpath_text(item, './title', 'title')
         duration = int_or_none(xpath_text(item, './durate', 'duration'))
         thumbnail = xpath_text(item, './visuel_clip', 'thumbnail')
-        description = self._og_search_description(webpage)
+        description = self._html_search_meta('description', webpage)
 
         formats = []
         get_quality = qualities(['3g', 'sd', 'hq'])
index 4de0aac523313eced334aab38a9a20c7bf08dfc7..fad720b681e125ac495b26ba3870dbd65340e3ce 100644 (file)
@@ -26,7 +26,7 @@ class TutvIE(InfoExtractor):
 
         data_content = self._download_webpage(
             'http://tu.tv/flvurl.php?codVideo=%s' % internal_id, video_id, 'Downloading video info')
-        video_url = base64.b64decode(compat_parse_qs(data_content)['kpt'][0]).decode('utf-8')
+        video_url = base64.b64decode(compat_parse_qs(data_content)['kpt'][0].encode('utf-8')).decode('utf-8')
 
         return {
             'id': internal_id,
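
This change (and the analogous one in videott.py below) encodes the base64 text to bytes before calling base64.b64decode; older Python 3 releases reject str input there, so encoding first keeps the call portable across Python 2 and 3. A small round-trip sketch with a made-up value:

    import base64

    def decode_b64_param(value):
        # Encode to bytes for b64decode, then decode the payload back to text.
        return base64.b64decode(value.encode('utf-8')).decode('utf-8')

    encoded = base64.b64encode('http://example.com/video.flv'.encode('utf-8')).decode('ascii')
    assert decode_b64_param(encoded) == 'http://example.com/video.flv'
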
diff --git a/youtube_dl/extractor/tvc.py b/youtube_dl/extractor/tvc.py
new file mode 100644 (file)
index 0000000..3a4f393
--- /dev/null
@@ -0,0 +1,109 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    int_or_none,
+)
+
+
+class TVCIE(InfoExtractor):
+    _VALID_URL = r'http://(?:www\.)?tvc\.ru/video/iframe/id/(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://www.tvc.ru/video/iframe/id/74622/isPlay/false/id_stat/channel/?acc_video_id=/channel/brand/id/17/show/episodes/episode_id/39702',
+        'md5': 'bbc5ff531d1e90e856f60fc4b3afd708',
+        'info_dict': {
+            'id': '74622',
+            'ext': 'mp4',
+            'title': 'События. "События". Эфир от 22.05.2015 14:30',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 1122,
+        },
+    }
+
+    @classmethod
+    def _extract_url(cls, webpage):
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:http:)?//(?:www\.)?tvc\.ru/video/iframe/id/[^"]+)\1', webpage)
+        if mobj:
+            return mobj.group('url')
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        video = self._download_json(
+            'http://www.tvc.ru/video/json/id/%s' % video_id, video_id)
+
+        formats = []
+        for info in video.get('path', {}).get('quality', []):
+            video_url = info.get('url')
+            if not video_url:
+                continue
+            format_id = self._search_regex(
+                r'cdnvideo/([^/]+?)(?:-[^/]+?)?/', video_url,
+                'format id', default=None)
+            formats.append({
+                'url': video_url,
+                'format_id': format_id,
+                'width': int_or_none(info.get('width')),
+                'height': int_or_none(info.get('height')),
+                'tbr': int_or_none(info.get('bitrate')),
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': video['title'],
+            'thumbnail': video.get('picture'),
+            'duration': int_or_none(video.get('duration')),
+            'formats': formats,
+        }
+
+
+class TVCArticleIE(InfoExtractor):
+    _VALID_URL = r'http://(?:www\.)?tvc\.ru/(?!video/iframe/id/)(?P<id>[^?#]+)'
+    _TESTS = [{
+        'url': 'http://www.tvc.ru/channel/brand/id/29/show/episodes/episode_id/39702/',
+        'info_dict': {
+            'id': '74622',
+            'ext': 'mp4',
+            'title': 'События. "События". Эфир от 22.05.2015 14:30',
+            'description': 'md5:ad7aa7db22903f983e687b8a3e98c6dd',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 1122,
+        },
+    }, {
+        'url': 'http://www.tvc.ru/news/show/id/69944',
+        'info_dict': {
+            'id': '75399',
+            'ext': 'mp4',
+            'title': 'Эксперты: в столице встал вопрос о максимально безопасных остановках',
+            'description': 'md5:f2098f71e21f309e89f69b525fd9846e',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 278,
+        },
+    }, {
+        'url': 'http://www.tvc.ru/channel/brand/id/47/show/episodes#',
+        'info_dict': {
+            'id': '2185',
+            'ext': 'mp4',
+            'title': 'Ещё не поздно. Эфир от 03.08.2013',
+            'description': 'md5:51fae9f3f8cfe67abce014e428e5b027',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 3316,
+        },
+    }]
+
+    def _real_extract(self, url):
+        webpage = self._download_webpage(url, self._match_id(url))
+        return {
+            '_type': 'url_transparent',
+            'ie_key': 'TVC',
+            'url': self._og_search_video_url(webpage),
+            'title': clean_html(self._og_search_title(webpage)),
+            'description': clean_html(self._og_search_description(webpage)),
+            'thumbnail': self._og_search_thumbnail(webpage),
+        }
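
TVCIE._extract_url above exists so that other extractors (e.g. the generic one) can detect embedded TVC players in third-party pages. A standalone sketch of that detection using the same iframe regex (the sample markup is made up):

    import re

    TVC_IFRAME_RE = r'<iframe[^>]+?src=(["\'])(?P<url>(?:http:)?//(?:www\.)?tvc\.ru/video/iframe/id/[^"]+)\1'

    def find_tvc_embed(webpage):
        # Return the iframe src if a TVC player is embedded, else None.
        mobj = re.search(TVC_IFRAME_RE, webpage)
        return mobj.group('url') if mobj else None

    sample = '<iframe src="http://www.tvc.ru/video/iframe/id/74622/isPlay/false"></iframe>'
    assert find_tvc_embed(sample) == 'http://www.tvc.ru/video/iframe/id/74622/isPlay/false'
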
index 102362b295450f58ff085ec9be7d21921a1ac494..dc3a8334a6b335143dff417d805a26df412d8783 100644 (file)
@@ -5,7 +5,9 @@ import re
 
 from .common import InfoExtractor
 from ..utils import (
+    ExtractorError,
     float_or_none,
+    int_or_none,
     parse_age_limit,
 )
 
@@ -24,22 +26,24 @@ class TvigleIE(InfoExtractor):
                 'display_id': 'sokrat',
                 'ext': 'flv',
                 'title': 'Сократ',
-                'description': 'md5:a05bd01be310074d5833efc6743be95e',
+                'description': 'md5:d6b92ffb7217b4b8ebad2e7665253c17',
                 'duration': 6586,
-                'age_limit': 0,
+                'age_limit': 12,
             },
+            'skip': 'georestricted',
         },
         {
             'url': 'http://www.tvigle.ru/video/vladimir-vysotskii/vedushchii-teleprogrammy-60-minut-ssha-o-vladimire-vysotskom/',
-            'md5': 'd9012d7c7c598fe7a11d7fb46dc1f574',
+            'md5': 'e7efe5350dd5011d0de6550b53c3ba7b',
             'info_dict': {
                 'id': '5142516',
-                'ext': 'mp4',
+                'ext': 'flv',
                 'title': 'Ведущий телепрограммы «60 минут» (США) о Владимире Высоцком',
                 'description': 'md5:027f7dc872948f14c96d19b4178428a4',
                 'duration': 186.080,
                 'age_limit': 0,
             },
+            'skip': 'georestricted',
         }, {
             'url': 'https://cloud.tvigle.ru/video/5267604/',
             'only_matching': True,
@@ -54,7 +58,7 @@ class TvigleIE(InfoExtractor):
         if not video_id:
             webpage = self._download_webpage(url, display_id)
             video_id = self._html_search_regex(
-                r'<li class="video-preview current_playing" id="(\d+)">',
+                r'class="video-preview current_playing" id="(\d+)">',
                 webpage, 'video id')
 
         video_data = self._download_json(
@@ -62,21 +66,34 @@ class TvigleIE(InfoExtractor):
 
         item = video_data['playlist']['items'][0]
 
+        videos = item.get('videos')
+
+        error_message = item.get('errorMessage')
+        if not videos and error_message:
+            raise ExtractorError(
+                '%s returned error: %s' % (self.IE_NAME, error_message), expected=True)
+
         title = item['title']
-        description = item['description']
-        thumbnail = item['thumbnail']
+        description = item.get('description')
+        thumbnail = item.get('thumbnail')
         duration = float_or_none(item.get('durationMilliseconds'), 1000)
         age_limit = parse_age_limit(item.get('ageRestrictions'))
 
         formats = []
         for vcodec, fmts in item['videos'].items():
-            for quality, video_url in fmts.items():
+            for format_id, video_url in fmts.items():
+                if format_id == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        video_url, video_id, 'mp4', m3u8_id=vcodec))
+                    continue
+                height = self._search_regex(
+                    r'^(\d+)[pP]$', format_id, 'height', default=None)
                 formats.append({
                     'url': video_url,
-                    'format_id': '%s-%s' % (vcodec, quality),
+                    'format_id': '%s-%s' % (vcodec, format_id),
                     'vcodec': vcodec,
-                    'height': int(quality[:-1]),
-                    'filesize': item['video_files_size'][vcodec][quality],
+                    'height': int_or_none(height),
+                    'filesize': int_or_none(item.get('video_files_size', {}).get(vcodec, {}).get(format_id)),
                 })
         self._sort_formats(formats)
 
index 67e8bfea03476ccf78d2470a973655f2a7213730..c1ee1decc433627ffa52196d44f7563b46d309cc 100644 (file)
@@ -15,7 +15,7 @@ class TwentyFourVideoIE(InfoExtractor):
     _TESTS = [
         {
             'url': 'http://www.24video.net/video/view/1044982',
-            'md5': '48dd7646775690a80447a8dca6a2df76',
+            'md5': 'd041af8b5b4246ea466226a0d6693345',
             'info_dict': {
                 'id': '1044982',
                 'ext': 'mp4',
@@ -54,7 +54,7 @@ class TwentyFourVideoIE(InfoExtractor):
             webpage, 'upload date'))
 
         uploader = self._html_search_regex(
-            r'Загрузил\s*<a href="/jsecUser/movies/[^"]+" class="link">([^<]+)</a>',
+            r'class="video-uploaded"[^>]*>\s*<a href="/jsecUser/movies/[^"]+"[^>]*>([^<]+)</a>',
             webpage, 'uploader', fatal=False)
 
         view_count = int_or_none(self._html_search_regex(
index e6ee1e4715efc5d47dd3f9aa32d6559a5737a8ea..f38a72fde8974a7a1ea290de04281f67079b1a16 100644 (file)
@@ -4,7 +4,10 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..utils import float_or_none
+from ..utils import (
+    ExtractorError,
+    float_or_none,
+)
 
 
 class VGTVIE(InfoExtractor):
@@ -59,16 +62,16 @@ class VGTVIE(InfoExtractor):
         },
         {
             # streamType: live
-            'url': 'http://www.vgtv.no/#!/live/100015/direkte-her-kan-du-se-laksen-live-fra-suldalslaagen',
+            'url': 'http://www.vgtv.no/#!/live/113063/direkte-v75-fra-solvalla',
             'info_dict': {
-                'id': '100015',
+                'id': '113063',
                 'ext': 'flv',
-                'title': 'DIREKTE: Her kan du se laksen live fra Suldalslågen!',
-                'description': 'md5:9a60cc23fa349f761628924e56eeec2d',
+                'title': 're:^DIREKTE: V75 fra Solvalla [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+                'description': 'md5:b3743425765355855f88e096acc93231',
                 'thumbnail': 're:^https?://.*\.jpg',
                 'duration': 0,
-                'timestamp': 1407423348,
-                'upload_date': '20140807',
+                'timestamp': 1432975582,
+                'upload_date': '20150530',
                 'view_count': int,
             },
             'params': {
@@ -97,7 +100,12 @@ class VGTVIE(InfoExtractor):
             % (host, video_id, HOST_WEBSITES[host]),
             video_id, 'Downloading media JSON')
 
+        if data.get('status') == 'inactive':
+            raise ExtractorError(
+                'Video %s is no longer available' % video_id, expected=True)
+
         streams = data['streamUrls']
+        stream_type = data.get('streamType')
 
         formats = []
 
@@ -107,7 +115,8 @@ class VGTVIE(InfoExtractor):
                 hls_url, video_id, 'mp4', m3u8_id='hls'))
 
         hds_url = streams.get('hds')
-        if hds_url:
+        # wasLive hds are always 404
+        if hds_url and stream_type != 'wasLive':
             formats.extend(self._extract_f4m_formats(
                 hds_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18',
                 video_id, f4m_id='hds'))
@@ -135,13 +144,14 @@ class VGTVIE(InfoExtractor):
 
         return {
             'id': video_id,
-            'title': data['title'],
+            'title': self._live_title(data['title']),
             'description': data['description'],
             'thumbnail': data['images']['main'] + '?t[]=900x506q80',
             'timestamp': data['published'],
             'duration': float_or_none(data['duration'], 1000),
             'view_count': data['displays'],
             'formats': formats,
+            'is_live': stream_type == 'live',
         }
 
 
index ececc7ee0118932716ca9bdb06779cf94e6dc0ec..591024eaded0cdddbb0779bd942c0fa8f63d86a6 100644 (file)
@@ -43,7 +43,7 @@ class VideoTtIE(InfoExtractor):
 
         formats = [
             {
-                'url': base64.b64decode(res['u']).decode('utf-8'),
+                'url': base64.b64decode(res['u'].encode('utf-8')).decode('utf-8'),
                 'ext': 'flv',
                 'format_id': res['l'],
             } for res in settings['res'] if res['u']
index bd953fb4cc212f50dce2cac624c9391a14e82898..e0b55078b2c9af8bf654ee6cd6982305074cb39b 100644 (file)
@@ -10,7 +10,7 @@ from ..utils import (
 
 class VidmeIE(InfoExtractor):
     _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]+)'
-    _TEST = {
+    _TESTS = [{
         'url': 'https://vid.me/QNB',
         'md5': 'f42d05e7149aeaec5c037b17e5d3dc82',
         'info_dict': {
@@ -23,9 +23,14 @@ class VidmeIE(InfoExtractor):
             'upload_date': '20140725',
             'thumbnail': 're:^https?://.*\.jpg',
         },
-    }
+    }, {
+        # From http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching
+        'url': 'https://vid.me/e/Wmur',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
+        url = url.replace('vid.me/e/', 'vid.me/')
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
index cf6af1e5cdb6315d325d2bd355d384cc283a3e0c..7f2fb1ca8896e29e48a41a9efddaded987ba1e96 100644 (file)
@@ -1,29 +1,65 @@
 from __future__ import unicode_literals
 
-import re
+import time
+import hmac
+import hashlib
+import itertools
 
-from ..compat import (
-    compat_urlparse,
-    compat_urllib_request,
-)
 from ..utils import (
     ExtractorError,
-    unescapeHTML,
-    unified_strdate,
-    US_RATINGS,
-    determine_ext,
-    mimetype2ext,
+    int_or_none,
+    parse_age_limit,
+    parse_iso8601,
 )
 from .common import InfoExtractor
 
 
-class VikiIE(InfoExtractor):
-    IE_NAME = 'viki'
+class VikiBaseIE(InfoExtractor):
+    _VALID_URL_BASE = r'https?://(?:www\.)?viki\.(?:com|net|mx|jp|fr)/'
+    _API_QUERY_TEMPLATE = '/v4/%sapp=%s&t=%s&site=www.viki.com'
+    _API_URL_TEMPLATE = 'http://api.viki.io%s&sig=%s'
+
+    _APP = '65535a'
+    _APP_VERSION = '2.2.5.1428709186'
+    _APP_SECRET = '-$iJ}@p7!G@SyU/je1bEyWg}upLu-6V6-Lg9VD(]siH,r.,m-r|ulZ,U4LC/SeR)'
+
+    def _prepare_call(self, path, timestamp=None):
+        path += '?' if '?' not in path else '&'
+        if not timestamp:
+            timestamp = int(time.time())
+        query = self._API_QUERY_TEMPLATE % (path, self._APP, timestamp)
+        sig = hmac.new(
+            self._APP_SECRET.encode('ascii'),
+            query.encode('ascii'),
+            hashlib.sha1
+        ).hexdigest()
+        return self._API_URL_TEMPLATE % (query, sig)
+
+    def _call_api(self, path, video_id, note, timestamp=None):
+        resp = self._download_json(
+            self._prepare_call(path, timestamp), video_id, note)
+
+        error = resp.get('error')
+        if error:
+            if error == 'invalid timestamp':
+                resp = self._download_json(
+                    self._prepare_call(path, int(resp['current_timestamp'])),
+                    video_id, '%s (retry)' % note)
+                error = resp.get('error')
+            if error:
+                self._raise_error(resp['error'])
+
+        return resp
 
-    # iPad2
-    _USER_AGENT = 'Mozilla/5.0(iPad; U; CPU OS 4_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8F191 Safari/6533.18.5'
+    def _raise_error(self, error):
+        raise ExtractorError(
+            '%s returned error: %s' % (self.IE_NAME, error),
+            expected=True)
 
-    _VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P<id>[0-9]+v)'
+
+class VikiIE(VikiBaseIE):
+    IE_NAME = 'viki'
+    _VALID_URL = r'%s(?:videos|player)/(?P<id>[0-9]+v)' % VikiBaseIE._VALID_URL_BASE
     _TESTS = [{
         'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14',
         'info_dict': {
@@ -37,111 +73,218 @@ class VikiIE(InfoExtractor):
         },
         'skip': 'Blocked in the US',
     }, {
+        # clip
         'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference',
-        'md5': 'ca6493e6f0a6ec07da9aa8d6304b4b2c',
+        'md5': '86c0b5dbd4d83a6611a79987cc7a1989',
         'info_dict': {
             'id': '1067139v',
             'ext': 'mp4',
+            'title': "'The Avengers: Age of Ultron' Press Conference",
             'description': 'md5:d70b2f9428f5488321bfe1db10d612ea',
+            'duration': 352,
+            'timestamp': 1430380829,
             'upload_date': '20150430',
-            'title': '\'The Avengers: Age of Ultron\' Press Conference',
+            'uploader': 'Arirang TV',
+            'like_count': int,
+            'age_limit': 0,
         }
     }, {
         'url': 'http://www.viki.com/videos/1048879v-ankhon-dekhi',
         'info_dict': {
             'id': '1048879v',
             'ext': 'mp4',
-            'upload_date': '20140820',
-            'description': 'md5:54ff56d51bdfc7a30441ec967394e91c',
             'title': 'Ankhon Dekhi',
+            'duration': 6512,
+            'timestamp': 1408532356,
+            'upload_date': '20140820',
+            'uploader': 'Spuul',
+            'like_count': int,
+            'age_limit': 13,
         },
         'params': {
-            # requires ffmpeg
+            # m3u8 download
             'skip_download': True,
         }
+    }, {
+        # episode
+        'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1',
+        'md5': '190f3ef426005ba3a080a63325955bc3',
+        'info_dict': {
+            'id': '44699v',
+            'ext': 'mp4',
+            'title': 'Boys Over Flowers - Episode 1',
+            'description': 'md5:52617e4f729c7d03bfd4bcbbb6e946f2',
+            'duration': 4155,
+            'timestamp': 1270496524,
+            'upload_date': '20100405',
+            'uploader': 'group8',
+            'like_count': int,
+            'age_limit': 13,
+        }
+    }, {
+        # youtube external
+        'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1',
+        'md5': '216d1afdc0c64d1febc1e9f2bd4b864b',
+        'info_dict': {
+            'id': '50562v',
+            'ext': 'mp4',
+            'title': 'Poor Nastya [COMPLETE] - Episode 1',
+            'description': '',
+            'duration': 607,
+            'timestamp': 1274949505,
+            'upload_date': '20101213',
+            'uploader': 'ad14065n',
+            'uploader_id': 'ad14065n',
+            'like_count': int,
+            'age_limit': 13,
+        }
+    }, {
+        'url': 'http://www.viki.com/player/44699v',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        webpage = self._download_webpage(url, video_id)
-        title = self._og_search_title(webpage)
-        description = self._og_search_description(webpage)
-        thumbnail = self._og_search_thumbnail(webpage)
-
-        uploader_m = re.search(
-            r'<strong>Broadcast Network: </strong>\s*([^<]*)<', webpage)
-        if uploader_m is None:
-            uploader = None
-        else:
-            uploader = uploader_m.group(1).strip()
-
-        rating_str = self._html_search_regex(
-            r'<strong>Rating: </strong>\s*([^<]*)<', webpage,
-            'rating information', default='').strip()
-        age_limit = US_RATINGS.get(rating_str)
-
-        req = compat_urllib_request.Request(
-            'http://www.viki.com/player5_fragment/%s?action=show&controller=videos' % video_id)
-        req.add_header('User-Agent', self._USER_AGENT)
-        info_webpage = self._download_webpage(
-            req, video_id, note='Downloading info page')
-        err_msg = self._html_search_regex(r'<div[^>]+class="video-error[^>]+>(.+)</div>', info_webpage, 'error message', default=None)
-        if err_msg:
-            if 'not available in your region' in err_msg:
-                raise ExtractorError(
-                    'Video %s is blocked from your location.' % video_id,
-                    expected=True)
-            else:
-                raise ExtractorError('Viki said: ' + err_msg)
-        mobj = re.search(
-            r'<source[^>]+type="(?P<mime_type>[^"]+)"[^>]+src="(?P<url>[^"]+)"', info_webpage)
-        if not mobj:
-            raise ExtractorError('Unable to find video URL')
-        video_url = unescapeHTML(mobj.group('url'))
-        video_ext = mimetype2ext(mobj.group('mime_type'))
-
-        if determine_ext(video_url) == 'm3u8':
-            formats = self._extract_m3u8_formats(
-                video_url, video_id, ext=video_ext)
-        else:
-            formats = [{
-                'url': video_url,
-                'ext': video_ext,
-            }]
-
-        upload_date_str = self._html_search_regex(
-            r'"created_at":"([^"]+)"', info_webpage, 'upload date')
-        upload_date = (
-            unified_strdate(upload_date_str)
-            if upload_date_str is not None
-            else None
-        )
-
-        # subtitles
-        video_subtitles = self.extract_subtitles(video_id, info_webpage)
-
-        return {
+        video = self._call_api(
+            'videos/%s.json' % video_id, video_id, 'Downloading video JSON')
+
+        title = None
+        titles = video.get('titles')
+        if titles:
+            title = titles.get('en') or list(titles.values())[0]
+        if not title:
+            title = 'Episode %d' % video.get('number') if video.get('type') == 'episode' else video.get('id') or video_id
+            container_titles = video.get('container', {}).get('titles')
+            if container_titles:
+                container_title = container_titles.get('en') or list(container_titles.values())[0]
+                title = '%s - %s' % (container_title, title)
+
+        descriptions = video.get('descriptions')
+        description = descriptions.get('en') or list(descriptions.values())[0] if descriptions else None
+
+        duration = int_or_none(video.get('duration'))
+        timestamp = parse_iso8601(video.get('created_at'))
+        uploader = video.get('author')
+        like_count = int_or_none(video.get('likes', {}).get('count'))
+        age_limit = parse_age_limit(video.get('rating'))
+
+        thumbnails = []
+        for thumbnail_id, thumbnail in video.get('images', {}).items():
+            thumbnails.append({
+                'id': thumbnail_id,
+                'url': thumbnail.get('url'),
+            })
+
+        subtitles = {}
+        for subtitle_lang, _ in video.get('subtitle_completions', {}).items():
+            subtitles[subtitle_lang] = [{
+                'ext': subtitles_format,
+                'url': self._prepare_call(
+                    'videos/%s/subtitles/%s.%s' % (video_id, subtitle_lang, subtitles_format)),
+            } for subtitles_format in ('srt', 'vtt')]
+
+        result = {
             'id': video_id,
             'title': title,
-            'formats': formats,
             'description': description,
-            'thumbnail': thumbnail,
-            'age_limit': age_limit,
+            'duration': duration,
+            'timestamp': timestamp,
             'uploader': uploader,
-            'subtitles': video_subtitles,
-            'upload_date': upload_date,
+            'like_count': like_count,
+            'age_limit': age_limit,
+            'thumbnails': thumbnails,
+            'subtitles': subtitles,
         }
 
-    def _get_subtitles(self, video_id, info_webpage):
-        res = {}
-        for sturl_html in re.findall(r'<track src="([^"]+)"', info_webpage):
-            sturl = unescapeHTML(sturl_html)
-            m = re.search(r'/(?P<lang>[a-z]+)\.vtt', sturl)
-            if not m:
-                continue
-            res[m.group('lang')] = [{
-                'url': compat_urlparse.urljoin('http://www.viki.com', sturl),
-                'ext': 'vtt',
-            }]
-        return res
+        streams = self._call_api(
+            'videos/%s/streams.json' % video_id, video_id,
+            'Downloading video streams JSON')
+
+        if 'external' in streams:
+            result.update({
+                '_type': 'url_transparent',
+                'url': streams['external']['url'],
+            })
+            return result
+
+        formats = []
+        for format_id, stream_dict in streams.items():
+            height = self._search_regex(
+                r'^(\d+)[pP]$', format_id, 'height', default=None)
+            for protocol, format_dict in stream_dict.items():
+                if format_id == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        format_dict['url'], video_id, 'mp4',
+                        m3u8_id='m3u8-%s' % protocol))
+                else:
+                    formats.append({
+                        'url': format_dict['url'],
+                        'format_id': '%s-%s' % (format_id, protocol),
+                        'height': int_or_none(height),
+                    })
+        self._sort_formats(formats)
+
+        result['formats'] = formats
+        return result
+
+
+class VikiChannelIE(VikiBaseIE):
+    IE_NAME = 'viki:channel'
+    _VALID_URL = r'%s(?:tv|news|movies|artists)/(?P<id>[0-9]+c)' % VikiBaseIE._VALID_URL_BASE
+    _TESTS = [{
+        'url': 'http://www.viki.com/tv/50c-boys-over-flowers',
+        'info_dict': {
+            'id': '50c',
+            'title': 'Boys Over Flowers',
+            'description': 'md5:ecd3cff47967fe193cff37c0bec52790',
+        },
+        'playlist_count': 70,
+    }, {
+        'url': 'http://www.viki.com/tv/1354c-poor-nastya-complete',
+        'info_dict': {
+            'id': '1354c',
+            'title': 'Poor Nastya [COMPLETE]',
+            'description': 'md5:05bf5471385aa8b21c18ad450e350525',
+        },
+        'playlist_count': 127,
+    }, {
+        'url': 'http://www.viki.com/news/24569c-showbiz-korea',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.viki.com/movies/22047c-pride-and-prejudice-2005',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.viki.com/artists/2141c-shinee',
+        'only_matching': True,
+    }]
+
+    _PER_PAGE = 25
+
+    def _real_extract(self, url):
+        channel_id = self._match_id(url)
+
+        channel = self._call_api(
+            'containers/%s.json' % channel_id, channel_id,
+            'Downloading channel JSON')
+
+        titles = channel['titles']
+        title = titles.get('en') or list(titles.values())[0]
+
+        descriptions = channel['descriptions']
+        description = descriptions.get('en') or list(descriptions.values())[0]
+
+        entries = []
+        for video_type in ('episodes', 'clips', 'movies'):
+            for page_num in itertools.count(1):
+                page = self._call_api(
+                    'containers/%s/%s.json?per_page=%d&sort=number&direction=asc&with_paging=true&page=%d'
+                    % (channel_id, video_type, self._PER_PAGE, page_num), channel_id,
+                    'Downloading %s JSON page #%d' % (video_type, page_num))
+                for video in page['response']:
+                    video_id = video['id']
+                    entries.append(self.url_result(
+                        'http://www.viki.com/videos/%s' % video_id, 'Viki'))
+                if not page['pagination']['next']:
+                    break
+
+        return self.playlist_result(entries, channel_id, title, description)
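
The new VikiBaseIE._prepare_call signs every API request with an HMAC-SHA1 of the query string before it is sent to api.viki.io. A standalone sketch of the same signing scheme, using placeholder app credentials instead of the values from the hunk:

    import hashlib
    import hmac
    import time

    APP = 'my-app-id'          # placeholder, not Viki's real app id
    APP_SECRET = b'my-secret'  # placeholder, not Viki's real secret

    def prepare_call(path, timestamp=None):
        # Build '/v4/<path>?app=...&t=...&site=www.viki.com' and append
        # an HMAC-SHA1 signature of that query, as _prepare_call does.
        path += '?' if '?' not in path else '&'
        if not timestamp:
            timestamp = int(time.time())
        query = '/v4/%sapp=%s&t=%s&site=www.viki.com' % (path, APP, timestamp)
        sig = hmac.new(APP_SECRET, query.encode('ascii'), hashlib.sha1).hexdigest()
        return 'http://api.viki.io%s&sig=%s' % (query, sig)

    print(prepare_call('videos/1023585v.json'))
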
index cc384adbf9837f35f90c64d0e8dc0396b0b601ec..d0e772108fe875100a8c2ceaec961814de0ab0e6 100644 (file)
@@ -119,8 +119,8 @@ class VKIE(InfoExtractor):
             'act': 'login',
             'role': 'al_frame',
             'expire': '1',
-            'email': username,
-            'pass': password,
+            'email': username.encode('cp1251'),
+            'pass': password.encode('cp1251'),
         }
 
         request = compat_urllib_request.Request('https://login.vk.com/?act=login',
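
The login fix above encodes the e-mail and password as cp1251 before urlencoding, presumably because login.vk.com expects the legacy Windows-1251 encoding for non-ASCII credentials. A small Python 3 sketch of building such a POST body (the credentials are made up):

    from urllib.parse import urlencode

    login_form = {
        'act': 'login',
        'role': 'al_frame',
        'expire': '1',
        # Windows-1251-encoded bytes so non-ASCII passwords survive the form encoding.
        'email': 'пример@example.com'.encode('cp1251'),
        'pass': 'пароль'.encode('cp1251'),
    }
    body = urlencode(login_form).encode('ascii')  # ready to POST to the login URL
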
index 1f9940cf5c1e4c8698a8a0bed9a874d5a79f18b0..3448bec4fdc96b361c0b071ab12832583533c14f 100644 (file)
@@ -785,7 +785,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             s = mobj.group(1)
             dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
             return '/signature/%s' % dec_s
-        dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
+        dash_manifest_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, dash_manifest_url)
         dash_doc = self._download_xml(
             dash_manifest_url, video_id,
             note='Downloading DASH manifest',
@@ -1126,12 +1126,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                     self.report_warning(
                         'Skipping DASH manifest: %r' % e, video_id)
                 else:
-                    # Hide the formats we found through non-DASH
+                    # Remove the formats found through non-DASH extraction:
+                    # they contain less info and may be wrong because we use
+                    # fixed values (for example the resolution). See
+                    # https://github.com/rg3/youtube-dl/issues/5774 for an
+                    # example.
                     dash_keys = set(df['format_id'] for df in dash_formats)
-                    for f in formats:
-                        if f['format_id'] in dash_keys:
-                            f['format_id'] = 'nondash-%s' % f['format_id']
-                            f['preference'] = f.get('preference', 0) - 10000
+                    formats = [f for f in formats if f['format_id'] not in dash_keys]
                     formats.extend(dash_formats)
 
         # Check for malformed aspect ratio
@@ -1289,7 +1290,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
     def _extract_playlist(self, playlist_id):
         url = self._TEMPLATE_URL % playlist_id
         page = self._download_webpage(url, playlist_id)
-        more_widget_html = content_html = page
 
         for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page):
             match = match.strip()
@@ -1309,36 +1309,36 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
                 self.report_warning('Youtube gives an alert message: ' + match)
 
         # Extract the video ids from the playlist pages
-        ids = []
-
-        for page_num in itertools.count(1):
-            matches = re.finditer(self._VIDEO_RE, content_html)
-            # We remove the duplicates and the link with index 0
-            # (it's not the first video of the playlist)
-            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
-            ids.extend(new_ids)
-
-            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
-            if not mobj:
-                break
+        def _entries():
+            more_widget_html = content_html = page
+            for page_num in itertools.count(1):
+                matches = re.finditer(self._VIDEO_RE, content_html)
+                # We remove the duplicates and the link with index 0
+                # (it's not the first video of the playlist)
+                new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
+                for vid_id in new_ids:
+                    yield self.url_result(vid_id, 'Youtube', video_id=vid_id)
+
+                mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
+                if not mobj:
+                    break
 
-            more = self._download_json(
-                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
-                'Downloading page #%s' % page_num,
-                transform_source=uppercase_escape)
-            content_html = more['content_html']
-            if not content_html.strip():
-                # Some webpages show a "Load more" button but they don't
-                # have more videos
-                break
-            more_widget_html = more['load_more_widget_html']
+                more = self._download_json(
+                    'https://youtube.com/%s' % mobj.group('more'), playlist_id,
+                    'Downloading page #%s' % page_num,
+                    transform_source=uppercase_escape)
+                content_html = more['content_html']
+                if not content_html.strip():
+                    # Some webpages show a "Load more" button but they don't
+                    # have more videos
+                    break
+                more_widget_html = more['load_more_widget_html']
 
         playlist_title = self._html_search_regex(
             r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
             page, 'title')
 
-        url_results = self._ids_to_results(ids)
-        return self.playlist_result(url_results, playlist_id, playlist_title)
+        return self.playlist_result(_entries(), playlist_id, playlist_title)
 
     def _real_extract(self, url):
         # Extract playlist id
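
The playlist rewrite above turns the eager page-scraping loop into a generator, so further pages are only downloaded while the caller keeps consuming entries (useful with options such as --playlist-end). A generic sketch of that lazy pagination pattern, with a stubbed page fetcher standing in for the "Load more" requests:

    import itertools

    def fetch_page(page_num):
        # Stand-in for the 'Load more' JSON request: two fake pages of ids.
        pages = {1: ['a', 'b'], 2: ['c']}
        return pages.get(page_num, [])

    def playlist_entries():
        # Yield ids page by page; stop when a page comes back empty,
        # mirroring the _entries() generator introduced above.
        for page_num in itertools.count(1):
            ids = fetch_page(page_num)
            if not ids:
                break
            for video_id in ids:
                yield video_id

    # Only as many pages are fetched as the consumer actually needs.
    assert list(itertools.islice(playlist_entries(), 2)) == ['a', 'b']
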
@@ -1398,6 +1398,24 @@ class YoutubeChannelIE(InfoExtractor):
         channel_id = self._match_id(url)
 
         url = self._TEMPLATE_URL % channel_id
+
+        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
+        # Workaround: extract as a playlist if we manage to obtain the channel playlist URL,
+        # otherwise fall back to channel-by-page extraction
+        channel_page = self._download_webpage(
+            url + '?view=57', channel_id,
+            'Downloading channel page', fatal=False)
+        channel_playlist_id = self._html_search_meta(
+            'channelId', channel_page, 'channel id', default=None)
+        if not channel_playlist_id:
+            channel_playlist_id = self._search_regex(
+                r'data-channel-external-id="([^"]+)"',
+                channel_page, 'channel id', default=None)
+        if channel_playlist_id and channel_playlist_id.startswith('UC'):
+            playlist_id = 'UU' + channel_playlist_id[2:]
+            return self.url_result(
+                compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')
+
         channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
         autogenerated = re.search(r'''(?x)
                 class="[^"]*?(?:
index 22dbc3aec7866ad3f5d048c35737486a8fdac8fc..689fa75951e2fd390b93fa550807337bd6185aa2 100644 (file)
@@ -145,7 +145,7 @@ def parseOpts(overrideArguments=None):
     general.add_option(
         '--list-extractors',
         action='store_true', dest='list_extractors', default=False,
-        help='List all supported extractors and the URLs they would handle')
+        help='List all supported extractors')
     general.add_option(
         '--extractor-descriptions',
         action='store_true', dest='list_extractor_descriptions', default=False,
@@ -537,7 +537,7 @@ def parseOpts(overrideArguments=None):
     verbosity.add_option(
         '--dump-pages', '--dump-intermediate-pages',
         action='store_true', dest='dump_intermediate_pages', default=False,
-        help='Print downloaded pages to debug problems (very verbose)')
+        help='Print downloaded pages encoded using base64 to debug problems (very verbose)')
     verbosity.add_option(
         '--write-pages',
         action='store_true', dest='write_pages', default=False,
@@ -713,7 +713,7 @@ def parseOpts(overrideArguments=None):
         help='Parse additional metadata like song title / artist from the video title. '
              'The format syntax is the same as --output, '
              'the parsed parameters replace existing values. '
-             'Additional templates: %(album), %(artist). '
+             'Additional templates: %(album)s, %(artist)s. '
              'Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like '
              '"Coldplay - Paradise"')
     postproc.add_option(
index 8f825f7859058c9c40cd55e50ec9832a92858c32..774494efd1dbc9af1901bf67d1453689e846b780 100644 (file)
@@ -49,7 +49,7 @@ class EmbedThumbnailPP(FFmpegPostProcessor):
             os.remove(encodeFilename(filename))
             os.rename(encodeFilename(temp_filename), encodeFilename(filename))
 
-        elif info['ext'] == 'm4a':
+        elif info['ext'] in ['m4a', 'mp4']:
             if not check_executable('AtomicParsley', ['-v']):
                 raise EmbedThumbnailPPError('AtomicParsley was not found. Please install.')
 
@@ -82,6 +82,6 @@ class EmbedThumbnailPP(FFmpegPostProcessor):
                 os.remove(encodeFilename(filename))
                 os.rename(encodeFilename(temp_filename), encodeFilename(filename))
         else:
-            raise EmbedThumbnailPPError('Only mp3 and m4a are supported for thumbnail embedding for now.')
+            raise EmbedThumbnailPPError('Only mp3 and m4a/mp4 are supported for thumbnail embedding for now.')
 
         return [], info
index de3169eef1d6ec29d82a60b2f4b6a68f49d7dd4e..fc7ac8305d71c8cce077ef3040cd0903ac9f09c5 100644 (file)
@@ -50,7 +50,7 @@ def rsa_verify(message, signature, key):
 def update_self(to_screen, verbose):
     """Update the program file with the latest version from the repository"""
 
-    UPDATE_URL = "http://rg3.github.io/youtube-dl/update/"
+    UPDATE_URL = "https://rg3.github.io/youtube-dl/update/"
     VERSION_URL = UPDATE_URL + 'LATEST_VERSION'
     JSON_URL = UPDATE_URL + 'versions.json'
     UPDATES_RSA_KEY = (0x9d60ee4d8f805312fdb15a62f87b95bd66177b91df176765d13514a0f1754bcd2057295c5b6f1d35daa6742c3ffc9a82d3e118861c207995a8031e151d863c9927e304576bc80692bc8e094896fcf11b66f3e29e04e3a71e9a11558558acea1840aec37fc396fb6b65dc81a1c4144e03bd1c011de62e3f1357b327d08426fe93, 65537)
index 38f00bc9bc2ef476ddb813d7bcd26e5d13f4947d..9cf84ff712da103b9c3ce0b3e2f0c9386082e574 100644 (file)
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '2015.05.15'
+__version__ = '2015.06.04.1'