video id, %(playlist)s for the playlist the
video is in, %(playlist_index)s for the
position in the playlist and %% for a
- literal percent. Use - to output to stdout.
- Can also be used to download to a different
+ literal percent. %(height)s and %(width)s
+ for the height and width of the video
+ format. %(resolution)s for a textual
+ description of the resolution of the video
+ format. Use - to output to stdout. Can also
+ be used to download to a different
directory, for example with -o '/my/downloa
ds/%(uploader)s/%(title)s-%(id)s.%(ext)s' .
--autonumber-size NUMBER Specifies the number of digits in
set -e
-skip_tests=false
-if [ "$1" = '--skip-test' ]; then
- skip_tests=true
+skip_tests=true
+if [ "$1" = '--run-tests' ]; then
+ skip_tests=false
shift
fi
def test_youtube_truncated(self):
self.assertMatch('http://www.youtube.com/watch?', ['youtube:truncated_url'])
+ def test_youtube_search_matching(self):
+ self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url'])
+ self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url'])
+
def test_justin_tv_channelid_matching(self):
self.assertTrue(JustinTVIE.suitable(u"justin.tv/vanillatv"))
self.assertTrue(JustinTVIE.suitable(u"twitch.tv/vanillatv"))
import hashlib
import io
import json
+import re
import socket
import youtube_dl.YoutubeDL
if 'playlist' not in test_case:
info_dict = test_case.get('info_dict', {})
if not test_case.get('file') and not (info_dict.get('id') and info_dict.get('ext')):
- print_skipping('The output file cannot be know, the "file" '
- 'key is missing or the info_dict is incomplete')
- return
+ raise Exception('Test definition incorrect. The output file cannot be known. Are both \'id\' and \'ext\' keys present?')
if 'skip' in test_case:
print_skipping(test_case['skip'])
return
with io.open(info_json_fn, encoding='utf-8') as infof:
info_dict = json.load(infof)
for (info_field, expected) in tc.get('info_dict', {}).items():
- if isinstance(expected, compat_str) and expected.startswith('md5:'):
- got = 'md5:' + md5(info_dict.get(info_field))
- else:
+ if isinstance(expected, compat_str) and expected.startswith('re:'):
got = info_dict.get(info_field)
- self.assertEqual(expected, got,
- u'invalid value for field %s, expected %r, got %r' % (info_field, expected, got))
+ match_str = expected[len('re:'):]
+ match_rex = re.compile(match_str)
+
+ self.assertTrue(
+ isinstance(got, compat_str) and match_rex.match(got),
+ u'field %s (value: %r) should match %r' % (info_field, got, match_str))
+ else:
+ if isinstance(expected, compat_str) and expected.startswith('md5:'):
+ got = 'md5:' + md5(info_dict.get(info_field))
+ else:
+ got = info_dict.get(info_field)
+ self.assertEqual(expected, got,
+ u'invalid value for field %s, expected %r, got %r' % (info_field, expected, got))
# If checkable fields are missing from the test case, print the info_dict
test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value))
RutubeChannelIE,
GoogleSearchIE,
GenericIE,
+ TEDIE,
)
def test_AcademicEarthCourse(self):
dl = FakeYDL()
ie = AcademicEarthCourseIE(dl)
- result = ie.extract('http://academicearth.org/courses/building-dynamic-websites/')
+ result = ie.extract('http://academicearth.org/playlists/laws-of-nature/')
self.assertIsPlaylist(result)
- self.assertEqual(result['id'], 'building-dynamic-websites')
- self.assertEqual(result['title'], 'Building Dynamic Websites')
- self.assertEqual(result['description'], u"Today's websites are increasingly dynamic. Pages are no longer static HTML files but instead generated by scripts and database calls. User interfaces are more seamless, with technologies like Ajax replacing traditional page reloads. This course teaches students how to build dynamic websites with Ajax and with Linux, Apache, MySQL, and PHP (LAMP), one of today's most popular frameworks. Students learn how to set up domain names with DNS, how to structure pages with XHTML and CSS, how to program in JavaScript and PHP, how to configure Apache and MySQL, how to design and query databases with SQL, how to use Ajax with both XML and JSON, and how to build mashups. The course explores issues of security, scalability, and cross-browser support and also discusses enterprise-level deployments of websites, including third-party hosting, virtualization, colocation in data centers, firewalling, and load-balancing.")
- self.assertEqual(len(result['entries']), 10)
+ self.assertEqual(result['id'], 'laws-of-nature')
+ self.assertEqual(result['title'], 'Laws of Nature')
+ self.assertEqual(result['description'], u'Introduce yourself to the laws of nature with these free online college lectures from Yale, Harvard, and MIT.')
+ self.assertEqual(len(result['entries']), 4)
def test_ivi_compilation(self):
dl = FakeYDL()
self.assertEqual(result['title'], 'Zero Punctuation')
self.assertTrue(len(result['entries']) > 10)
+ def test_ted_playlist(self):
+ dl = FakeYDL()
+ ie = TEDIE(dl)
+ result = ie.extract('http://www.ted.com/playlists/who_are_the_hackers')
+ self.assertIsPlaylist(result)
+ self.assertEqual(result['id'], '10')
+ self.assertEqual(result['title'], 'Who are the hackers?')
+ self.assertTrue(len(result['entries']) >= 6)
+
if __name__ == '__main__':
unittest.main()
# Various small unit tests
+import io
import xml.etree.ElementTree
#from youtube_dl.utils import htmlentity_transform
orderedSet,
PagedList,
parse_duration,
+ read_batch_urls,
sanitize_filename,
shell_quote,
smuggle_url,
def test_struct_unpack(self):
self.assertEqual(struct_unpack(u'!B', b'\x00'), (0,))
+ def test_read_batch_urls(self):
+ f = io.StringIO(u'''\xef\xbb\xbf foo
+ bar\r
+ baz
+ # More after this line\r
+ ; or after this
+ bam''')
+ self.assertEqual(read_batch_urls(f), [u'foo', u'bar', u'baz', u'bam'])
+
if __name__ == '__main__':
unittest.main()
YoutubeChannelIE,
YoutubeShowIE,
YoutubeTopListIE,
+ YoutubeSearchURLIE,
)
self.assertEqual(original_video['id'], 'rjFaenf1T-Y')
def test_youtube_toptracks(self):
+ print('Skipping: The playlist page gives error 500')
+ return
dl = FakeYDL()
ie = YoutubePlaylistIE(dl)
result = ie.extract('https://www.youtube.com/playlist?list=MCUS')
entries = result['entries']
self.assertTrue(len(entries) >= 5)
+ def test_youtube_search_url(self):
+ dl = FakeYDL()
+ ie = YoutubeSearchURLIE(dl)
+ result = ie.extract('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video')
+ entries = result['entries']
+ self.assertIsPlaylist(result)
+ self.assertEqual(result['title'], 'youtube-dl test video')
+ self.assertTrue(len(entries) >= 5)
+
if __name__ == '__main__':
unittest.main()
template_dict['autonumber'] = autonumber_templ % self._num_downloads
if template_dict.get('playlist_index') is not None:
template_dict['playlist_index'] = '%05d' % template_dict['playlist_index']
+ if template_dict.get('resolution') is None:
+ if template_dict.get('width') and template_dict.get('height'):
+ template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
+ elif template_dict.get('height'):
+ res = '%sp' % template_dict['height']
+ elif template_dict.get('width'):
+ res = '?x%d' % template_dict['width']
sanitize = lambda k, v: sanitize_filename(
compat_str(v),
info_dict['playlist'] = None
info_dict['playlist_index'] = None
+ if 'display_id' not in info_dict and 'id' in info_dict:
+ info_dict['display_id'] = info_dict['id']
+
# This extractors handle format selection themselves
if info_dict['extractor'] in ['Youku']:
if download:
'Andreas Schmitz',
'Michael Kaiser',
'Niklas Laxström',
+ 'David Triendl',
+ 'Anthony Weems',
+ 'David Wagner',
+ 'Juan C. Olivares',
)
__license__ = 'Public Domain'
import codecs
import getpass
+import io
import locale
import optparse
import os
get_cachedir,
MaxDownloadsReached,
preferredencoding,
+ read_batch_urls,
SameFileError,
setproctitle,
std_headers,
'%(extractor)s for the provider (youtube, metacafe, etc), '
'%(id)s for the video id, %(playlist)s for the playlist the video is in, '
'%(playlist_index)s for the position in the playlist and %% for a literal percent. '
+ '%(height)s and %(width)s for the height and width of the video format. '
+ '%(resolution)s for a textual description of the resolution of the video format. '
'Use - to output to stdout. Can also be used to download to a different directory, '
'for example with -o \'/my/downloads/%(uploader)s/%(title)s-%(id)s.%(ext)s\' .'))
filesystem.add_option('--autonumber-size',
sys.exit(0)
# Batch file verification
- batchurls = []
+ batch_urls = []
if opts.batchfile is not None:
try:
if opts.batchfile == '-':
batchfd = sys.stdin
else:
- batchfd = open(opts.batchfile, 'r')
- batchurls = batchfd.readlines()
- batchurls = [x.strip() for x in batchurls]
- batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
+ batchfd = io.open(opts.batchfile, 'r', encoding='utf-8', errors='ignore')
+ batch_urls = read_batch_urls(batchfd)
if opts.verbose:
- write_string(u'[debug] Batch file urls: ' + repr(batchurls) + u'\n')
+ write_string(u'[debug] Batch file urls: ' + repr(batch_urls) + u'\n')
except IOError:
sys.exit(u'ERROR: batch file could not be read')
- all_urls = batchurls + args
+ all_urls = batch_urls + args
all_urls = [url.strip() for url in all_urls]
_enc = preferredencoding()
all_urls = [url.decode(_enc, 'ignore') if isinstance(url, bytes) else url for url in all_urls]
from ..utils import (
struct_pack,
struct_unpack,
- compat_urllib_request,
compat_urlparse,
format_bytes,
encodeFilename,
self.read_unsigned_char()
# flags
self.read(3)
- # BootstrapinfoVersion
- bootstrap_info_version = self.read_unsigned_int()
+
+ self.read_unsigned_int() # BootstrapinfoVersion
# Profile,Live,Update,Reserved
self.read(1)
# time scale
self.read_unsigned_long_long()
# SmpteTimeCodeOffset
self.read_unsigned_long_long()
- # MovieIdentifier
- movie_identifier = self.read_string()
+
+ self.read_string() # MovieIdentifier
server_count = self.read_unsigned_char()
# ServerEntryTable
for i in range(server_count):
self.read_string()
quality_count = self.read_unsigned_char()
# QualityEntryTable
- for i in range(server_count):
+ for i in range(quality_count):
self.read_string()
# DrmData
self.read_string()
else:
# The length does not match, we start the download over
self.report_unable_to_resume()
+ resume_len = 0
open_mode = 'wb'
break
# Retry
from .blinkx import BlinkxIE
from .bliptv import BlipTVIE, BlipTVUserIE
from .bloomberg import BloombergIE
+from .br import BRIE
from .breakcom import BreakIE
from .brightcove import BrightcoveIE
from .c56 import C56IE
+from .canal13cl import Canal13clIE
from .canalplus import CanalplusIE
from .canalc2 import Canalc2IE
from .cbs import CBSIE
+from .ceskatelevize import CeskaTelevizeIE
from .channel9 import Channel9IE
from .chilloutzone import ChilloutzoneIE
from .cinemassacre import CinemassacreIE
from .gamekings import GamekingsIE
from .gamespot import GameSpotIE
from .gametrailers import GametrailersIE
+from .gdcvault import GDCVaultIE
from .generic import GenericIE
from .googleplus import GooglePlusIE
from .googlesearch import GoogleSearchIE
)
from .m6 import M6IE
from .macgamestore import MacGameStoreIE
+from .mailru import MailRuIE
from .malemotion import MalemotionIE
from .mdr import MDRIE
from .metacafe import MetacafeIE
from .metacritic import MetacriticIE
-from .mit import TechTVMITIE, MITIE
+from .mit import TechTVMITIE, MITIE, OCWMITIE
from .mixcloud import MixcloudIE
from .mpora import MporaIE
from .mofosex import MofosexIE
from .myvideo import MyVideoIE
from .naver import NaverIE
from .nba import NBAIE
-from .nbc import NBCNewsIE
+from .nbc import (
+ NBCIE,
+ NBCNewsIE,
+)
from .ndr import NDRIE
from .ndtv import NDTVIE
from .newgrounds import NewgroundsIE
from .niconico import NiconicoIE
from .ninegag import NineGagIE
from .normalboots import NormalbootsIE
-from .novamov import NovamovIE
+from .novamov import NovaMovIE
from .nowness import NownessIE
from .nowvideo import NowVideoIE
from .ooyala import OoyalaIE
from .pornhd import PornHdIE
from .pornhub import PornHubIE
from .pornotube import PornotubeIE
+from .prosiebensat1 import ProSiebenSat1IE
from .pyvideo import PyvideoIE
from .radiofrance import RadioFranceIE
from .rbmaradio import RBMARadioIE
from .toutv import TouTvIE
from .traileraddict import TrailerAddictIE
from .trilulilu import TriluliluIE
+from .trutube import TruTubeIE
from .tube8 import Tube8IE
from .tudou import TudouIE
from .tumblr import TumblrIE
from .tutv import TutvIE
+from .tvigle import TvigleIE
from .tvp import TvpIE
from .unistra import UnistraIE
from .ustream import UstreamIE, UstreamChannelIE
from .vevo import VevoIE
from .vice import ViceIE
from .viddler import ViddlerIE
+from .videobam import VideoBamIE
from .videodetective import VideoDetectiveIE
from .videofyme import VideofyMeIE
from .videopremium import VideoPremiumIE
from .youporn import YouPornIE
from .youtube import (
YoutubeIE,
+ YoutubeChannelIE,
+ YoutubeFavouritesIE,
+ YoutubeHistoryIE,
YoutubePlaylistIE,
- YoutubeSearchIE,
+ YoutubeRecommendedIE,
YoutubeSearchDateIE,
- YoutubeUserIE,
- YoutubeChannelIE,
+ YoutubeSearchIE,
+ YoutubeSearchURLIE,
YoutubeShowIE,
YoutubeSubscriptionsIE,
- YoutubeRecommendedIE,
+ YoutubeTopListIE,
YoutubeTruncatedURLIE,
+ YoutubeUserIE,
YoutubeWatchLaterIE,
- YoutubeFavouritesIE,
- YoutubeHistoryIE,
- YoutubeTopListIE,
)
from .zdf import ZDFIE
class AcademicEarthCourseIE(InfoExtractor):
- _VALID_URL = r'^https?://(?:www\.)?academicearth\.org/(?:courses|playlists)/(?P<id>[^?#/]+)'
+ _VALID_URL = r'^https?://(?:www\.)?academicearth\.org/playlists/(?P<id>[^?#/]+)'
IE_NAME = 'AcademicEarth:Course'
def _real_extract(self, url):
webpage = self._download_webpage(url, playlist_id)
title = self._html_search_regex(
- r'<h1 class="playlist-name">(.*?)</h1>', webpage, u'title')
+ r'<h1 class="playlist-name"[^>]*?>(.*?)</h1>', webpage, u'title')
description = self._html_search_regex(
- r'<p class="excerpt">(.*?)</p>',
+ r'<p class="excerpt"[^>]*?>(.*?)</p>',
webpage, u'description', fatal=False)
urls = re.findall(
- r'<h3 class="lecture-title"><a target="_blank" href="([^"]+)">',
+ r'<li class="lecture-preview">\s*?<a target="_blank" href="([^"]+)">',
webpage)
entries = [self.url_result(u) for u in urls]
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class BRIE(InfoExtractor):
+ IE_DESC = "Bayerischer Rundfunk Mediathek"
+ _VALID_URL = r"^https?://(?:www\.)?br\.de/mediathek/video/(?:sendungen/)?(?P<id>[a-z0-9\-]+)\.html$"
+ _BASE_URL = "http://www.br.de"
+
+ _TEST = {
+ "url": "http://www.br.de/mediathek/video/anselm-gruen-114.html",
+ "md5": "c4f83cf0f023ba5875aba0bf46860df2",
+ "info_dict": {
+ "id": "2c8d81c5-6fb7-4a74-88d4-e768e5856532",
+ "ext": "mp4",
+ "title": "Feiern und Verzichten",
+ "description": "Anselm Grün: Feiern und Verzichten",
+ "uploader": "BR/Birgit Baier",
+ "upload_date": "20140301"
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ display_id = mobj.group('id')
+ page = self._download_webpage(url, display_id)
+ xml_url = self._search_regex(
+ r"return BRavFramework\.register\(BRavFramework\('avPlayer_(?:[a-f0-9-]{36})'\)\.setup\({dataURL:'(/mediathek/video/[a-z0-9/~_.-]+)'}\)\);", page, "XMLURL")
+ xml = self._download_xml(self._BASE_URL + xml_url, None)
+
+ videos = [{
+ "id": xml_video.get("externalId"),
+ "title": xml_video.find("title").text,
+ "formats": self._extract_formats(xml_video.find("assets")),
+ "thumbnails": self._extract_thumbnails(xml_video.find("teaserImage/variants")),
+ "description": " ".join(xml_video.find("shareTitle").text.splitlines()),
+ "uploader": xml_video.find("author").text,
+ "upload_date": "".join(reversed(xml_video.find("broadcastDate").text.split("."))),
+ "webpage_url": xml_video.find("permalink").text,
+ } for xml_video in xml.findall("video")]
+
+ if len(videos) > 1:
+ self._downloader.report_warning(
+ 'found multiple videos; please '
+ 'report this with the video URL to http://yt-dl.org/bug')
+ if not videos:
+ raise ExtractorError('No video entries found')
+ return videos[0]
+
+ def _extract_formats(self, assets):
+ formats = [{
+ "url": asset.find("downloadUrl").text,
+ "ext": asset.find("mediaType").text,
+ "format_id": asset.get("type"),
+ "width": int(asset.find("frameWidth").text),
+ "height": int(asset.find("frameHeight").text),
+ "tbr": int(asset.find("bitrateVideo").text),
+ "abr": int(asset.find("bitrateAudio").text),
+ "vcodec": asset.find("codecVideo").text,
+ "container": asset.find("mediaType").text,
+ "filesize": int(asset.find("size").text),
+ } for asset in assets.findall("asset")
+ if asset.find("downloadUrl") is not None]
+
+ self._sort_formats(formats)
+ return formats
+
+ def _extract_thumbnails(self, variants):
+ thumbnails = [{
+ "url": self._BASE_URL + variant.find("url").text,
+ "width": int(variant.find("width").text),
+ "height": int(variant.find("height").text),
+ } for variant in variants.findall("variant")]
+ thumbnails.sort(key=lambda x: x["width"] * x["height"], reverse=True)
+ return thumbnails
video_id = mobj.group(1).split("-")[-1]
embed_url = 'http://www.break.com/embed/%s' % video_id
webpage = self._download_webpage(embed_url, video_id)
- info_json = self._search_regex(r'var embedVars = ({.*?});', webpage,
- 'info json', flags=re.DOTALL)
+ info_json = self._search_regex(r'var embedVars = ({.*})\s*?</script>',
+ webpage, 'info json', flags=re.DOTALL)
info = json.loads(info_json)
video_url = info['videoUri']
m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', video_url)
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class Canal13clIE(InfoExtractor):
+ _VALID_URL = r'^http://(?:www\.)?13\.cl/(?:[^/?#]+/)*(?P<id>[^/?#]+)'
+ _TEST = {
+ 'url': 'http://www.13.cl/t13/nacional/el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda',
+ 'md5': '4cb1fa38adcad8fea88487a078831755',
+ 'info_dict': {
+ 'id': '1403022125',
+ 'display_id': 'el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda',
+ 'ext': 'mp4',
+ 'title': 'El "círculo de hierro" de Michelle Bachelet en su regreso a La Moneda',
+ 'description': '(Foto: Agencia Uno) En nueve días más, Michelle Bachelet va a asumir por segunda vez como presidenta de la República. Entre aquellos que la acompañarán hay caras que se repiten y otras que se consolidan en su entorno de colaboradores más cercanos.',
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ display_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, display_id)
+
+ title = self._html_search_meta(
+ 'twitter:title', webpage, 'title', fatal=True)
+ description = self._html_search_meta(
+ 'twitter:description', webpage, 'description')
+ url = self._html_search_regex(
+ r'articuloVideo = \"(.*?)\"', webpage, 'url')
+ real_id = self._search_regex(
+ r'[^0-9]([0-9]{7,})[^0-9]', url, 'id', default=display_id)
+ thumbnail = self._html_search_regex(
+ r'articuloImagen = \"(.*?)\"', webpage, 'thumbnail')
+
+ return {
+ 'id': real_id,
+ 'display_id': display_id,
+ 'url': url,
+ 'title': title,
+ 'description': description,
+ 'ext': 'mp4',
+ 'thumbnail': thumbnail,
+ }
--- /dev/null
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_urllib_request,
+ compat_urllib_parse,
+ compat_urllib_parse_urlparse,
+ ExtractorError,
+)
+
+
+class CeskaTelevizeIE(InfoExtractor):
+ _VALID_URL = r'https?://www\.ceskatelevize\.cz/(porady|ivysilani)/(.+/)?(?P<id>[^?#]+)'
+
+ _TESTS = [
+ {
+ 'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/213512120230004-spanelska-chripka',
+ 'info_dict': {
+ 'id': '213512120230004',
+ 'ext': 'flv',
+ 'title': 'První republika: Španělská chřipka',
+ 'duration': 3107.4,
+ },
+ 'params': {
+ 'skip_download': True, # requires rtmpdump
+ },
+ 'skip': 'Works only from Czech Republic.',
+ },
+ {
+ 'url': 'http://www.ceskatelevize.cz/ivysilani/1030584952-tsatsiki-maminka-a-policajt',
+ 'info_dict': {
+ 'id': '20138143440',
+ 'ext': 'flv',
+ 'title': 'Tsatsiki, maminka a policajt',
+ 'duration': 6754.1,
+ },
+ 'params': {
+ 'skip_download': True, # requires rtmpdump
+ },
+ 'skip': 'Works only from Czech Republic.',
+ },
+ {
+ 'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/bonus/14716-zpevacka-z-duparny-bobina',
+ 'info_dict': {
+ 'id': '14716',
+ 'ext': 'flv',
+ 'title': 'První republika: Zpěvačka z Dupárny Bobina',
+ 'duration': 90,
+ },
+ 'params': {
+ 'skip_download': True, # requires rtmpdump
+ },
+ },
+ ]
+
+ def _real_extract(self, url):
+ url = url.replace('/porady/', '/ivysilani/').replace('/video/', '')
+
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+
+ NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.'
+ if '%s</p>' % NOT_AVAILABLE_STRING in webpage:
+ raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)
+
+ typ = self._html_search_regex(r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],', webpage, 'type')
+ episode_id = self._html_search_regex(r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],', webpage, 'episode_id')
+
+ data = {
+ 'playlist[0][type]': typ,
+ 'playlist[0][id]': episode_id,
+ 'requestUrl': compat_urllib_parse_urlparse(url).path,
+ 'requestSource': 'iVysilani',
+ }
+
+ req = compat_urllib_request.Request('http://www.ceskatelevize.cz/ivysilani/ajax/get-playlist-url',
+ data=compat_urllib_parse.urlencode(data))
+
+ req.add_header('Content-type', 'application/x-www-form-urlencoded')
+ req.add_header('x-addr', '127.0.0.1')
+ req.add_header('X-Requested-With', 'XMLHttpRequest')
+ req.add_header('Referer', url)
+
+ playlistpage = self._download_json(req, video_id)
+
+ req = compat_urllib_request.Request(compat_urllib_parse.unquote(playlistpage['url']))
+ req.add_header('Referer', url)
+
+ playlist = self._download_xml(req, video_id)
+
+ formats = []
+ for i in playlist.find('smilRoot/body'):
+ if 'AD' not in i.attrib['id']:
+ base_url = i.attrib['base']
+ parsedurl = compat_urllib_parse_urlparse(base_url)
+ duration = i.attrib['duration']
+
+ for video in i.findall('video'):
+ if video.attrib['label'] != 'AD':
+ format_id = video.attrib['label']
+ play_path = video.attrib['src']
+ vbr = int(video.attrib['system-bitrate'])
+
+ formats.append({
+ 'format_id': format_id,
+ 'url': base_url,
+ 'vbr': vbr,
+ 'play_path': play_path,
+ 'app': parsedurl.path[1:] + '?' + parsedurl.query,
+ 'rtmp_live': True,
+ 'ext': 'flv',
+ })
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': episode_id,
+ 'title': self._html_search_regex(r'<title>(.+?) — iVysílání — Česká televize</title>', webpage, 'title'),
+ 'duration': float(duration),
+ 'formats': formats,
+ }
# encoding: utf-8
+from __future__ import unicode_literals
import re
from .common import InfoExtractor
class CinemassacreIE(InfoExtractor):
- _VALID_URL = r'(?:http://)?(?:www\.)?(?P<url>cinemassacre\.com/(?P<date_Y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/.+?)(?:[/?].*)?'
- _TESTS = [{
- u'url': u'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/',
- u'file': u'19911.flv',
- u'info_dict': {
- u'upload_date': u'20121110',
- u'title': u'“Angry Video Game Nerd: The Movie” – Trailer',
- u'description': u'md5:fb87405fcb42a331742a0dce2708560b',
- },
- u'params': {
- # rtmp download
- u'skip_download': True,
- },
- },
- {
- u'url': u'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940',
- u'file': u'521be8ef82b16.flv',
- u'info_dict': {
- u'upload_date': u'20131002',
- u'title': u'The Mummy’s Hand (1940)',
- },
- u'params': {
- # rtmp download
- u'skip_download': True,
+ _VALID_URL = r'http://(?:www\.)?cinemassacre\.com/(?P<date_Y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/.+?'
+ _TESTS = [
+ {
+ 'url': 'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/',
+ 'file': '19911.mp4',
+ 'md5': 'fde81fbafaee331785f58cd6c0d46190',
+ 'info_dict': {
+ 'upload_date': '20121110',
+ 'title': '“Angry Video Game Nerd: The Movie” – Trailer',
+ 'description': 'md5:fb87405fcb42a331742a0dce2708560b',
+ },
},
- }]
+ {
+ 'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940',
+ 'file': '521be8ef82b16.mp4',
+ 'md5': 'd72f10cd39eac4215048f62ab477a511',
+ 'info_dict': {
+ 'upload_date': '20131002',
+ 'title': 'The Mummy’s Hand (1940)',
+ },
+ }
+ ]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- webpage_url = u'http://' + mobj.group('url')
- webpage = self._download_webpage(webpage_url, None) # Don't know video id yet
+ webpage = self._download_webpage(url, None) # Don't know video id yet
video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d')
mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?id=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage)
if not mobj:
- raise ExtractorError(u'Can\'t extract embed url and video id')
- playerdata_url = mobj.group(u'embed_url')
- video_id = mobj.group(u'video_id')
+ raise ExtractorError('Can\'t extract embed url and video id')
+ playerdata_url = mobj.group('embed_url')
+ video_id = mobj.group('video_id')
video_title = self._html_search_regex(r'<title>(?P<title>.+?)\|',
- webpage, u'title')
+ webpage, 'title')
video_description = self._html_search_regex(r'<div class="entry-content">(?P<description>.+?)</div>',
- webpage, u'description', flags=re.DOTALL, fatal=False)
+ webpage, 'description', flags=re.DOTALL, fatal=False)
if len(video_description) == 0:
video_description = None
playerdata = self._download_webpage(playerdata_url, video_id)
- url = self._html_search_regex(r'\'streamer\': \'(?P<url>[^\']+)\'', playerdata, u'url')
- sd_file = self._html_search_regex(r'\'file\': \'(?P<sd_file>[^\']+)\'', playerdata, u'sd_file')
- hd_file = self._html_search_regex(r'\'?file\'?: "(?P<hd_file>[^"]+)"', playerdata, u'hd_file')
- video_thumbnail = self._html_search_regex(r'\'image\': \'(?P<thumbnail>[^\']+)\'', playerdata, u'thumbnail', fatal=False)
+ sd_url = self._html_search_regex(r'file: \'(?P<sd_file>[^\']+)\', label: \'SD\'', playerdata, 'sd_file')
+ hd_url = self._html_search_regex(r'file: \'(?P<hd_file>[^\']+)\', label: \'HD\'', playerdata, 'hd_file')
+ video_thumbnail = self._html_search_regex(r'image: \'(?P<thumbnail>[^\']+)\'', playerdata, 'thumbnail', fatal=False)
formats = [
{
- 'url': url,
- 'play_path': 'mp4:' + sd_file,
- 'rtmp_live': True, # workaround
- 'ext': 'flv',
+ 'url': sd_url,
+ 'ext': 'mp4',
'format': 'sd',
'format_id': 'sd',
},
{
- 'url': url,
- 'play_path': 'mp4:' + hd_file,
- 'rtmp_live': True, # workaround
- 'ext': 'flv',
+ 'url': hd_url,
+ 'ext': 'mp4',
'format': 'hd',
'format_id': 'hd',
},
'id': 'W5gMp3ZjYg4',
'ext': 'mp4',
'title': 'Funny Dogs Protecting Babies Compilation 2014 [NEW HD]',
- 'uploader': 'Funnyplox TV',
+ 'uploader': 'FunnyPlox TV',
'uploader_id': 'funnyploxtv',
'description': 'md5:7ded37421526d54afdf005e25bc2b7a3',
'upload_date': '20140128',
The following fields are optional:
+ display_id An alternative identifier for the video, not necessarily
+ unique, but available before title. Typically, id is
+ something like "4234987", title "Dancing naked mole rats",
+ and display_id "dancing-naked-mole-rats"
thumbnails: A list of dictionaries (with the entries "resolution" and
"url") for the varying thumbnails
thumbnail: Full URL to a video thumbnail image.
if secure: regexes = self._og_regexes('video:secure_url') + regexes
return self._html_search_regex(regexes, html, name, **kargs)
- def _html_search_meta(self, name, html, display_name=None):
+ def _html_search_meta(self, name, html, display_name=None, fatal=False):
if display_name is None:
display_name = name
return self._html_search_regex(
r'''(?ix)<meta
(?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
[^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
- html, display_name, fatal=False)
+ html, display_name, fatal=fatal)
def _dc_search_uploader(self, html):
return self._html_search_meta('dc.creator', html, 'uploader')
# encoding: utf-8
from __future__ import unicode_literals
-import re, base64, zlib
+import re
+import json
+import base64
+import zlib
+
from hashlib import sha1
from math import pow, sqrt, floor
from .common import InfoExtractor
inc,
)
+
class CrunchyrollIE(InfoExtractor):
- _VALID_URL = r'(?:https?://)?(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)'
- _TESTS = [{
+ _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)'
+ _TEST = {
'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513',
- 'file': '645513.flv',
#'md5': 'b1639fd6ddfaa43788c85f6d1dddd412',
'info_dict': {
+ 'id': '645513',
+ 'ext': 'flv',
'title': 'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is Born!',
'description': 'md5:2d17137920c64f2f49981a7797d275ef',
'thumbnail': 'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg',
# rtmp
'skip_download': True,
},
- }]
+ }
_FORMAT_IDS = {
'360': ('60', '106'),
shaHash = bytes_to_intlist(sha1(prefix + str(num4).encode('ascii')).digest())
# Extend 160 Bit hash to 256 Bit
return shaHash + [0] * 12
-
+
key = obfuscate_key(id)
class Counter:
__value = iv
return zlib.decompress(decrypted_data)
def _convert_subtitles_to_srt(self, subtitles):
- i=1
output = ''
- for start, end, text in re.findall(r'<event [^>]*?start="([^"]+)" [^>]*?end="([^"]+)" [^>]*?text="([^"]+)"[^>]*?>', subtitles):
+ for i, (start, end, text) in enumerate(re.findall(r'<event [^>]*?start="([^"]+)" [^>]*?end="([^"]+)" [^>]*?text="([^"]+)"[^>]*?>', subtitles), 1):
start = start.replace('.', ',')
end = end.replace('.', ',')
text = clean_html(text)
if not text:
continue
output += '%d\n%s --> %s\n%s\n\n' % (i, start, end, text)
- i+=1
return output
def _real_extract(self,url):
if note_m:
raise ExtractorError(note_m)
+ mobj = re.search(r'Page\.messaging_box_controller\.addItems\(\[(?P<msg>{.+?})\]\)', webpage)
+ if mobj:
+ msg = json.loads(mobj.group('msg'))
+ if msg.get('type') == 'error':
+ raise ExtractorError('crunchyroll returned error: %s' % msg['message_body'], expected=True)
+
video_title = self._html_search_regex(r'<h1[^>]*>(.+?)</h1>', webpage, 'video_title', flags=re.DOTALL)
video_title = re.sub(r' {2,}', ' ', video_title)
video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, 'video_description', default='')
playerdata_req.data = compat_urllib_parse.urlencode({'current_page': webpage_url})
playerdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
playerdata = self._download_webpage(playerdata_req, video_id, note='Downloading media info')
-
+
stream_id = self._search_regex(r'<media_id>([^<]+)', playerdata, 'stream_id')
video_thumbnail = self._search_regex(r'<episode_image_url>([^<]+)', playerdata, 'thumbnail', fatal=False)
data = base64.b64decode(data)
subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8')
- lang_code = self._search_regex(r'lang_code=\'([^\']+)', subtitle, 'subtitle_lang_code', fatal=False)
+ lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False)
if not lang_code:
continue
subtitles[lang_code] = self._convert_subtitles_to_srt(subtitle)
get_element_by_id,
orderedSet,
str_to_int,
+ int_or_none,
ExtractorError,
)
if video_url is not None:
m_size = re.search(r'H264-(\d+)x(\d+)', video_url)
if m_size is not None:
- width, height = m_size.group(1), m_size.group(2)
+ width, height = map(int_or_none, (m_size.group(1), m_size.group(2)))
else:
width, height = None, None
formats.append({
+from __future__ import unicode_literals
+
import json
import re
import socket
_LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1'
_CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1'
_NETRC_MACHINE = 'facebook'
- IE_NAME = u'facebook'
+ IE_NAME = 'facebook'
_TEST = {
- u'url': u'https://www.facebook.com/photo.php?v=120708114770723',
- u'file': u'120708114770723.mp4',
- u'md5': u'48975a41ccc4b7a581abd68651c1a5a8',
- u'info_dict': {
- u"duration": 279,
- u"title": u"PEOPLE ARE AWESOME 2013"
+ 'url': 'https://www.facebook.com/photo.php?v=120708114770723',
+ 'md5': '48975a41ccc4b7a581abd68651c1a5a8',
+ 'info_dict': {
+ 'id': '120708114770723',
+ 'ext': 'mp4',
+ 'duration': 279,
+ 'title': 'PEOPLE ARE AWESOME 2013'
}
}
def report_login(self):
"""Report attempt to log in."""
- self.to_screen(u'Logging in')
+ self.to_screen('Logging in')
def _login(self):
(useremail, password) = self._get_login_info()
login_page_req.add_header('Cookie', 'locale=en_US')
self.report_login()
login_page = self._download_webpage(login_page_req, None, note=False,
- errnote=u'Unable to download login page')
- lsd = self._search_regex(r'"lsd":"(\w*?)"', login_page, u'lsd')
- lgnrnd = self._search_regex(r'name="lgnrnd" value="([^"]*?)"', login_page, u'lgnrnd')
+ errnote='Unable to download login page')
+ lsd = self._search_regex(
+ r'<input type="hidden" name="lsd" value="([^"]*)"',
+ login_page, 'lsd')
+ lgnrnd = self._search_regex(r'name="lgnrnd" value="([^"]*?)"', login_page, 'lgnrnd')
login_form = {
'email': useremail,
try:
login_results = compat_urllib_request.urlopen(request).read()
if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
- self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
+ self._downloader.report_warning('unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
return
check_form = {
- 'fb_dtsg': self._search_regex(r'"fb_dtsg":"(.*?)"', login_results, u'fb_dtsg'),
- 'nh': self._search_regex(r'name="nh" value="(\w*?)"', login_results, u'nh'),
+ 'fb_dtsg': self._search_regex(r'"fb_dtsg":"(.*?)"', login_results, 'fb_dtsg'),
+ 'nh': self._search_regex(r'name="nh" value="(\w*?)"', login_results, 'nh'),
'name_action_selected': 'dont_save',
- 'submit[Continue]': self._search_regex(r'<input value="(.*?)" name="submit\[Continue\]"', login_results, u'continue'),
+ 'submit[Continue]': self._search_regex(r'<input value="(.*?)" name="submit\[Continue\]"', login_results, 'continue'),
}
check_req = compat_urllib_request.Request(self._CHECKPOINT_URL, compat_urllib_parse.urlencode(check_form))
check_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
check_response = compat_urllib_request.urlopen(check_req).read()
if re.search(r'id="checkpointSubmitButton"', check_response) is not None:
- self._downloader.report_warning(u'Unable to confirm login, you have to login in your brower and authorize the login.')
+ self._downloader.report_warning('Unable to confirm login, you have to login in your browser and authorize the login.')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
+ self._downloader.report_warning('unable to log in: %s' % compat_str(err))
return
def _real_initialize(self):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
+ raise ExtractorError('Invalid URL: %s' % url)
video_id = mobj.group('id')
url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage)
if m_msg is not None:
raise ExtractorError(
- u'The video is not available, Facebook said: "%s"' % m_msg.group(1),
+ 'The video is not available, Facebook said: "%s"' % m_msg.group(1),
expected=True)
else:
- raise ExtractorError(u'Cannot parse data')
+ raise ExtractorError('Cannot parse data')
data = dict(json.loads(m.group(1)))
params_raw = compat_urllib_parse.unquote(data['params'])
params = json.loads(params_raw)
if not video_url:
video_url = video_data['sd_src']
if not video_url:
- raise ExtractorError(u'Cannot find video URL')
+ raise ExtractorError('Cannot find video URL')
video_duration = int(video_data['video_duration'])
thumbnail = video_data['thumbnail_src']
video_title = self._html_search_regex(
- r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, u'title')
+ r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, 'title')
info = {
'id': video_id,
unified_strdate,
str_to_int,
parse_duration,
+ clean_html,
)
-from youtube_dl.utils import clean_html
class FourTubeIE(InfoExtractor):
--- /dev/null
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_urllib_parse,
+ compat_urllib_request,
+)
+
+class GDCVaultIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?gdcvault\.com/play/(?P<id>\d+)/(?P<name>(\w|-)+)'
+ _TESTS = [
+ {
+ 'url': 'http://www.gdcvault.com/play/1019721/Doki-Doki-Universe-Sweet-Simple',
+ 'md5': '7ce8388f544c88b7ac11c7ab1b593704',
+ 'info_dict': {
+ 'id': '1019721',
+ 'ext': 'mp4',
+ 'title': 'Doki-Doki Universe: Sweet, Simple and Genuine (GDC Next 10)'
+ }
+ },
+ {
+ 'url': 'http://www.gdcvault.com/play/1015683/Embracing-the-Dark-Art-of',
+ 'info_dict': {
+ 'id': '1015683',
+ 'ext': 'flv',
+ 'title': 'Embracing the Dark Art of Mathematical Modeling in AI'
+ },
+ 'params': {
+ 'skip_download': True, # Requires rtmpdump
+ }
+ },
+ ]
+
+ def _parse_mp4(self, xml_description):
+ video_formats = []
+ mp4_video = xml_description.find('./metadata/mp4video')
+ if mp4_video is None:
+ return None
+
+ mobj = re.match(r'(?P<root>https?://.*?/).*', mp4_video.text)
+ video_root = mobj.group('root')
+ formats = xml_description.findall('./metadata/MBRVideos/MBRVideo')
+ for format in formats:
+ mobj = re.match(r'mp4\:(?P<path>.*)', format.find('streamName').text)
+ url = video_root + mobj.group('path')
+ vbr = format.find('bitrate').text
+ video_formats.append({
+ 'url': url,
+ 'vbr': int(vbr),
+ })
+ return video_formats
+
+ def _parse_flv(self, xml_description):
+ video_formats = []
+ akami_url = xml_description.find('./metadata/akamaiHost').text
+ slide_video_path = xml_description.find('./metadata/slideVideo').text
+ video_formats.append({
+ 'url': 'rtmp://' + akami_url + '/' + slide_video_path,
+ 'format_note': 'slide deck video',
+ 'quality': -2,
+ 'preference': -2,
+ 'format_id': 'slides',
+ })
+ speaker_video_path = xml_description.find('./metadata/speakerVideo').text
+ video_formats.append({
+ 'url': 'rtmp://' + akami_url + '/' + speaker_video_path,
+ 'format_note': 'speaker video',
+ 'quality': -1,
+ 'preference': -1,
+ 'format_id': 'speaker',
+ })
+ return video_formats
+
+ def _login(self, webpage_url, video_id):
+ (username, password) = self._get_login_info()
+ if username is None or password is None:
+ self.report_warning('It looks like ' + webpage_url + ' requires a login. Try specifying a username and password and try again.')
+ return None
+
+ mobj = re.match(r'(?P<root_url>https?://.*?/).*', webpage_url)
+ login_url = mobj.group('root_url') + 'api/login.php'
+ logout_url = mobj.group('root_url') + 'logout'
+
+ login_form = {
+ 'email': username,
+ 'password': password,
+ }
+
+ request = compat_urllib_request.Request(login_url, compat_urllib_parse.urlencode(login_form))
+ request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+ self._download_webpage(request, video_id, 'Logging in')
+ start_page = self._download_webpage(webpage_url, video_id, 'Getting authenticated video page')
+ self._download_webpage(logout_url, video_id, 'Logging out')
+
+ return start_page
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+
+ video_id = mobj.group('id')
+ webpage_url = 'http://www.gdcvault.com/play/' + video_id
+ start_page = self._download_webpage(webpage_url, video_id)
+
+ xml_root = self._html_search_regex(r'<iframe src="(?P<xml_root>.*?)player.html.*?".*?</iframe>', start_page, 'xml root', None, False)
+
+ if xml_root is None:
+ # Probably need to authenticate
+ start_page = self._login(webpage_url, video_id)
+ if start_page is None:
+ self.report_warning('Could not login.')
+ else:
+ # Grab the url from the authenticated page
+ xml_root = self._html_search_regex(r'<iframe src="(?P<xml_root>.*?)player.html.*?".*?</iframe>', start_page, 'xml root')
+
+ xml_name = self._html_search_regex(r'<iframe src=".*?\?xml=(?P<xml_file>.+?\.xml).*?".*?</iframe>', start_page, 'xml filename', None, False)
+ if xml_name is None:
+ # Fallback to the older format
+ xml_name = self._html_search_regex(r'<iframe src=".*?\?xmlURL=xml/(?P<xml_file>.+?\.xml).*?".*?</iframe>', start_page, 'xml filename')
+
+ xml_decription_url = xml_root + 'xml/' + xml_name
+ xml_description = self._download_xml(xml_decription_url, video_id)
+
+ video_title = xml_description.find('./metadata/title').text
+ video_formats = self._parse_mp4(xml_description)
+ if video_formats is None:
+ video_formats = self._parse_flv(xml_description)
+
+ return {
+ 'id': video_id,
+ 'title': video_title,
+ 'formats': video_formats,
+ }
# Direct link to a video
{
'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
- 'file': 'trailer.mp4',
'md5': '67d406c2bcb6af27fa886f31aa934bbe',
'info_dict': {
'id': 'trailer',
+ 'ext': 'mp4',
'title': 'trailer',
'upload_date': '20100513',
}
# ooyala video
{
'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
- 'file': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ.mp4',
'md5': '5644c6ca5d5782c1d0d350dad9bd840c',
'info_dict': {
'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
'title': '2cc213299525360.mov', # that's what we get
},
},
+ # google redirect
+ {
+ 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
+ 'info_dict': {
+ 'id': 'cmQHVoWB5FY',
+ 'ext': 'mp4',
+ 'upload_date': '20130224',
+ 'uploader_id': 'TheVerge',
+ 'description': 'Chris Ziegler takes a look at the Alcatel OneTouch Fire and the ZTE Open; two of the first Firefox OS handsets to be officially announced.',
+ 'uploader': 'The Verge',
+ 'title': 'First Firefox OS phones side-by-side',
+ },
+ 'params': {
+ 'skip_download': False,
+ }
+ },
# embed.ly video
{
'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
if mobj is not None:
return self.url_result(mobj.group(1), 'Mpora')
- # Look for embedded Novamov player
+ # Look for embedded NovaMov player
mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>http://(?:(?:embed|www)\.)?novamov\.com/embed\.php.+?)\1', webpage)
if mobj is not None:
- return self.url_result(mobj.group('url'), 'Novamov')
+ return self.url_result(mobj.group('url'), 'NovaMov')
+
+ # Look for embedded NowVideo player
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>http://(?:(?:embed|www)\.)?nowvideo\.(?:ch|sx|eu)/embed\.php.+?)\1', webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'), 'NowVideo')
# Look for embedded Facebook player
mobj = re.search(
if mobj is not None:
return self.url_result(mobj.group('url'), 'Facebook')
+ # Look for embedded VK player
+ mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'), 'VK')
+
# Look for embedded Huffington Post player
mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
if mobj is None:
# HTML5 video
mobj = re.search(r'<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage, flags=re.DOTALL)
+ if mobj is None:
+ mobj = re.search(
+ r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
+ r'(?:[a-z-]+="[^"]+"\s+)*?content="[0-9]{,2};url=\'([^\']+)\'"',
+ webpage)
+ if mobj:
+ new_url = mobj.group(1)
+ self.report_following_redirect(new_url)
+ return {
+ '_type': 'url',
+ 'url': new_url,
+ }
if mobj is None:
raise ExtractorError('Unsupported URL: %s' % url)
class IPrimaIE(InfoExtractor):
- _VALID_URL = r'https?://play\.iprima\.cz/(?P<videogroup>.+)/(?P<videoid>.+)'
+ _VALID_URL = r'https?://play\.iprima\.cz/[^?#]+/(?P<id>[^?#]+)'
_TESTS = [{
'url': 'http://play.iprima.cz/particka/particka-92',
'thumbnail': 'http://play.iprima.cz/sites/default/files/image_crops/image_620x349/3/491483_particka-92_image_620x349.jpg',
},
'params': {
- 'skip_download': True,
+ 'skip_download': True, # requires rtmpdump
},
- },
- ]
+ }, {
+ 'url': 'http://play.iprima.cz/particka/tchibo-particka-jarni-moda',
+ 'info_dict': {
+ 'id': '9718337',
+ 'ext': 'flv',
+ 'title': 'Tchibo Partička - Jarní móda',
+ 'description': 'md5:589f8f59f414220621ff8882eb3ce7be',
+ 'thumbnail': 're:^http:.*\.jpg$',
+ },
+ 'params': {
+ 'skip_download': True, # requires rtmpdump
+ },
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('videoid')
+ video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
- player_url = 'http://embed.livebox.cz/iprimaplay/player-embed-v2.js?__tok%s__=%s' % (
- floor(random()*1073741824),
- floor(random()*1073741824))
+ player_url = (
+ 'http://embed.livebox.cz/iprimaplay/player-embed-v2.js?__tok%s__=%s' %
+ (floor(random()*1073741824), floor(random()*1073741824))
+ )
req = compat_urllib_request.Request(player_url)
req.add_header('Referer', url)
base_url = ''.join(re.findall(r"embed\['stream'\] = '(.+?)'.+'(\?auth=)'.+'(.+?)';", playerpage)[1])
zoneGEO = self._html_search_regex(r'"zoneGEO":(.+?),', webpage, 'zoneGEO')
-
if zoneGEO != '0':
- base_url = base_url.replace('token', 'token_'+zoneGEO)
+ base_url = base_url.replace('token', 'token_' + zoneGEO)
formats = []
for format_id in ['lq', 'hq', 'hd']:
- filename = self._html_search_regex(r'"%s_id":(.+?),' % format_id, webpage, 'filename')
+ filename = self._html_search_regex(
+ r'"%s_id":(.+?),' % format_id, webpage, 'filename')
if filename == 'null':
continue
- real_id = self._search_regex(r'Prima-[0-9]{10}-([0-9]+)_', filename, 'real video id')
+ real_id = self._search_regex(
+ r'Prima-(?:[0-9]{10}|WEB)-([0-9]+)[-_]',
+ filename, 'real video id')
if format_id == 'lq':
quality = 0
quality = 1
elif format_id == 'hd':
quality = 2
- filename = 'hq/'+filename
+ filename = 'hq/' + filename
formats.append({
'format_id': format_id,
'url': base_url,
'quality': quality,
- 'play_path': 'mp4:'+filename.replace('"', '')[:-4],
+ 'play_path': 'mp4:' + filename.replace('"', '')[:-4],
'rtmp_live': True,
'ext': 'flv',
})
from .common import InfoExtractor
from ..utils import (
int_or_none,
- unified_strdate
+ unified_strdate,
+ ExtractorError,
)
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
- webpage = self._download_webpage('http://lifenews.ru/mobile/news/%s' % video_id, video_id, 'Downloading page')
+ webpage = self._download_webpage('http://lifenews.ru/news/%s' % video_id, video_id, 'Downloading page')
- video_url = self._html_search_regex(
- r'<video.*?src="([^"]+)".*?></video>', webpage, 'video URL')
-
- thumbnail = self._html_search_regex(
- r'<video.*?poster="([^"]+)".*?"></video>', webpage, 'video thumbnail')
+ videos = re.findall(r'<video.*?poster="(?P<poster>[^"]+)".*?src="(?P<video>[^"]+)".*?></video>', webpage)
+ if not videos:
+ raise ExtractorError('No media links available for %s' % video_id)
title = self._og_search_title(webpage)
TITLE_SUFFIX = ' - Первый по срочным новостям — LIFE | NEWS'
view_count = self._html_search_regex(
r'<div class=\'views\'>(\d+)</div>', webpage, 'view count', fatal=False)
comment_count = self._html_search_regex(
- r'<div class=\'comments\'>(\d+)</div>', webpage, 'comment count', fatal=False)
+ r'<div class=\'comments\'>\s*<span class=\'counter\'>(\d+)</span>', webpage, 'comment count', fatal=False)
upload_date = self._html_search_regex(
r'<time datetime=\'([^\']+)\'>', webpage, 'upload date',fatal=False)
if upload_date is not None:
upload_date = unified_strdate(upload_date)
- return {
- 'id': video_id,
- 'url': video_url,
- 'thumbnail': thumbnail,
- 'title': title,
- 'description': description,
- 'view_count': int_or_none(view_count),
- 'comment_count': int_or_none(comment_count),
- 'upload_date': upload_date,
- }
\ No newline at end of file
+ def make_entry(video_id, media, video_number=None):
+ return {
+ 'id': video_id,
+ 'url': media[1],
+ 'thumbnail': media[0],
+ 'title': title if video_number is None else '%s-video%s' % (title, video_number),
+ 'description': description,
+ 'view_count': int_or_none(view_count),
+ 'comment_count': int_or_none(comment_count),
+ 'upload_date': upload_date,
+ }
+
+ if len(videos) == 1:
+ return make_entry(video_id, videos[0])
+ else:
+ return [make_entry(video_id, media, video_number+1) for video_number, media in enumerate(videos)]
\ No newline at end of file
--- /dev/null
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+import datetime
+
+from .common import InfoExtractor
+
+
+class MailRuIE(InfoExtractor):
+ IE_NAME = 'mailru'
+ IE_DESC = 'Видео@Mail.Ru'
+ _VALID_URL = r'http://(?:www\.)?my\.mail\.ru/video/.*#video=/?(?P<id>[^/]+/[^/]+/[^/]+/\d+)'
+
+ _TEST = {
+ 'url': 'http://my.mail.ru/video/top#video=/mail/sonypicturesrus/75/76',
+ 'md5': 'dea205f03120046894db4ebb6159879a',
+ 'info_dict': {
+ 'id': '46301138',
+ 'ext': 'mp4',
+ 'title': 'Новый Человек-Паук. Высокое напряжение. Восстание Электро',
+ 'upload_date': '20140224',
+ 'uploader': 'sonypicturesrus',
+ 'uploader_id': 'sonypicturesrus@mail.ru',
+ 'duration': 184,
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ video_data = self._download_json(
+ 'http://videoapi.my.mail.ru/videos/%s.json?new=1' % video_id, video_id, 'Downloading video JSON')
+
+ author = video_data['author']
+ uploader = author['name']
+ uploader_id = author['id']
+
+ movie = video_data['movie']
+ content_id = str(movie['contentId'])
+ title = movie['title']
+ thumbnail = movie['poster']
+ duration = movie['duration']
+
+ upload_date = datetime.datetime.fromtimestamp(video_data['timestamp']).strftime('%Y%m%d')
+ view_count = video_data['views_count']
+
+ formats = [
+ {
+ 'url': video['url'],
+ 'format_id': video['name'],
+ } for video in video_data['videos']
+ ]
+
+ return {
+ 'id': content_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'upload_date': upload_date,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'duration': duration,
+ 'view_count': view_count,
+ 'formats': formats,
+ }
\ No newline at end of file
video_title = self._html_search_regex(r'(?im)<title>(.*) - Video</title>', webpage, u'title')
description = self._og_search_description(webpage)
+ thumbnail = self._og_search_thumbnail(webpage)
video_uploader = self._html_search_regex(
r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);',
webpage, u'uploader nickname', fatal=False)
'uploader': video_uploader,
'upload_date': None,
'title': video_title,
+ 'thumbnail': thumbnail,
'ext': video_ext,
'age_limit': age_limit,
}
+from __future__ import unicode_literals
+
import re
import json
from .common import InfoExtractor
+from .youtube import YoutubeIE
from ..utils import (
+ compat_urlparse,
clean_html,
+ ExtractorError,
get_element_by_id,
)
class TechTVMITIE(InfoExtractor):
- IE_NAME = u'techtv.mit.edu'
+ IE_NAME = 'techtv.mit.edu'
_VALID_URL = r'https?://techtv\.mit\.edu/(videos|embeds)/(?P<id>\d+)'
_TEST = {
- u'url': u'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set',
- u'file': u'25418.mp4',
- u'md5': u'1f8cb3e170d41fd74add04d3c9330e5f',
- u'info_dict': {
- u'title': u'MIT DNA Learning Center Set',
- u'description': u'md5:82313335e8a8a3f243351ba55bc1b474',
+ 'url': 'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set',
+ 'md5': '1f8cb3e170d41fd74add04d3c9330e5f',
+ 'info_dict': {
+ 'id': '25418',
+ 'ext': 'mp4',
+ 'title': 'MIT DNA Learning Center Set',
+ 'description': 'md5:82313335e8a8a3f243351ba55bc1b474',
},
}
video_id = mobj.group('id')
raw_page = self._download_webpage(
'http://techtv.mit.edu/videos/%s' % video_id, video_id)
- clean_page = re.compile(u'<!--.*?-->', re.S).sub(u'', raw_page)
+ clean_page = re.compile(r'<!--.*?-->', re.S).sub('', raw_page)
- base_url = self._search_regex(r'ipadUrl: \'(.+?cloudfront.net/)',
- raw_page, u'base url')
- formats_json = self._search_regex(r'bitrates: (\[.+?\])', raw_page,
- u'video formats')
+ base_url = self._search_regex(
+ r'ipadUrl: \'(.+?cloudfront.net/)', raw_page, 'base url')
+ formats_json = self._search_regex(
+ r'bitrates: (\[.+?\])', raw_page, 'video formats')
formats_mit = json.loads(formats_json)
formats = [
{
title = get_element_by_id('edit-title', clean_page)
description = clean_html(get_element_by_id('edit-description', clean_page))
- thumbnail = self._search_regex(r'playlist:.*?url: \'(.+?)\'',
- raw_page, u'thumbnail', flags=re.DOTALL)
+ thumbnail = self._search_regex(
+ r'playlist:.*?url: \'(.+?)\'',
+ raw_page, 'thumbnail', flags=re.DOTALL)
- return {'id': video_id,
- 'title': title,
- 'formats': formats,
- 'description': description,
- 'thumbnail': thumbnail,
- }
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ }
class MITIE(TechTVMITIE):
- IE_NAME = u'video.mit.edu'
+ IE_NAME = 'video.mit.edu'
_VALID_URL = r'https?://video\.mit\.edu/watch/(?P<title>[^/]+)'
_TEST = {
- u'url': u'http://video.mit.edu/watch/the-government-is-profiling-you-13222/',
- u'file': u'21783.mp4',
- u'md5': u'7db01d5ccc1895fc5010e9c9e13648da',
- u'info_dict': {
- u'title': u'The Government is Profiling You',
- u'description': u'md5:ad5795fe1e1623b73620dbfd47df9afd',
+ 'url': 'http://video.mit.edu/watch/the-government-is-profiling-you-13222/',
+ 'md5': '7db01d5ccc1895fc5010e9c9e13648da',
+ 'info_dict': {
+ 'id': '21783',
+ 'ext': 'mp4',
+ 'title': 'The Government is Profiling You',
+ 'description': 'md5:ad5795fe1e1623b73620dbfd47df9afd',
},
}
mobj = re.match(self._VALID_URL, url)
page_title = mobj.group('title')
webpage = self._download_webpage(url, page_title)
- self.to_screen('%s: Extracting %s url' % (page_title, TechTVMITIE.IE_NAME))
- embed_url = self._search_regex(r'<iframe .*?src="(.+?)"', webpage,
- u'embed url')
+ embed_url = self._search_regex(
+ r'<iframe .*?src="(.+?)"', webpage, 'embed url')
return self.url_result(embed_url, ie='TechTVMIT')
+
+
+class OCWMITIE(InfoExtractor):
+ IE_NAME = 'ocw.mit.edu'
+ _VALID_URL = r'^http://ocw\.mit\.edu/courses/(?P<topic>[a-z0-9\-]+)'
+ _BASE_URL = 'http://ocw.mit.edu/'
+
+ _TESTS = [
+ {
+ 'url': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/',
+ 'info_dict': {
+ 'id': 'EObHWIEKGjA',
+ 'ext': 'mp4',
+ 'title': 'Lecture 7: Multiple Discrete Random Variables: Expectations, Conditioning, Independence',
+ 'description': 'In this lecture, the professor discussed multiple random variables, expectations, and binomial distribution.',
+ #'subtitles': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/MIT6_041F11_lec07_300k.mp4.srt'
+ }
+ },
+ {
+ 'url': 'http://ocw.mit.edu/courses/mathematics/18-01sc-single-variable-calculus-fall-2010/1.-differentiation/part-a-definition-and-basic-rules/session-1-introduction-to-derivatives/',
+ 'info_dict': {
+ 'id': '7K1sB05pE0A',
+ 'ext': 'mp4',
+ 'title': 'Session 1: Introduction to Derivatives',
+ 'description': 'This section contains lecture video excerpts, lecture notes, an interactive mathlet with supporting documents, and problem solving videos.',
+ #'subtitles': 'http://ocw.mit.edu//courses/mathematics/18-01sc-single-variable-calculus-fall-2010/ocw-18.01-f07-lec01_300k.SRT'
+ }
+ }
+ ]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ topic = mobj.group('topic')
+
+ webpage = self._download_webpage(url, topic)
+ title = self._html_search_meta('WT.cg_s', webpage)
+ description = self._html_search_meta('Description', webpage)
+
+ # search for call to ocw_embed_chapter_media(container_id, media_url, provider, page_url, image_url, start, stop, captions_file)
+ embed_chapter_media = re.search(r'ocw_embed_chapter_media\((.+?)\)', webpage)
+ if embed_chapter_media:
+ metadata = re.sub(r'[\'"]', '', embed_chapter_media.group(1))
+ metadata = re.split(r', ?', metadata)
+ yt = metadata[1]
+ subs = compat_urlparse.urljoin(self._BASE_URL, metadata[7])
+ else:
+ # search for call to ocw_embed_chapter_media(container_id, media_url, provider, page_url, image_url, captions_file)
+ embed_media = re.search(r'ocw_embed_media\((.+?)\)', webpage)
+ if embed_media:
+ metadata = re.sub(r'[\'"]', '', embed_media.group(1))
+ metadata = re.split(r', ?', metadata)
+ yt = metadata[1]
+ subs = compat_urlparse.urljoin(self._BASE_URL, metadata[5])
+ else:
+ raise ExtractorError('Unable to find embedded YouTube video.')
+ video_id = YoutubeIE.extract_id(yt)
+
+ return {
+ '_type': 'url_transparent',
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'url': yt,
+ 'subtitles': subs,
+ 'ie_key': 'Youtube',
+ }
from .common import InfoExtractor
from ..utils import (
unified_strdate,
+ compat_urllib_parse,
ExtractorError,
)
class MixcloudIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
+ _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([^/]+)/([^/]+)'
IE_NAME = 'mixcloud'
_TEST = {
'url': 'http://www.mixcloud.com/dholbach/cryptkeeper/',
- 'file': 'dholbach-cryptkeeper.mp3',
'info_dict': {
+ 'id': 'dholbach-cryptkeeper',
+ 'ext': 'mp3',
'title': 'Cryptkeeper',
'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.',
'uploader': 'Daniel Holbach',
mobj = re.match(self._VALID_URL, url)
uploader = mobj.group(1)
cloudcast_name = mobj.group(2)
- track_id = '-'.join((uploader, cloudcast_name))
+ track_id = compat_urllib_parse.unquote('-'.join((uploader, cloudcast_name)))
webpage = self._download_webpage(url, track_id)
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
from ..utils import find_xpath_attr, compat_str
+class NBCIE(InfoExtractor):
+ _VALID_URL = r'http://www\.nbc\.com/[^/]+/video/[^/]+/(?P<id>n?\d+)'
+
+ _TEST = {
+ 'url': 'http://www.nbc.com/chicago-fire/video/i-am-a-firefighter/2734188',
+ 'md5': '54d0fbc33e0b853a65d7b4de5c06d64e',
+ 'info_dict': {
+ 'id': 'u1RInQZRN7QJ',
+ 'ext': 'flv',
+ 'title': 'I Am a Firefighter',
+ 'description': 'An emergency puts Dawson\'sf irefighter skills to the ultimate test in this four-part digital series.',
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ webpage = self._download_webpage(url, video_id)
+ theplatform_url = self._search_regex('class="video-player video-player-full" data-mpx-url="(.*?)"', webpage, 'theplatform url')
+ if theplatform_url.startswith('//'):
+ theplatform_url = 'http:' + theplatform_url
+ return self.url_result(theplatform_url)
+
+
class NBCNewsIE(InfoExtractor):
_VALID_URL = r'https?://www\.nbcnews\.com/video/.+?/(?P<id>\d+)'
_TEST = {
- u'url': u'http://www.nbcnews.com/video/nbc-news/52753292',
- u'file': u'52753292.flv',
- u'md5': u'47abaac93c6eaf9ad37ee6c4463a5179',
- u'info_dict': {
- u'title': u'Crew emerges after four-month Mars food study',
- u'description': u'md5:24e632ffac72b35f8b67a12d1b6ddfc1',
+ 'url': 'http://www.nbcnews.com/video/nbc-news/52753292',
+ 'md5': '47abaac93c6eaf9ad37ee6c4463a5179',
+ 'info_dict': {
+ 'id': '52753292',
+ 'ext': 'flv',
+ 'title': 'Crew emerges after four-month Mars food study',
+ 'description': 'md5:24e632ffac72b35f8b67a12d1b6ddfc1',
},
}
all_info = self._download_xml('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id)
info = all_info.find('video')
- return {'id': video_id,
- 'title': info.find('headline').text,
- 'ext': 'flv',
- 'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text,
- 'description': compat_str(info.find('caption').text),
- 'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text,
- }
+ return {
+ 'id': video_id,
+ 'title': info.find('headline').text,
+ 'ext': 'flv',
+ 'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text,
+ 'description': compat_str(info.find('caption').text),
+ 'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text,
+ }
+# encoding: utf-8
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
from ..utils import (
- ExtractorError,
unified_strdate,
)
+
class NormalbootsIE(InfoExtractor):
- _VALID_URL = r'(?:http://)?(?:www\.)?normalboots\.com/video/(?P<videoid>[0-9a-z-]*)/?$'
+ _VALID_URL = r'http://(?:www\.)?normalboots\.com/video/(?P<videoid>[0-9a-z-]*)/?$'
_TEST = {
- u'url': u'http://normalboots.com/video/home-alone-games-jontron/',
- u'file': u'home-alone-games-jontron.mp4',
- u'md5': u'8bf6de238915dd501105b44ef5f1e0f6',
- u'info_dict': {
- u'title': u'Home Alone Games - JonTron - NormalBoots',
- u'description': u'Jon is late for Christmas. Typical. Thanks to: Paul Ritchey for Co-Writing/Filming: http://www.youtube.com/user/ContinueShow Michael Azzi for Christmas Intro Animation: http://michafrar.tumblr.com/ Jerrod Waters for Christmas Intro Music: http://www.youtube.com/user/xXJerryTerryXx Casey Ormond for \u2018Tense Battle Theme\u2019:\xa0http://www.youtube.com/Kiamet/',
- u'uploader': u'JonTron',
- u'upload_date': u'20140125',
+ 'url': 'http://normalboots.com/video/home-alone-games-jontron/',
+ 'md5': '8bf6de238915dd501105b44ef5f1e0f6',
+ 'info_dict': {
+ 'id': 'home-alone-games-jontron',
+ 'ext': 'mp4',
+ 'title': 'Home Alone Games - JonTron - NormalBoots',
+ 'description': 'Jon is late for Christmas. Typical. Thanks to: Paul Ritchey for Co-Writing/Filming: http://www.youtube.com/user/ContinueShow Michael Azzi for Christmas Intro Animation: http://michafrar.tumblr.com/ Jerrod Waters for Christmas Intro Music: http://www.youtube.com/user/xXJerryTerryXx Casey Ormond for ‘Tense Battle Theme’:\xa0http://www.youtube.com/Kiamet/',
+ 'uploader': 'JonTron',
+ 'upload_date': '20140125',
}
}
-
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
video_id = mobj.group('videoid')
-
- info = {
- 'id': video_id,
- 'uploader': None,
- 'upload_date': None,
- }
-
- if url[:4] != 'http':
- url = 'http://' + url
-
+
webpage = self._download_webpage(url, video_id)
- video_title = self._og_search_title(webpage)
- video_description = self._og_search_description(webpage)
- video_thumbnail = self._og_search_thumbnail(webpage)
video_uploader = self._html_search_regex(r'Posted\sby\s<a\shref="[A-Za-z0-9/]*">(?P<uploader>[A-Za-z]*)\s</a>',
webpage, 'uploader')
- raw_upload_date = self._html_search_regex('<span style="text-transform:uppercase; font-size:inherit;">[A-Za-z]+, (?P<date>.*)</span>',
+ raw_upload_date = self._html_search_regex('<span style="text-transform:uppercase; font-size:inherit;">[A-Za-z]+, (?P<date>.*)</span>',
webpage, 'date')
video_upload_date = unified_strdate(raw_upload_date)
- video_upload_date = unified_strdate(raw_upload_date)
-
+
player_url = self._html_search_regex(r'<iframe\swidth="[0-9]+"\sheight="[0-9]+"\ssrc="(?P<url>[\S]+)"', webpage, 'url')
player_page = self._download_webpage(player_url, video_id)
- video_url = u'http://player.screenwavemedia.com/' + self._html_search_regex(r"'file':\s'(?P<file>[0-9A-Za-z-_\.]+)'", player_page, 'file')
-
- info['url'] = video_url
- info['title'] = video_title
- info['description'] = video_description
- info['thumbnail'] = video_thumbnail
- info['uploader'] = video_uploader
- info['upload_date'] = video_upload_date
-
- return info
+ video_url = self._html_search_regex(r"file:\s'(?P<file>[^']+\.mp4)'", player_page, 'file')
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': self._og_search_title(webpage),
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'uploader': video_uploader,
+ 'upload_date': video_upload_date,
+ }
)
-class NovamovIE(InfoExtractor):
- _VALID_URL = r'http://(?:(?:www\.)?novamov\.com/video/|(?:(?:embed|www)\.)novamov\.com/embed\.php\?v=)(?P<videoid>[a-z\d]{13})'
+class NovaMovIE(InfoExtractor):
+ IE_NAME = 'novamov'
+ IE_DESC = 'NovaMov'
+
+ _VALID_URL = r'http://(?:(?:www\.)?%(host)s/video/|(?:(?:embed|www)\.)%(host)s/embed\.php\?(?:.*?&)?v=)(?P<videoid>[a-z\d]{13})' % {'host': 'novamov\.com'}
+
+ _HOST = 'www.novamov.com'
+
+ _FILE_DELETED_REGEX = r'This file no longer exists on our servers!</h2>'
+ _FILEKEY_REGEX = r'flashvars\.filekey="(?P<filekey>[^"]+)";'
+ _TITLE_REGEX = r'(?s)<div class="v_tab blockborder rounded5" id="v_tab1">\s*<h3>([^<]+)</h3>'
+ _DESCRIPTION_REGEX = r'(?s)<div class="v_tab blockborder rounded5" id="v_tab1">\s*<h3>[^<]+</h3><p>([^<]+)</p>'
_TEST = {
'url': 'http://www.novamov.com/video/4rurhn9x446jj',
- 'file': '4rurhn9x446jj.flv',
'md5': '7205f346a52bbeba427603ba10d4b935',
'info_dict': {
+ 'id': '4rurhn9x446jj',
+ 'ext': 'flv',
'title': 'search engine optimization',
'description': 'search engine optimization is used to rank the web page in the google search engine'
},
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('videoid')
- page = self._download_webpage('http://www.novamov.com/video/%s' % video_id,
- video_id, 'Downloading video page')
+ page = self._download_webpage(
+ 'http://%s/video/%s' % (self._HOST, video_id), video_id, 'Downloading video page')
- if re.search(r'This file no longer exists on our servers!</h2>', page) is not None:
+ if re.search(self._FILE_DELETED_REGEX, page) is not None:
raise ExtractorError(u'Video %s does not exist' % video_id, expected=True)
- filekey = self._search_regex(
- r'flashvars\.filekey="(?P<filekey>[^"]+)";', page, 'filekey')
+ filekey = self._search_regex(self._FILEKEY_REGEX, page, 'filekey')
- title = self._html_search_regex(
- r'(?s)<div class="v_tab blockborder rounded5" id="v_tab1">\s*<h3>([^<]+)</h3>',
- page, 'title', fatal=False)
+ title = self._html_search_regex(self._TITLE_REGEX, page, 'title', fatal=False)
- description = self._html_search_regex(
- r'(?s)<div class="v_tab blockborder rounded5" id="v_tab1">\s*<h3>[^<]+</h3><p>([^<]+)</p>',
- page, 'description', fatal=False)
+ description = self._html_search_regex(self._DESCRIPTION_REGEX, page, 'description', default='', fatal=False)
api_response = self._download_webpage(
- 'http://www.novamov.com/api/player.api.php?key=%s&file=%s' % (filekey, video_id),
- video_id, 'Downloading video api response')
+ 'http://%s/api/player.api.php?key=%s&file=%s' % (self._HOST, filekey, video_id), video_id,
+ 'Downloading video api response')
response = compat_urlparse.parse_qs(api_response)
if 'error_msg' in response:
- raise ExtractorError('novamov returned error: %s' % response['error_msg'][0], expected=True)
+ raise ExtractorError('%s returned error: %s' % (self.IE_NAME, response['error_msg'][0]), expected=True)
video_url = response['url'][0]
'url': video_url,
'title': title,
'description': description
- }
+ }
\ No newline at end of file
-import re
+from __future__ import unicode_literals
-from .common import InfoExtractor
-from ..utils import compat_urlparse
+from .novamov import NovaMovIE
-class NowVideoIE(InfoExtractor):
- _VALID_URL = r'(?:https?://)?(?:www\.)?nowvideo\.(?:ch|sx)/video/(?P<id>\w+)'
- _TEST = {
- u'url': u'http://www.nowvideo.ch/video/0mw0yow7b6dxa',
- u'file': u'0mw0yow7b6dxa.flv',
- u'md5': u'f8fbbc8add72bd95b7850c6a02fc8817',
- u'info_dict': {
- u"title": u"youtubedl test video _BaW_jenozKc.mp4"
- }
- }
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
-
- video_id = mobj.group('id')
- webpage_url = 'http://www.nowvideo.ch/video/' + video_id
- embed_url = 'http://embed.nowvideo.ch/embed.php?v=' + video_id
- webpage = self._download_webpage(webpage_url, video_id)
- embed_page = self._download_webpage(embed_url, video_id,
- u'Downloading embed page')
+class NowVideoIE(NovaMovIE):
+ IE_NAME = 'nowvideo'
+ IE_DESC = 'NowVideo'
- self.report_extraction(video_id)
+ _VALID_URL = r'http://(?:(?:www\.)?%(host)s/video/|(?:(?:embed|www)\.)%(host)s/embed\.php\?(?:.*?&)?v=)(?P<videoid>[a-z\d]{13})' % {'host': 'nowvideo\.(?:ch|sx|eu)'}
- video_title = self._html_search_regex(r'<h4>(.*)</h4>',
- webpage, u'video title')
+ _HOST = 'www.nowvideo.ch'
- video_key = self._search_regex(r'var fkzd="(.*)";',
- embed_page, u'video key')
+    _FILE_DELETED_REGEX = r'>This file no longer exists on our servers\.<'
+ _FILEKEY_REGEX = r'var fkzd="([^"]+)";'
+ _TITLE_REGEX = r'<h4>([^<]+)</h4>'
+ _DESCRIPTION_REGEX = r'</h4>\s*<p>([^<]+)</p>'
- api_call = "http://www.nowvideo.ch/api/player.api.php?file={0}&numOfErrors=0&cid=1&key={1}".format(video_id, video_key)
- api_response = self._download_webpage(api_call, video_id,
- u'Downloading API page')
- video_url = compat_urlparse.parse_qs(api_response)[u'url'][0]
-
- return [{
- 'id': video_id,
- 'url': video_url,
- 'ext': 'flv',
- 'title': video_title,
- }]
+ _TEST = {
+ 'url': 'http://www.nowvideo.ch/video/0mw0yow7b6dxa',
+ 'md5': 'f8fbbc8add72bd95b7850c6a02fc8817',
+ 'info_dict': {
+ 'id': '0mw0yow7b6dxa',
+ 'ext': 'flv',
+ 'title': 'youtubedl test video _BaW_jenozKc.mp4',
+ 'description': 'Description',
+ }
+ }
\ No newline at end of file
from ..utils import (
HEADRequest,
unified_strdate,
+ ExtractorError,
)
data_json = self._search_regex(
r'initializeAdworx\((.+?)\);\n', webpage, 'video info')
all_data = json.loads(data_json)
- sdata = all_data[0]['values']['segments']
+
+ def get_segments(all_data):
+ for data in all_data:
+ if data['name'] == 'Tracker::EPISODE_DETAIL_PAGE_OVER_PROGRAM':
+ return data['values']['segments']
+
+ sdata = get_segments(all_data)
+ if not sdata:
+ raise ExtractorError('Unable to extract segments')
def quality_to_int(s):
m = re.search('([0-9]+)', s)
+from __future__ import unicode_literals
+
import json
import re
from .common import InfoExtractor
+from ..utils import int_or_none
class PodomaticIE(InfoExtractor):
_VALID_URL = r'^(?P<proto>https?)://(?P<channel>[^.]+)\.podomatic\.com/entry/(?P<id>[^?]+)'
_TEST = {
- u"url": u"http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00",
- u"file": u"2009-01-02T16_03_35-08_00.mp3",
- u"md5": u"84bb855fcf3429e6bf72460e1eed782d",
- u"info_dict": {
- u"uploader": u"Science Teaching Tips",
- u"uploader_id": u"scienceteachingtips",
- u"title": u"64. When the Moon Hits Your Eye",
- u"duration": 446,
+ "url": "http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00",
+ "file": "2009-01-02T16_03_35-08_00.mp3",
+ "md5": "84bb855fcf3429e6bf72460e1eed782d",
+ "info_dict": {
+ "uploader": "Science Teaching Tips",
+ "uploader_id": "scienceteachingtips",
+ "title": "64. When the Moon Hits Your Eye",
+ "duration": 446,
}
}
uploader = data['podcast']
title = data['title']
thumbnail = data['imageLocation']
- duration = int(data['length'] / 1000.0)
+ duration = int_or_none(data.get('length'), 1000)
return {
'id': video_id,
--- /dev/null
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from hashlib import sha1
+from .common import InfoExtractor
+from ..utils import (
+ compat_urllib_parse,
+ unified_strdate,
+ clean_html,
+ RegexNotFoundError,
+)
+
+
+class ProSiebenSat1IE(InfoExtractor):
+ IE_NAME = 'prosiebensat1'
+ IE_DESC = 'ProSiebenSat.1 Digital'
+ _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|ran|the-voice-of-germany)\.de|fem\.com)/(?P<id>.+)'
+
+ _TESTS = [
+ {
+ 'url': 'http://www.prosieben.de/tv/circus-halligalli/videos/218-staffel-2-episode-18-jahresrueckblick-ganze-folge',
+ 'info_dict': {
+ 'id': '2104602',
+ 'ext': 'mp4',
+ 'title': 'Staffel 2, Episode 18 - Jahresrückblick',
+ 'description': 'md5:8733c81b702ea472e069bc48bb658fc1',
+ 'upload_date': '20131231',
+ 'duration': 5845.04,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www.prosieben.de/videokatalog/Gesellschaft/Leben/Trends/video-Lady-Umstyling-f%C3%BCr-Audrina-Rebekka-Audrina-Fergen-billig-aussehen-Battal-Modica-700544.html',
+ 'info_dict': {
+ 'id': '2570327',
+ 'ext': 'mp4',
+ 'title': 'Lady-Umstyling für Audrina',
+ 'description': 'md5:4c16d0c17a3461a0d43ea4084e96319d',
+ 'upload_date': '20131014',
+ 'duration': 606.76,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ 'skip': 'Seems to be broken',
+ },
+ {
+ 'url': 'http://www.prosiebenmaxx.de/tv/experience/video/144-countdown-fuer-die-autowerkstatt-ganze-folge',
+ 'info_dict': {
+ 'id': '2429369',
+ 'ext': 'mp4',
+ 'title': 'Countdown für die Autowerkstatt',
+ 'description': 'md5:809fc051a457b5d8666013bc40698817',
+ 'upload_date': '20140223',
+ 'duration': 2595.04,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www.sixx.de/stars-style/video/sexy-laufen-in-ugg-boots-clip',
+ 'info_dict': {
+ 'id': '2904997',
+ 'ext': 'mp4',
+ 'title': 'Sexy laufen in Ugg Boots',
+ 'description': 'md5:edf42b8bd5bc4e5da4db4222c5acb7d6',
+ 'upload_date': '20140122',
+ 'duration': 245.32,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www.sat1.de/film/der-ruecktritt/video/im-interview-kai-wiesinger-clip',
+ 'info_dict': {
+ 'id': '2906572',
+ 'ext': 'mp4',
+ 'title': 'Im Interview: Kai Wiesinger',
+ 'description': 'md5:e4e5370652ec63b95023e914190b4eb9',
+ 'upload_date': '20140225',
+ 'duration': 522.56,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www.kabeleins.de/tv/rosins-restaurants/videos/jagd-auf-fertigkost-im-elsthal-teil-2-ganze-folge',
+ 'info_dict': {
+ 'id': '2992323',
+ 'ext': 'mp4',
+ 'title': 'Jagd auf Fertigkost im Elsthal - Teil 2',
+ 'description': 'md5:2669cde3febe9bce13904f701e774eb6',
+ 'upload_date': '20140225',
+ 'duration': 2410.44,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www.ran.de/fussball/bundesliga/video/schalke-toennies-moechte-raul-zurueck-ganze-folge',
+ 'info_dict': {
+ 'id': '3004256',
+ 'ext': 'mp4',
+ 'title': 'Schalke: Tönnies möchte Raul zurück',
+ 'description': 'md5:4b5b271d9bcde223b54390754c8ece3f',
+ 'upload_date': '20140226',
+ 'duration': 228.96,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www.the-voice-of-germany.de/video/31-andreas-kuemmert-rocket-man-clip',
+ 'info_dict': {
+ 'id': '2572814',
+ 'ext': 'mp4',
+ 'title': 'Andreas Kümmert: Rocket Man',
+ 'description': 'md5:6ddb02b0781c6adf778afea606652e38',
+ 'upload_date': '20131017',
+ 'duration': 469.88,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www.fem.com/wellness/videos/wellness-video-clip-kurztripps-zum-valentinstag.html',
+ 'info_dict': {
+ 'id': '2156342',
+ 'ext': 'mp4',
+ 'title': 'Kurztrips zum Valentinstag',
+ 'description': 'md5:8ba6301e70351ae0bedf8da00f7ba528',
+ 'upload_date': '20130206',
+ 'duration': 307.24,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ ]
+
+ _CLIPID_REGEXES = [
+ r'"clip_id"\s*:\s+"(\d+)"',
+ r'clipid: "(\d+)"',
+ ]
+ _TITLE_REGEXES = [
+ r'<h2 class="subtitle" itemprop="name">\s*(.+?)</h2>',
+ r'<header class="clearfix">\s*<h3>(.+?)</h3>',
+ r'<!-- start video -->\s*<h1>(.+?)</h1>',
+ r'<div class="ep-femvideos-pi4-video-txt">\s*<h2>(.+?)</h2>',
+ ]
+ _DESCRIPTION_REGEXES = [
+ r'<p itemprop="description">\s*(.+?)</p>',
+ r'<div class="videoDecription">\s*<p><strong>Beschreibung</strong>: (.+?)</p>',
+ r'<div class="g-plusone" data-size="medium"></div>\s*</div>\s*</header>\s*(.+?)\s*<footer>',
+ r'<p>(.+?)</p>\s*<div class="ep-femvideos-pi4-video-footer">',
+ ]
+ _UPLOAD_DATE_REGEXES = [
+ r'<meta property="og:published_time" content="(.+?)">',
+ r'<span>\s*(\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}) \|\s*<span itemprop="duration"',
+ r'<footer>\s*(\d{2}\.\d{2}\.\d{4}) \d{2}:\d{2} Uhr',
+ r'<span style="padding-left: 4px;line-height:20px; color:#404040">(\d{2}\.\d{2}\.\d{4})</span>',
+ r'(\d{2}\.\d{2}\.\d{4}) \| \d{2}:\d{2} Min<br/>',
+ ]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ page = self._download_webpage(url, video_id, 'Downloading page')
+
+ def extract(patterns, name, page, fatal=False):
+ for pattern in patterns:
+ mobj = re.search(pattern, page)
+ if mobj:
+ return clean_html(mobj.group(1))
+ if fatal:
+            raise RegexNotFoundError('Unable to extract %s' % name)
+ return None
+
+ clip_id = extract(self._CLIPID_REGEXES, 'clip id', page, fatal=True)
+
+ access_token = 'testclient'
+ client_name = 'kolibri-1.2.5'
+ client_location = url
+
+ videos_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos?%s' % compat_urllib_parse.urlencode({
+ 'access_token': access_token,
+ 'client_location': client_location,
+ 'client_name': client_name,
+ 'ids': clip_id,
+ })
+
+ videos = self._download_json(videos_api_url, clip_id, 'Downloading videos JSON')
+
+ duration = float(videos[0]['duration'])
+ source_ids = [source['id'] for source in videos[0]['sources']]
+ source_ids_str = ','.join(map(str, source_ids))
+
+ g = '01!8d8F_)r9]4s[qeuXfP%'
+
+ client_id = g[:2] + sha1(''.join([clip_id, g, access_token, client_location, g, client_name])
+ .encode('utf-8')).hexdigest()
+
+ sources_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources?%s' % (clip_id, compat_urllib_parse.urlencode({
+ 'access_token': access_token,
+ 'client_id': client_id,
+ 'client_location': client_location,
+ 'client_name': client_name,
+ }))
+
+ sources = self._download_json(sources_api_url, clip_id, 'Downloading sources JSON')
+ server_id = sources['server_id']
+
+ client_id = g[:2] + sha1(''.join([g, clip_id, access_token, server_id,
+ client_location, source_ids_str, g, client_name])
+ .encode('utf-8')).hexdigest()
+
+ url_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url?%s' % (clip_id, compat_urllib_parse.urlencode({
+ 'access_token': access_token,
+ 'client_id': client_id,
+ 'client_location': client_location,
+ 'client_name': client_name,
+ 'server_id': server_id,
+ 'source_ids': source_ids_str,
+ }))
+
+ urls = self._download_json(url_api_url, clip_id, 'Downloading urls JSON')
+
+ title = extract(self._TITLE_REGEXES, 'title', page, fatal=True)
+ description = extract(self._DESCRIPTION_REGEXES, 'description', page)
+ thumbnail = self._og_search_thumbnail(page)
+
+ upload_date = extract(self._UPLOAD_DATE_REGEXES, 'upload date', page)
+ if upload_date:
+ upload_date = unified_strdate(upload_date)
+
+ formats = []
+
+ urls_sources = urls['sources']
+ if isinstance(urls_sources, dict):
+ urls_sources = urls_sources.values()
+
+ def fix_bitrate(bitrate):
+ return bitrate / 1000 if bitrate % 1000 == 0 else bitrate
+
+ for source in urls_sources:
+ protocol = source['protocol']
+ if protocol == 'rtmp' or protocol == 'rtmpe':
+ mobj = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', source['url'])
+ if not mobj:
+ continue
+ formats.append({
+ 'url': mobj.group('url'),
+ 'app': mobj.group('app'),
+ 'play_path': mobj.group('playpath'),
+ 'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf',
+ 'page_url': 'http://www.prosieben.de',
+ 'vbr': fix_bitrate(source['bitrate']),
+ 'ext': 'mp4',
+ 'format_id': '%s_%s' % (source['cdn'], source['bitrate']),
+ })
+ else:
+ formats.append({
+ 'url': source['url'],
+ 'vbr': fix_bitrate(source['bitrate']),
+ })
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': clip_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'upload_date': upload_date,
+ 'duration': duration,
+ 'formats': formats,
+ }
\ No newline at end of file
# encoding: utf-8
-
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
- clean_html,
ExtractorError,
+ clean_html,
+ unified_strdate,
+ int_or_none,
)
class RTLnowIE(InfoExtractor):
"""Information Extractor for RTL NOW, RTL2 NOW, RTL NITRO, SUPER RTL NOW, VOX NOW and n-tv NOW"""
- _VALID_URL = r'(?:http://)?(?P<url>(?P<domain>rtl-now\.rtl\.de|rtl2now\.rtl2\.de|(?:www\.)?voxnow\.de|(?:www\.)?rtlnitronow\.de|(?:www\.)?superrtlnow\.de|(?:www\.)?n-tvnow\.de)/+[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P<video_id>[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)'
- _TESTS = [{
- 'url': 'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1',
- 'file': '90419.flv',
- 'info_dict': {
- 'upload_date': '20070416',
- 'title': 'Ahornallee - Folge 1 - Der Einzug',
- 'description': 'Folge 1 - Der Einzug',
- },
- 'params': {
- 'skip_download': True,
- },
- 'skip': 'Only works from Germany',
- },
- {
- 'url': 'http://rtl2now.rtl2.de/aerger-im-revier/episode-15-teil-1.php?film_id=69756&player=1&season=2&index=5',
- 'file': '69756.flv',
- 'info_dict': {
- 'upload_date': '20120519',
- 'title': 'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit...',
- 'description': 'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit u.a.',
- 'thumbnail': 'http://autoimg.static-fra.de/rtl2now/219850/1500x1500/image2.jpg',
- },
- 'params': {
- 'skip_download': True,
- },
- 'skip': 'Only works from Germany',
- },
- {
- 'url': 'http://www.voxnow.de/voxtours/suedafrika-reporter-ii.php?film_id=13883&player=1&season=17',
- 'file': '13883.flv',
- 'info_dict': {
- 'upload_date': '20090627',
- 'title': 'Voxtours - Südafrika-Reporter II',
- 'description': 'Südafrika-Reporter II',
- },
- 'params': {
- 'skip_download': True,
+ _VALID_URL = r'''(?x)
+ (?:https?://)?
+ (?P<url>
+ (?P<domain>
+ rtl-now\.rtl\.de|
+ rtl2now\.rtl2\.de|
+ (?:www\.)?voxnow\.de|
+ (?:www\.)?rtlnitronow\.de|
+ (?:www\.)?superrtlnow\.de|
+ (?:www\.)?n-tvnow\.de)
+ /+[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?
+ (?:container_id|film_id)=(?P<video_id>[0-9]+)&
+ player=1(?:&season=[0-9]+)?(?:&.*)?
+ )'''
+
+ _TESTS = [
+ {
+ 'url': 'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1',
+ 'info_dict': {
+ 'id': '90419',
+ 'ext': 'flv',
+ 'title': 'Ahornallee - Folge 1 - Der Einzug',
+ 'description': 'md5:ce843b6b5901d9a7f7d04d1bbcdb12de',
+ 'upload_date': '20070416',
+ 'duration': 1685,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'skip': 'Only works from Germany',
},
- },
- {
- 'url': 'http://superrtlnow.de/medicopter-117/angst.php?film_id=99205&player=1',
- 'file': '99205.flv',
- 'info_dict': {
- 'upload_date': '20080928',
- 'title': 'Medicopter 117 - Angst!',
- 'description': 'Angst!',
- 'thumbnail': 'http://autoimg.static-fra.de/superrtlnow/287529/1500x1500/image2.jpg'
+ {
+ 'url': 'http://rtl2now.rtl2.de/aerger-im-revier/episode-15-teil-1.php?film_id=69756&player=1&season=2&index=5',
+ 'info_dict': {
+ 'id': '69756',
+ 'ext': 'flv',
+ 'title': 'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit u.a.',
+ 'description': 'md5:3fb247005ed21a935ffc82b7dfa70cf0',
+ 'thumbnail': 'http://autoimg.static-fra.de/rtl2now/219850/1500x1500/image2.jpg',
+ 'upload_date': '20120519',
+ 'duration': 1245,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'skip': 'Only works from Germany',
},
- 'params': {
- 'skip_download': True,
+ {
+ 'url': 'http://www.voxnow.de/voxtours/suedafrika-reporter-ii.php?film_id=13883&player=1&season=17',
+ 'info_dict': {
+ 'id': '13883',
+ 'ext': 'flv',
+ 'title': 'Voxtours - Südafrika-Reporter II',
+ 'description': 'md5:de7f8d56be6fd4fed10f10f57786db00',
+ 'upload_date': '20090627',
+ 'duration': 1800,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
},
- },
- {
- 'url': 'http://www.n-tvnow.de/top-gear/episode-1-2013-01-01-00-00-00.php?film_id=124903&player=1&season=10',
- 'file': '124903.flv',
- 'info_dict': {
- 'upload_date': '20130101',
- 'title': 'Top Gear vom 01.01.2013',
- 'description': 'Episode 1',
+ {
+ 'url': 'http://superrtlnow.de/medicopter-117/angst.php?film_id=99205&player=1',
+ 'info_dict': {
+ 'id': '99205',
+ 'ext': 'flv',
+ 'title': 'Medicopter 117 - Angst!',
+ 'description': 'md5:895b1df01639b5f61a04fc305a5cb94d',
+ 'thumbnail': 'http://autoimg.static-fra.de/superrtlnow/287529/1500x1500/image2.jpg',
+ 'upload_date': '20080928',
+ 'duration': 2691,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
},
- 'params': {
- 'skip_download': True,
+ {
+ 'url': 'http://www.n-tvnow.de/deluxe-alles-was-spass-macht/thema-ua-luxushotel-fuer-vierbeiner.php?container_id=153819&player=1&season=0',
+ 'info_dict': {
+ 'id': '153819',
+ 'ext': 'flv',
+ 'title': 'Deluxe - Alles was Spaß macht - Thema u.a.: Luxushotel für Vierbeiner',
+ 'description': 'md5:c3705e1bb32e1a5b2bcd634fc065c631',
+ 'thumbnail': 'http://autoimg.static-fra.de/ntvnow/383157/1500x1500/image2.jpg',
+ 'upload_date': '20140221',
+ 'duration': 2429,
+ },
+ 'skip': 'Only works from Germany',
},
- 'skip': 'Only works from Germany',
- }]
+ ]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
-
- webpage_url = 'http://' + mobj.group('url')
- video_page_url = 'http://' + mobj.group('domain') + '/'
+ video_page_url = 'http://%s/' % mobj.group('domain')
video_id = mobj.group('video_id')
- webpage = self._download_webpage(webpage_url, video_id)
+ webpage = self._download_webpage('http://' + mobj.group('url'), video_id)
- note_m = re.search(r'''(?sx)
- <div[ ]style="margin-left:[ ]20px;[ ]font-size:[ ]13px;">(.*?)
- <div[ ]id="playerteaser">''', webpage)
- if note_m:
- msg = clean_html(note_m.group(1))
- raise ExtractorError(msg)
+ mobj = re.search(r'(?s)<div style="margin-left: 20px; font-size: 13px;">(.*?)<div id="playerteaser">', webpage)
+ if mobj:
+ raise ExtractorError(clean_html(mobj.group(1)), expected=True)
- video_title = self._html_search_regex(
- r'<title>(?P<title>[^<]+?)( \| [^<]*)?</title>',
- webpage, 'title')
- playerdata_url = self._html_search_regex(
- r'\'playerdata\': \'(?P<playerdata_url>[^\']+)\'',
- webpage, 'playerdata_url')
+ title = self._og_search_title(webpage)
+ description = self._og_search_description(webpage)
+ thumbnail = self._og_search_thumbnail(webpage, default=None)
- playerdata = self._download_webpage(playerdata_url, video_id)
- mobj = re.search(r'<title><!\[CDATA\[(?P<description>.+?)(?:\s+- (?:Sendung )?vom (?P<upload_date_d>[0-9]{2})\.(?P<upload_date_m>[0-9]{2})\.(?:(?P<upload_date_Y>[0-9]{4})|(?P<upload_date_y>[0-9]{2})) [0-9]{2}:[0-9]{2} Uhr)?\]\]></title>', playerdata)
- if mobj:
- video_description = mobj.group('description')
- if mobj.group('upload_date_Y'):
- video_upload_date = mobj.group('upload_date_Y')
- elif mobj.group('upload_date_y'):
- video_upload_date = '20' + mobj.group('upload_date_y')
- else:
- video_upload_date = None
- if video_upload_date:
- video_upload_date += mobj.group('upload_date_m') + mobj.group('upload_date_d')
- else:
- video_description = None
- video_upload_date = None
- self._downloader.report_warning('Unable to extract description and upload date')
+ upload_date = unified_strdate(self._html_search_meta('uploadDate', webpage, 'upload date'))
- # Thumbnail: not every video has an thumbnail
- mobj = re.search(r'<meta property="og:image" content="(?P<thumbnail>[^"]+)">', webpage)
- if mobj:
- video_thumbnail = mobj.group('thumbnail')
- else:
- video_thumbnail = None
+ mobj = re.search(r'<meta itemprop="duration" content="PT(?P<seconds>\d+)S" />', webpage)
+ duration = int(mobj.group('seconds')) if mobj else None
- mobj = re.search(r'<filename [^>]+><!\[CDATA\[(?P<url>rtmpe://(?:[^/]+/){2})(?P<play_path>[^\]]+)\]\]></filename>', playerdata)
- if mobj is None:
- raise ExtractorError('Unable to extract media URL')
- video_url = mobj.group('url')
- video_play_path = 'mp4:' + mobj.group('play_path')
- video_player_url = video_page_url + 'includes/vodplayer.swf'
+ playerdata_url = self._html_search_regex(
+ r"'playerdata': '(?P<playerdata_url>[^']+)'", webpage, 'playerdata_url')
+
+ playerdata = self._download_xml(playerdata_url, video_id, 'Downloading player data XML')
+
+ videoinfo = playerdata.find('./playlist/videoinfo')
+
+ formats = []
+ for filename in videoinfo.findall('filename'):
+ mobj = re.search(r'(?P<url>rtmpe://(?:[^/]+/){2})(?P<play_path>.+)', filename.text)
+ if mobj:
+ fmt = {
+ 'url': mobj.group('url'),
+ 'play_path': 'mp4:' + mobj.group('play_path'),
+ 'page_url': video_page_url,
+ 'player_url': video_page_url + 'includes/vodplayer.swf',
+ }
+ else:
+ fmt = {
+ 'url': filename.text,
+ }
+ fmt.update({
+ 'width': int_or_none(filename.get('width')),
+ 'height': int_or_none(filename.get('height')),
+ 'vbr': int_or_none(filename.get('bitrate')),
+ 'ext': 'flv',
+ })
+ formats.append(fmt)
return {
'id': video_id,
- 'url': video_url,
- 'play_path': video_play_path,
- 'page_url': video_page_url,
- 'player_url': video_player_url,
- 'ext': 'flv',
- 'title': video_title,
- 'description': video_description,
- 'upload_date': video_upload_date,
- 'thumbnail': video_thumbnail,
- }
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'upload_date': upload_date,
+ 'duration': duration,
+ 'formats': formats,
+ }
\ No newline at end of file
return self._extract_info_dict(info, full_title, secret_token=token)
class SoundcloudSetIE(SoundcloudIE):
- _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)(?:[?].*)?$'
+ _VALID_URL = r'https?://(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
IE_NAME = 'soundcloud:set'
# it's in tests/test_playlists.py
_TESTS = []
from .subtitles import SubtitlesInfoExtractor
from ..utils import (
+ compat_str,
RegexNotFoundError,
)
class TEDIE(SubtitlesInfoExtractor):
- _VALID_URL=r'''http://www\.ted\.com/
- (
- ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
- |
- ((?P<type_talk>talks)) # We have a simple talk
- )
- (/lang/(.*?))? # The url may contain the language
- /(?P<name>\w+) # Here goes the name and then ".html"
- '''
+ _VALID_URL = r'''(?x)http://www\.ted\.com/
+ (
+ (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
+ |
+ ((?P<type_talk>talks)) # We have a simple talk
+ )
+ (/lang/(.*?))? # The url may contain the language
+ /(?P<name>\w+) # Here goes the name and then ".html"
+ '''
_TEST = {
'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
'file': '102.mp4',
'md5': '4ea1dada91e4174b53dac2bb8ace429d',
'info_dict': {
- "description": "md5:c6fa72e6eedbd938c9caf6b2702f5922",
- "title": "Dan Dennett: The illusion of consciousness"
+ 'title': 'The illusion of consciousness',
+ 'description': ('Philosopher Dan Dennett makes a compelling '
+ 'argument that not only don\'t we understand our own '
+ 'consciousness, but that half the time our brains are '
+ 'actively fooling us.'),
+ 'uploader': 'Dan Dennett',
}
}
- @classmethod
- def suitable(cls, url):
- """Receives a URL and returns True if suitable for this IE."""
- return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
+ _FORMATS_PREFERENCE = {
+ 'low': 1,
+ 'medium': 2,
+ 'high': 3,
+ }
+
+ def _extract_info(self, webpage):
+ info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',
+ webpage, 'info json')
+ return json.loads(info_json)
def _real_extract(self, url):
- m=re.match(self._VALID_URL, url, re.VERBOSE)
+        m = re.match(self._VALID_URL, url)
+ name = m.group('name')
if m.group('type_talk'):
- return self._talk_info(url)
- else :
- playlist_id=m.group('playlist_id')
- name=m.group('name')
- self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
- return [self._playlist_videos_info(url,name,playlist_id)]
+ return self._talk_info(url, name)
+ else:
+ return self._playlist_videos_info(url, name)
-
- def _playlist_videos_info(self, url, name, playlist_id):
+ def _playlist_videos_info(self, url, name):
'''Returns the videos of the playlist'''
- webpage = self._download_webpage(
- url, playlist_id, 'Downloading playlist webpage')
- matches = re.finditer(
- r'<p\s+class="talk-title[^"]*"><a\s+href="(?P<talk_url>/talks/[^"]+\.html)">[^<]*</a></p>',
- webpage)
-
- playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
- webpage, 'playlist title')
+ webpage = self._download_webpage(url, name,
+ 'Downloading playlist webpage')
+ info = self._extract_info(webpage)
+ playlist_info = info['playlist']
playlist_entries = [
- self.url_result(u'http://www.ted.com' + m.group('talk_url'), 'TED')
- for m in matches
+ self.url_result(u'http://www.ted.com/talks/' + talk['slug'], self.ie_key())
+ for talk in info['talks']
]
return self.playlist_result(
- playlist_entries, playlist_id=playlist_id, playlist_title=playlist_title)
+ playlist_entries,
+ playlist_id=compat_str(playlist_info['id']),
+ playlist_title=playlist_info['title'])
- def _talk_info(self, url, video_id=0):
- """Return the video for the talk in the url"""
- m = re.match(self._VALID_URL, url,re.VERBOSE)
- video_name = m.group('name')
- webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
+ def _talk_info(self, url, video_name):
+ webpage = self._download_webpage(url, video_name)
self.report_extraction(video_name)
- # If the url includes the language we get the title translated
- title = self._html_search_regex(r'<span .*?id="altHeadline".+?>(?P<title>.*)</span>',
- webpage, 'title')
- json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
- webpage, 'json data')
- info = json.loads(json_data)
- desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
- webpage, 'description', flags = re.DOTALL)
-
- thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
- webpage, 'thumbnail')
- formats = [{
- 'ext': 'mp4',
- 'url': stream['file'],
- 'format': stream['id']
- } for stream in info['htmlStreams']]
- video_id = info['id']
+ talk_info = self._extract_info(webpage)['talks'][0]
+ formats = [{
+ 'ext': 'mp4',
+ 'url': format_url,
+ 'format_id': format_id,
+ 'format': format_id,
+ 'preference': self._FORMATS_PREFERENCE.get(format_id, -1),
+ } for (format_id, format_url) in talk_info['nativeDownloads'].items()]
+ self._sort_formats(formats)
+
+ video_id = talk_info['id']
# subtitles
- video_subtitles = self.extract_subtitles(video_id, webpage)
+ video_subtitles = self.extract_subtitles(video_id, talk_info)
if self._downloader.params.get('listsubtitles', False):
- self._list_available_subtitles(video_id, webpage)
+ self._list_available_subtitles(video_id, talk_info)
return
return {
'id': video_id,
- 'title': title,
- 'thumbnail': thumbnail,
- 'description': desc,
+ 'title': talk_info['title'],
+ 'uploader': talk_info['speaker'],
+ 'thumbnail': talk_info['thumb'],
+ 'description': self._og_search_description(webpage),
'subtitles': video_subtitles,
'formats': formats,
}
- def _get_available_subtitles(self, video_id, webpage):
- try:
- options = self._search_regex(r'(?:<select name="subtitles_language_select" id="subtitles_language_select">)(.*?)(?:</select>)', webpage, 'subtitles_language_select', flags=re.DOTALL)
- languages = re.findall(r'(?:<option value=")(\S+)"', options)
- if languages:
- sub_lang_list = {}
- for l in languages:
- url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
- sub_lang_list[l] = url
- return sub_lang_list
- except RegexNotFoundError:
+ def _get_available_subtitles(self, video_id, talk_info):
+ languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
+ if languages:
+ sub_lang_list = {}
+ for l in languages:
+ url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
+ sub_lang_list[l] = url
+ return sub_lang_list
+ else:
self._downloader.report_warning(u'video doesn\'t have subtitles')
- return {}
+ return {}
('Found multiple matching extractors: %s' %
' '.join(ie.IE_NAME for ie in matching_extractors)),
expected=True)
+ else:
+ extractor = matching_extractors[0]
num_str = mobj.group('num')
num = int(num_str) if num_str else 0
class ThePlatformIE(InfoExtractor):
_VALID_URL = r'''(?x)
(?:https?://(?:link|player)\.theplatform\.com/[sp]/[^/]+/
- (?P<config>[^/\?]+/(?:swf|config)/select/)?
+ (?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/)?
|theplatform:)(?P<id>[^/\?&]+)'''
_TEST = {
f4m_node = body.find(_x('smil:seq/smil:video'))
if f4m_node is not None:
+ f4m_url = f4m_node.attrib['src']
+ if 'manifest.f4m?' not in f4m_url:
+ f4m_url += '?'
+ # the parameters are from syfy.com, other sites may use others,
+ # they also work for nbc.com
+ f4m_url += '&g=UXWGVKRWHFSP&hdcore=3.0.3'
formats = [{
'ext': 'flv',
- # the parameters are from syfy.com, other sites may use others
- 'url': f4m_node.attrib['src'] + '?g=UXWGVKRWHFSP&hdcore=3.0.3',
+ 'url': f4m_url,
}]
else:
base_url = head.find(_x('smil:meta')).attrib['base']
if mobj.group('config'):
config_url = url+ '&form=json'
config_url = config_url.replace('swf/', 'config/')
+ config_url = config_url.replace('onsite/', 'onsite/config/')
config_json = self._download_webpage(config_url, video_id, u'Downloading config')
config = json.loads(config_json)
- smil_url = config['releaseUrl'] + '&format=SMIL&formats=MPEG4'
+ smil_url = config['releaseUrl'] + '&format=SMIL&formats=MPEG4&manifest=f4m'
else:
smil_url = ('http://link.theplatform.com/s/dJ5BDC/{0}/meta.smil?'
'format=smil&mbr=true'.format(video_id))
import re
from .common import InfoExtractor
-from youtube_dl.utils import ExtractorError
+from ..utils import ExtractorError
class TinyPicIE(InfoExtractor):
--- /dev/null
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
class TruTubeIE(InfoExtractor):
    """Extractor for videos hosted on trutube.tv."""
    _VALID_URL = r'https?://(?:www\.)?trutube\.tv/video/(?P<id>[0-9]+)/.*'
    _TEST = {
        'url': 'http://trutube.tv/video/14880/Ramses-II-Proven-To-Be-A-Red-Headed-Caucasoid-',
        'md5': 'c5b6e301b0a2040b074746cbeaa26ca1',
        'info_dict': {
            'id': '14880',
            'ext': 'flv',
            'title': 'Ramses II - Proven To Be A Red Headed Caucasoid',
            'thumbnail': 're:^http:.*\.jpg$',
        }
    }

    def _real_extract(self, url):
        # The numeric video id is embedded in the URL path.
        video_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(url, video_id)

        title = self._og_search_title(webpage).strip()
        # The player's splash image doubles as the thumbnail; absence is
        # tolerated (fatal=False).
        thumbnail = self._search_regex(
            r"var splash_img = '([^']+)';", webpage, 'thumbnail', fatal=False)

        # Each quality variant is declared inline as
        # "var <key>_video_file = '<url>';".  Earlier matches receive a
        # higher (less negative) quality so page order survives sorting.
        formats = []
        for index, match in enumerate(re.finditer(
                r"var (?P<key>[a-z]+)_video_file\s*=\s*'(?P<url>[^']+)';",
                webpage)):
            formats.append({
                'format_id': match.group('key'),
                'quality': -index,
                'url': match.group('url'),
            })
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'thumbnail': thumbnail,
        }
--- /dev/null
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ unified_strdate,
+ clean_html,
+ int_or_none,
+)
+
+
class TvigleIE(InfoExtractor):
    """Extractor for tvigle.ru (Russian internet television)."""
    IE_NAME = 'tvigle'
    IE_DESC = 'Интернет-телевидение Tvigle.ru'
    _VALID_URL = r'http://(?:www\.)?tvigle\.ru/category/.+?[\?&]v(?:ideo)?=(?P<id>\d+)'

    _TESTS = [
        {
            'url': 'http://www.tvigle.ru/category/cinema/1608/?video=503081',
            'md5': '09afba4616666249f087efc6dcf83cb3',
            'info_dict': {
                'id': '503081',
                'ext': 'flv',
                'title': 'Брат 2 ',
                'description': 'md5:f5a42970f50648cee3d7ad740f3ae769',
                'upload_date': '20110919',
            },
        },
        {
            'url': 'http://www.tvigle.ru/category/men/vysotskiy_vospominaniya02/?flt=196&v=676433',
            'md5': 'e7efe5350dd5011d0de6550b53c3ba7b',
            'info_dict': {
                'id': '676433',
                'ext': 'flv',
                'title': 'Ведущий телепрограммы «60 минут» (США) о Владимире Высоцком',
                'description': 'md5:027f7dc872948f14c96d19b4178428a4',
                'upload_date': '20121218',
            },
        },
    ]

    def _real_extract(self, url):
        """Fetch the per-video XML descriptor and build the info dict."""
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        # Metadata is served as XML by a dedicated endpoint keyed on the id.
        video_data = self._download_xml(
            'http://www.tvigle.ru/xml/single.php?obj=%s' % video_id, video_id, 'Downloading video XML')

        video = video_data.find('./video')

        # Descriptive metadata lives in attributes of the <video> element.
        title = video.get('name')
        description = video.get('anons')
        if description:
            description = clean_html(description)
        thumbnail = video_data.get('img')
        upload_date = unified_strdate(video.get('date'))
        like_count = int_or_none(video.get('vtp'))

        # Each quality variant is a separate attribute; the enumeration
        # index doubles as the relative quality rank (higher = better).
        formats = []
        for num, (format_id, format_note) in enumerate([['low_file', 'SQ'], ['file', 'HQ'], ['hd', 'HD 720']]):
            video_url = video.get(format_id)
            if not video_url:
                continue
            formats.append({
                'url': video_url,
                'format_id': format_id,
                'format_note': format_note,
                'quality': num,
            })

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'upload_date': upload_date,
            'like_count': like_count,
            # NOTE(review): age limit is hard-coded for every video —
            # confirm that all tvigle.ru content really is adult-gated.
            'age_limit': 18,
            'formats': formats,
        }
\ No newline at end of file
import json
from .common import InfoExtractor
+from ..utils import compat_urllib_request
class VeohIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
+ age_limit = 0
+ if 'class="adultwarning-container"' in webpage:
+ self.report_age_confirmation()
+ age_limit = 18
+ request = compat_urllib_request.Request(url)
+ request.add_header('Cookie', 'confirmedAdult=true')
+ webpage = self._download_webpage(request, video_id)
m_youtube = re.search(r'http://www\.youtube\.com/v/(.*?)(\&|")', webpage)
if m_youtube is not None:
'thumbnail': info.get('highResImage') or info.get('medResImage'),
'description': info['description'],
'view_count': info['views'],
+ 'age_limit': age_limit,
}
priority_transport = playlist['priority_transport']
thumbnail = media['picture']
- width = media['width']
- height = media['height']
+ width = int_or_none(media['width'])
+ height = int_or_none(media['height'])
description = media['anons']
title = media['title']
duration = int_or_none(media.get('duration'))
(?P<id>[^&?#]+)'''
_TESTS = [{
'url': 'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280',
- 'file': 'GB1101300280.mp4',
"md5": "06bea460acb744eab74a9d7dcb4bfd61",
'info_dict': {
+ 'id': 'GB1101300280',
+ 'ext': 'mp4',
"upload_date": "20130624",
"uploader": "Hurts",
"title": "Somebody to Die For",
"width": 1920,
"height": 1080,
}
+ }, {
+ 'note': 'v3 SMIL format',
+ 'url': 'http://www.vevo.com/watch/cassadee-pope/i-wish-i-could-break-your-heart/USUV71302923',
+ 'md5': '893ec0e0d4426a1d96c01de8f2bdff58',
+ 'info_dict': {
+ 'id': 'USUV71302923',
+ 'ext': 'mp4',
+ 'upload_date': '20140219',
+ 'uploader': 'Cassadee Pope',
+ 'title': 'I Wish I Could Break Your Heart',
+ 'duration': 226.101,
+ 'age_limit': 0,
+ }
+ }, {
+ 'note': 'Age-limited video',
+ 'url': 'https://www.vevo.com/watch/justin-timberlake/tunnel-vision-explicit/USRV81300282',
+ 'info_dict': {
+ 'id': 'USRV81300282',
+ 'ext': 'mp4',
+ 'age_limit': 18,
+ 'title': 'Tunnel Vision (Explicit)',
+ 'uploader': 'Justin Timberlake',
+ 'upload_date': '20130704',
+ },
+ 'params': {
+ 'skip_download': 'true',
+ }
}]
_SMIL_BASE_URL = 'http://smil.lvl3.vevo.com/'
video_info = self._download_json(json_url, video_id)['video']
formats = self._formats_from_json(video_info)
+
+ is_explicit = video_info.get('isExplicit')
+ if is_explicit is True:
+ age_limit = 18
+ elif is_explicit is False:
+ age_limit = 0
+ else:
+ age_limit = None
+
+ # Download SMIL
+ smil_blocks = sorted((
+ f for f in video_info['videoVersions']
+ if f['sourceType'] == 13),
+ key=lambda f: f['version'])
+
+ smil_url = '%s/Video/V2/VFILE/%s/%sr.smil' % (
+ self._SMIL_BASE_URL, video_id, video_id.lower())
+ if smil_blocks:
+ smil_url_m = self._search_regex(
+ r'url="([^"]+)"', smil_blocks[-1]['data'], 'SMIL URL',
+ fatal=False)
+ if smil_url_m is not None:
+ smil_url = smil_url_m
+
try:
- smil_url = '%s/Video/V2/VFILE/%s/%sr.smil' % (
- self._SMIL_BASE_URL, video_id, video_id.lower())
smil_xml = self._download_webpage(smil_url, video_id,
'Downloading SMIL info')
formats.extend(self._formats_from_smil(smil_xml))
'upload_date': upload_date.strftime('%Y%m%d'),
'uploader': video_info['mainArtists'][0]['artistName'],
'duration': video_info['duration'],
+ 'age_limit': age_limit,
}
--- /dev/null
+from __future__ import unicode_literals
+
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
class VideoBamIE(InfoExtractor):
    """Extractor for videobam.com videos."""
    _VALID_URL = r'http://(?:www\.)?videobam\.com/(?:videos/download/)?(?P<id>[a-zA-Z]+)'

    _TESTS = [
        {
            'url': 'http://videobam.com/OiJQM',
            'md5': 'db471f27763a531f10416a0c58b5a1e0',
            'info_dict': {
                'id': 'OiJQM',
                'ext': 'mp4',
                'title': 'Is Alcohol Worse Than Ecstasy?',
                'description': 'md5:d25b96151515c91debc42bfbb3eb2683',
                'uploader': 'frihetsvinge',
            },
        },
        {
            'url': 'http://videobam.com/pqLvq',
            'md5': 'd9a565b5379a99126ef94e1d7f9a383e',
            'note': 'HD video',
            'info_dict': {
                'id': 'pqLvq',
                'ext': 'mp4',
            }
        },
    ]

    def _real_extract(self, url):
        """Scrape the watch page for inline format URLs and metadata."""
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        page = self._download_webpage('http://videobam.com/%s' % video_id, video_id, 'Downloading page')

        formats = []

        # Inline player variables expose up to two direct MP4 URLs;
        # 'high' is enumerated after 'low' and so gets the higher preference.
        for preference, format_id in enumerate(['low', 'high']):
            mobj = re.search(r"%s: '(?P<url>[^']+)'" % format_id, page)
            if not mobj:
                continue
            formats.append({
                'url': mobj.group('url'),
                'ext': 'mp4',
                'format_id': format_id,
                'preference': preference,
            })

        # Fall back to the JSON player config when no inline URLs are found.
        if not formats:
            player_config = json.loads(self._html_search_regex(r'var player_config = ({.+?});', page, 'player config'))
            formats = [{
                'url': item['url'],
                'ext': 'mp4',
            } for item in player_config['playlist'] if 'autoPlay' in item]

        self._sort_formats(formats)

        # All metadata fields are optional; sensible defaults are used.
        title = self._og_search_title(page, default='VideoBam', fatal=False)
        description = self._og_search_description(page, default=None)
        thumbnail = self._og_search_thumbnail(page)
        uploader = self._html_search_regex(r'Upload by ([^<]+)</a>', page, 'uploader', fatal=False, default=None)
        view_count = int_or_none(
            self._html_search_regex(r'<strong>Views:</strong> (\d+) ', page, 'view count', fatal=False))

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'uploader': uploader,
            'view_count': view_count,
            'formats': formats,
            # NOTE(review): age limit is hard-coded to 18 for every video —
            # confirm this is intentional for all videobam content.
            'age_limit': 18,
        }
\ No newline at end of file
# Extract video thumbnail
video_thumbnail = config["video"].get("thumbnail")
if video_thumbnail is None:
- _, video_thumbnail = sorted((int(width), t_url) for (width, t_url) in config["video"]["thumbs"].items())[-1]
+ video_thumbs = config["video"].get("thumbs")
+ if video_thumbs and isinstance(video_thumbs, dict):
+ _, video_thumbnail = sorted((int(width), t_url) for (width, t_url) in video_thumbs.items())[-1]
# Extract video description
video_description = None
from __future__ import unicode_literals
import re
+import json
from .common import InfoExtractor
+from ..utils import unified_strdate
class VineIE(InfoExtractor):
'info_dict': {
'id': 'b9KOOWX7HUx',
'ext': 'mp4',
- 'uploader': 'Jack Dorsey',
'title': 'Chicken.',
+ 'description': 'Chicken.',
+ 'upload_date': '20130519',
+ 'uploader': 'Jack Dorsey',
+ 'uploader_id': '76',
},
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
-
video_id = mobj.group('id')
- webpage_url = 'https://vine.co/v/' + video_id
- webpage = self._download_webpage(webpage_url, video_id)
- self.report_extraction(video_id)
+ webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)
- video_url = self._html_search_meta('twitter:player:stream', webpage,
- 'video URL')
+ data = json.loads(self._html_search_regex(
+ r'window\.POST_DATA = { %s: ({.+?}) }' % video_id, webpage, 'vine data'))
- uploader = self._html_search_regex(r'<p class="username">(.*?)</p>',
- webpage, 'uploader', fatal=False, flags=re.DOTALL)
+ formats = [
+ {
+ 'url': data['videoLowURL'],
+ 'ext': 'mp4',
+ 'format_id': 'low',
+ },
+ {
+ 'url': data['videoUrl'],
+ 'ext': 'mp4',
+ 'format_id': 'standard',
+ }
+ ]
return {
'id': video_id,
- 'url': video_url,
- 'ext': 'mp4',
'title': self._og_search_title(webpage),
- 'thumbnail': self._og_search_thumbnail(webpage),
- 'uploader': uploader,
- }
+ 'description': data['description'],
+ 'thumbnail': data['thumbnailUrl'],
+ 'upload_date': unified_strdate(data['created']),
+ 'uploader': data['username'],
+ 'uploader_id': data['userIdStr'],
+ 'like_count': data['likes']['count'],
+ 'comment_count': data['comments']['count'],
+ 'repost_count': data['reposts']['count'],
+ 'formats': formats,
+ }
\ No newline at end of file
class VKIE(InfoExtractor):
IE_NAME = 'vk.com'
- _VALID_URL = r'https?://vk\.com/(?:videos.*?\?.*?z=)?video(?P<id>.*?)(?:\?|%2F|$)'
+ _VALID_URL = r'https?://vk\.com/(?:video_ext\.php\?.*?\boid=(?P<oid>\d+).*?\bid=(?P<id>\d+)|(?:videos.*?\?.*?z=)?video(?P<videoid>.*?)(?:\?|%2F|$))'
_NETRC_MACHINE = 'vk'
_TESTS = [
'duration': 558,
}
},
+ {
+ 'note': 'Embedded video',
+ 'url': 'http://vk.com/video_ext.php?oid=32194266&id=162925554&hash=7d8c2e0d5e05aeaa&hd=1',
+ 'md5': 'c7ce8f1f87bec05b3de07fdeafe21a0a',
+ 'info_dict': {
+ 'id': '162925554',
+ 'ext': 'mp4',
+ 'uploader': 'Vladimir Gavrin',
+ 'title': 'Lin Dan',
+ 'duration': 101,
+ }
+ },
{
'url': 'http://vk.com/video-8871596_164049491',
'md5': 'a590bcaf3d543576c9bd162812387666',
'duration': 8352,
},
'skip': 'Requires vk account credentials',
- }
+ },
]
def _login(self):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = mobj.group('videoid')
+
+ if not video_id:
+ video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id'))
info_url = 'http://vk.com/al_video.php?act=show&al=1&video=%s' % video_id
info_page = self._download_webpage(info_url, video_id)
webpage_src = self._download_webpage(url, video_id)
m_vevo_id = re.search(r'videoId=(.*?)&?',
- webpage_src)
-
+ webpage_src)
+
if m_vevo_id is not None:
self.to_screen(u'Vevo video detected:')
return self.url_result('vevo:%s' % m_vevo_id.group(1), ie='Vevo')
}]
if not hd:
+ mrss_url = self._search_regex(r'<link rel="canonical" href="([^"]+)', webpage, 'mrss_url')
webpage = self._download_webpage(mrss_url + '?hd', video_id, note='Downloading HD webpage')
if is_hd(webpage):
video_url = extract_video_url(webpage)
from ..utils import (
compat_urllib_parse_urlparse,
compat_urllib_request,
+ parse_duration,
+ str_to_int,
)
+
class XTubeIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>xtube\.com/watch\.php\?v=(?P<videoid>[^/?&]+))'
+ _VALID_URL = r'https?://(?:www\.)?(?P<url>xtube\.com/watch\.php\?v=(?P<videoid>[^/?&]+))'
_TEST = {
'url': 'http://www.xtube.com/watch.php?v=kVTUy_G222_',
- 'file': 'kVTUy_G222_.mp4',
'md5': '092fbdd3cbe292c920ef6fc6a8a9cdab',
'info_dict': {
- "title": "strange erotica",
- "description": "surreal gay themed erotica...almost an ET kind of thing",
- "uploader": "greenshowers",
- "age_limit": 18,
+ 'id': 'kVTUy_G222_',
+ 'ext': 'mp4',
+ 'title': 'strange erotica',
+ 'description': 'surreal gay themed erotica...almost an ET kind of thing',
+ 'uploader': 'greenshowers',
+ 'duration': 450,
+ 'age_limit': 18,
}
}
req.add_header('Cookie', 'age_verified=1')
webpage = self._download_webpage(req, video_id)
- video_title = self._html_search_regex(r'<div class="p_5px[^>]*>([^<]+)', webpage, 'title')
- video_uploader = self._html_search_regex(r'so_s\.addVariable\("owner_u", "([^"]+)', webpage, 'uploader', fatal=False)
- video_description = self._html_search_regex(r'<p class="video_description">([^<]+)', webpage, 'description', fatal=False)
- video_url= self._html_search_regex(r'var videoMp4 = "([^"]+)', webpage, 'video_url').replace('\\/', '/')
+ video_title = self._html_search_regex(r'<p class="title">([^<]+)', webpage, 'title')
+ video_uploader = self._html_search_regex(
+ r'so_s\.addVariable\("owner_u", "([^"]+)', webpage, 'uploader', fatal=False)
+ video_description = self._html_search_regex(
+ r'<p class="fieldsDesc">([^<]+)', webpage, 'description', fatal=False)
+ video_url = self._html_search_regex(r'var videoMp4 = "([^"]+)', webpage, 'video_url').replace('\\/', '/')
+ duration = parse_duration(self._html_search_regex(
+ r'<span class="bold">Runtime:</span> ([^<]+)</p>', webpage, 'duration', fatal=False))
+ view_count = self._html_search_regex(
+ r'<span class="bold">Views:</span> ([\d,\.]+)</p>', webpage, 'view count', fatal=False)
+ if view_count:
+ view_count = str_to_int(view_count)
+ comment_count = self._html_search_regex(
+ r'<div id="commentBar">([\d,\.]+) Comments</div>', webpage, 'comment count', fatal=False)
+ if comment_count:
+ comment_count = str_to_int(comment_count)
+
path = compat_urllib_parse_urlparse(video_url).path
extension = os.path.splitext(path)[1][1:]
format = path.split('/')[5].split('_')[:2]
'title': video_title,
'uploader': video_uploader,
'description': video_description,
+ 'duration': duration,
+ 'view_count': view_count,
+ 'comment_count': comment_count,
'url': video_url,
'ext': extension,
'format': format,
ExtractorError,
int_or_none,
PagedList,
- RegexNotFoundError,
unescapeHTML,
unified_strdate,
orderedSet,
'135': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'DASH video', 'preference': -40},
'136': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'DASH video', 'preference': -40},
'137': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
- '138': {'ext': 'mp4', 'height': 1081, 'resolution': '>1080p', 'format_note': 'DASH video', 'preference': -40},
+ '138': {'ext': 'mp4', 'height': 2160, 'resolution': '2160p', 'format_note': 'DASH video', 'preference': -40},
'160': {'ext': 'mp4', 'height': 192, 'resolution': '192p', 'format_note': 'DASH video', 'preference': -40},
- '264': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
+ '264': {'ext': 'mp4', 'height': 1440, 'resolution': '1440p', 'format_note': 'DASH video', 'preference': -40},
# Dash mp4 audio
'139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
# the id of the playlist is just 'RD' + video_id
url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
- title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
- get_element_by_attribute('class', 'title ', webpage))
+ search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
+ title_span = (search_title('playlist-title') or
+ search_title('title long-title') or search_title('title'))
title = clean_html(title_span)
- video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s' % re.escape(playlist_id)
- ids = orderedSet(re.findall(video_re, webpage))
+ video_re = r'''(?x)data-video-username="(.*?)".*?
+ href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id)
+ matches = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))
+ # Some of the videos may have been deleted, their username field is empty
+ ids = [video_id for (username, video_id) in matches if username]
url_results = self._ids_to_results(ids)
return self.playlist_result(url_results, playlist_id, title)
class YoutubeUserIE(InfoExtractor):
IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
- _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
+ _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
_TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
_GDATA_PAGE_SIZE = 50
_GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
for video_id in video_ids]
return self.playlist_result(videos, query)
+
class YoutubeSearchDateIE(YoutubeSearchIE):
IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
_API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
_SEARCH_KEY = 'ytsearchdate'
IE_DESC = u'YouTube.com searches, newest videos first'
+
class YoutubeSearchURLIE(InfoExtractor):
    """Resolves a YouTube results-page URL into a playlist of video links."""
    IE_DESC = u'YouTube.com search URLs'
    IE_NAME = u'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'

    def _real_extract(self, url):
        # The decoded query doubles as the playlist title and as the id
        # shown in download progress messages.
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))
        webpage = self._download_webpage(url, query)

        # Restrict scraping to the ordered list holding the search results.
        results_html = self._search_regex(
            r'(?s)<ol id="search-results"(.*?)</ol>', webpage, u'result HTML')

        entries = []
        for snippet in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', results_html):
            # Title is optional (fatal=False); the href is required.
            item_title = self._html_search_regex(
                r'(?s)title="([^"]+)"', snippet, 'item title', fatal=False)
            href = self._html_search_regex(
                r'(?s)href="([^"]+)"', snippet, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', href),
                'title': item_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
+
+
class YoutubeShowIE(InfoExtractor):
IE_DESC = u'YouTube.com (multi-season) shows'
_VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
# coding: utf-8
+from __future__ import unicode_literals
import re
_VALID_URL = r'^https?://www\.zdf\.de/ZDFmediathek(?P<hash>#)?/(.*beitrag/(?:video/)?)(?P<video_id>[0-9]+)(?:/[^/?]+)?(?:\?.*)?'
_TEST = {
- u"url": u"http://www.zdf.de/ZDFmediathek/beitrag/video/2037704/ZDFspezial---Ende-des-Machtpokers--?bc=sts;stt",
- u"file": u"2037704.webm",
- u"info_dict": {
- u"upload_date": u"20131127",
- u"description": u"Union und SPD haben sich auf einen Koalitionsvertrag geeinigt. Aber was bedeutet das für die Bürger? Sehen Sie hierzu das ZDFspezial \"Ende des Machtpokers - Große Koalition für Deutschland\".",
- u"uploader": u"spezial",
- u"title": u"ZDFspezial - Ende des Machtpokers"
+ 'url': 'http://www.zdf.de/ZDFmediathek/beitrag/video/2037704/ZDFspezial---Ende-des-Machtpokers--?bc=sts;stt',
+ 'info_dict': {
+ 'id': '2037704',
+ 'ext': 'webm',
+ 'title': 'ZDFspezial - Ende des Machtpokers',
+ 'description': 'Union und SPD haben sich auf einen Koalitionsvertrag geeinigt. Aber was bedeutet das für die Bürger? Sehen Sie hierzu das ZDFspezial "Ende des Machtpokers - Große Koalition für Deutschland".',
+ 'duration': 1022,
+ 'uploader': 'spezial',
+ 'uploader_id': '225948',
+ 'upload_date': '20131127',
},
- u"skip": u"Videos on ZDF.de are depublicised in short order",
+ 'skip': 'Videos on ZDF.de are depublicised in short order',
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('video_id')
- xml_url = u'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id
+ xml_url = 'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id
doc = self._download_xml(
xml_url, video_id,
- note=u'Downloading video info',
- errnote=u'Failed to download video info')
+ note='Downloading video info',
+ errnote='Failed to download video info')
title = doc.find('.//information/title').text
description = doc.find('.//information/detail').text
+ duration = int(doc.find('.//details/lengthSec').text)
uploader_node = doc.find('.//details/originChannelTitle')
uploader = None if uploader_node is None else uploader_node.text
- duration_str = doc.find('.//details/length').text
- duration_m = re.match(r'''(?x)^
- (?P<hours>[0-9]{2})
- :(?P<minutes>[0-9]{2})
- :(?P<seconds>[0-9]{2})
- (?:\.(?P<ms>[0-9]+)?)
- ''', duration_str)
- duration = (
- (
- (int(duration_m.group('hours')) * 60 * 60) +
- (int(duration_m.group('minutes')) * 60) +
- int(duration_m.group('seconds'))
- )
- if duration_m
- else None
- )
+ uploader_id_node = doc.find('.//details/originChannelId')
+ uploader_id = None if uploader_id_node is None else uploader_id_node.text
upload_date = unified_strdate(doc.find('.//details/airtime').text)
def xml_to_format(fnode):
video_url = fnode.find('url').text
- is_available = u'http://www.metafilegenerator' not in video_url
+ is_available = 'http://www.metafilegenerator' not in video_url
format_id = fnode.attrib['basetype']
format_m = re.match(r'''(?x)
quality = fnode.find('./quality').text
abr = int(fnode.find('./audioBitrate').text) // 1000
- vbr = int(fnode.find('./videoBitrate').text) // 1000
+ vbr_node = fnode.find('./videoBitrate')
+ vbr = None if vbr_node is None else int(vbr_node.text) // 1000
- format_note = u''
+ width_node = fnode.find('./width')
+ width = None if width_node is None else int_or_none(width_node.text)
+ height_node = fnode.find('./height')
+ height = None if height_node is None else int_or_none(height_node.text)
+
+ format_note = ''
if not format_note:
format_note = None
return {
- 'format_id': format_id + u'-' + quality,
+ 'format_id': format_id + '-' + quality,
'url': video_url,
'ext': ext,
'acodec': format_m.group('acodec'),
'vcodec': format_m.group('vcodec'),
'abr': abr,
'vbr': vbr,
- 'width': int_or_none(fnode.find('./width').text),
- 'height': int_or_none(fnode.find('./height').text),
+ 'width': width,
+ 'height': height,
'filesize': int_or_none(fnode.find('./filesize').text),
'format_note': format_note,
'protocol': proto,
return {
'id': video_id,
'title': title,
- 'formats': formats,
'description': description,
- 'uploader': uploader,
'duration': duration,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
'upload_date': upload_date,
- }
+ 'formats': formats,
+ }
\ No newline at end of file
#!/usr/bin/env python
# -*- coding: utf-8 -*-
+import contextlib
import ctypes
import datetime
import email.utils
'%B %d %Y',
'%b %d %Y',
'%Y-%m-%d',
+ '%d.%m.%Y',
'%d/%m/%Y',
'%Y/%m/%d %H:%M:%S',
'%Y-%m-%d %H:%M:%S',
'%Y-%m-%dT%H:%M:%S.%fZ',
'%Y-%m-%dT%H:%M:%S.%f0Z',
'%Y-%m-%dT%H:%M:%S',
+ '%Y-%m-%dT%H:%M:%S.%f',
'%Y-%m-%dT%H:%M',
]
for expression in format_expressions:
else:
struct_pack = struct.pack
struct_unpack = struct.unpack
+
+
def read_batch_urls(batch_fd):
    """Read URLs from an open batch-file object.

    Decodes byte lines as UTF-8, strips a leading byte-order mark, trims
    whitespace, and drops empty lines and comment lines starting with
    '#', ';' or ']'.  Returns the list of remaining URLs; the file object
    is closed afterwards.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # A UTF-8 BOM becomes u'\ufeff' after a correct decode; the raw
        # u'\xef\xbb\xbf' form shows up when the bytes were mis-decoded
        # (e.g. as latin-1).  Strip either so the first URL is not
        # corrupted by an invisible prefix.
        for bom in (u'\xef\xbb\xbf', u'\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
-__version__ = '2014.02.21.1'
+__version__ = '2014.03.04.2'