X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2FInfoExtractors.py;h=f69bad4f3a77d49be2d8b9cac3c6c256f4ee9898;hb=64c78d50ccf05f34e27b652530fc8b702aa54122;hp=3715bef5084ddfb9ce76e3525373ca806691c9ea;hpb=92e3e18a1d1fb35b6d73b609725c250123835969;p=youtube-dl diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 3715bef50..f69bad4f3 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -3,7 +3,9 @@ from __future__ import absolute_import +import base64 import datetime +import itertools import netrc import os import re @@ -13,6 +15,7 @@ import email.utils import xml.etree.ElementTree import random import math +import operator from .utils import * @@ -34,15 +37,16 @@ class InfoExtractor(object): url: Final video URL. title: Video title, unescaped. ext: Video filename extension. - uploader: Full name of the video uploader. - upload_date: Video upload date (YYYYMMDD). The following fields are optional: format: The video format, defaults to ext (used for --get-format) thumbnail: Full URL to a video thumbnail image. description: One-line video description. + uploader: Full name of the video uploader. + upload_date: Video upload date (YYYYMMDD). uploader_id: Nickname or id of the video uploader. + location: Physical location of the video. player_url: SWF Player URL (used for rtmpdump). subtitles: The .srt file contents. urlhandle: [internal] The urlHandle to be used to download the file, @@ -70,13 +74,15 @@ class InfoExtractor(object): self._ready = False self.set_downloader(downloader) - def suitable(self, url): + @classmethod + def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - return re.match(self._VALID_URL, url) is not None + return re.match(cls._VALID_URL, url) is not None - def working(self): + @classmethod + def working(cls): """Getter method for _WORKING.""" - return self._WORKING + return cls._WORKING def initialize(self): """Initializes an instance (authentication, etc).""" @@ -105,6 +111,25 @@ class InfoExtractor(object): def IE_NAME(self): return type(self).__name__[:-2] + def _request_webpage(self, url_or_request, video_id, note=None, errnote=None): + """ Returns the response handle """ + if note is None: + note = u'Downloading video webpage' + self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note)) + try: + return compat_urllib_request.urlopen(url_or_request) + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + if errnote is None: + errnote = u'Unable to download webpage' + raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2]) + + def _download_webpage(self, url_or_request, video_id, note=None, errnote=None): + """ Returns the data of the page as a string """ + urlh = self._request_webpage(url_or_request, video_id, note, errnote) + webpage_bytes = urlh.read() + return webpage_bytes.decode('utf-8', 'replace') + + class YoutubeIE(InfoExtractor): """Information extractor for youtube.com.""" @@ -114,7 +139,6 @@ class YoutubeIE(InfoExtractor): (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/| tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? # handle anchor (#/) redirect urls - (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs (?: # the various things that can precede the ID: (?:(?:v|embed|e)/) # v/ or embed/ or e/ |(?: # or the v= param in all its forms @@ -129,7 +153,7 @@ class YoutubeIE(InfoExtractor): (?(1).+)? # if we found the ID, everything can follow $""" _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' - _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en' + _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' _NETRC_MACHINE = 'youtube' @@ -166,9 +190,11 @@ class YoutubeIE(InfoExtractor): } IE_NAME = u'youtube' - def suitable(self, url): + @classmethod + def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - return re.match(self._VALID_URL, url, re.VERBOSE) is not None + if YoutubePlaylistIE.suitable(url): return False + return re.match(cls._VALID_URL, url, re.VERBOSE) is not None def report_lang(self): """Report attempt to set language.""" @@ -242,13 +268,18 @@ class YoutubeIE(InfoExtractor): srt_lang = list(srt_lang_list.keys())[0] if not srt_lang in srt_lang_list: return (u'WARNING: no closed captions found in the specified language', None) - request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id)) + params = compat_urllib_parse.urlencode({ + 'lang': srt_lang, + 'name': srt_lang_list[srt_lang].encode('utf-8'), + 'v': video_id, + }) + url = 'http://www.youtube.com/api/timedtext?' + params try: - srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8') + srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None) if not srt_xml: - return (u'WARNING: unable to download video subtitles', None) + return (u'WARNING: Did not fetch video subtitles', None) return (None, self._closed_captions_xml_to_srt(srt_xml)) def _print_formats(self, formats): @@ -277,7 +308,7 @@ class YoutubeIE(InfoExtractor): else: raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) except (IOError, netrc.NetrcParseError) as err: - self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err)) + self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err)) return # Set language @@ -286,30 +317,65 @@ class YoutubeIE(InfoExtractor): self.report_lang() compat_urllib_request.urlopen(request).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err)) + self._downloader.report_warning(u'unable to set language: %s' % compat_str(err)) return # No authentication to be performed if username is None: return + request = compat_urllib_request.Request(self._LOGIN_URL) + try: + login_page = compat_urllib_request.urlopen(request).read().decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err)) + return + + galx = None + dsh = None + match = re.search(re.compile(r']* name="loginForm"', login_results) is not None: - self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password') + if re.search(r'(?i)]* id="gaia_loginform"', login_results) is not None: + self._downloader.report_warning(u'unable to log in: bad username or password') return except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err)) + self._downloader.report_warning(u'unable to log in: %s' % compat_str(err)) return # Confirm age @@ -397,7 +463,7 @@ class YoutubeIE(InfoExtractor): # uploader_id video_uploader_id = None - mobj = re.search(r'', video_webpage) + mobj = re.search(r'', video_webpage) if mobj is not None: video_uploader_id = mobj.group(1) else: @@ -656,14 +722,11 @@ class DailymotionIE(InfoExtractor): _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)' IE_NAME = u'dailymotion' + _WORKING = False def __init__(self, downloader=None): InfoExtractor.__init__(self, downloader) - def report_download_webpage(self, video_id): - """Report webpage download.""" - self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id) - def report_extraction(self, video_id): """Report information extraction.""" self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id) @@ -682,13 +745,7 @@ class DailymotionIE(InfoExtractor): # Retrieve video webpage to extract further information request = compat_urllib_request.Request(url) request.add_header('Cookie', 'family_filter=off') - try: - self.report_download_webpage(video_id) - webpage_bytes = compat_urllib_request.urlopen(request).read() - webpage = webpage_bytes.decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err)) - return + webpage = self._download_webpage(request, video_id) # Extract URL, uploader and title from webpage self.report_extraction(video_id) @@ -961,7 +1018,7 @@ class VimeoIE(InfoExtractor): """Information extractor for vimeo.com.""" # _VALID_URL matches Vimeo URLs - _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)' + _VALID_URL = r'(?Phttps?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?Pplay_redirect_hls\?clip_id=)?(?:videos?/)?(?P[0-9]+)' IE_NAME = u'vimeo' def __init__(self, downloader=None): @@ -982,7 +1039,11 @@ class VimeoIE(InfoExtractor): self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) return - video_id = mobj.group(1) + video_id = mobj.group('id') + if not mobj.group('proto'): + url = 'https://' + url + if mobj.group('direct_link'): + url = 'https://vimeo.com/' + video_id # Retrieve video webpage to extract further information request = compat_urllib_request.Request(url, None, std_headers) @@ -1273,7 +1334,7 @@ class GenericIE(InfoExtractor): opener = compat_urllib_request.OpenerDirector() for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler, HTTPMethodFallback, HEADRedirectHandler, - compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]: + compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]: opener.add_handler(handler()) response = opener.open(HeadRequest(url)) @@ -1309,6 +1370,9 @@ class GenericIE(InfoExtractor): if mobj is None: # Broaden the search a little bit mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage) + if mobj is None: + # Broaden the search a little bit: JWPlayer JS loader + mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage) if mobj is None: self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) return @@ -1392,7 +1456,7 @@ class YoutubeSearchIE(InfoExtractor): self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query)) return elif n > self._max_youtube_results: - self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n)) + self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n)) n = self._max_youtube_results self._download_n_results(query, n) return @@ -1412,12 +1476,16 @@ class YoutubeSearchIE(InfoExtractor): result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1) request = compat_urllib_request.Request(result_url) try: - data = compat_urllib_request.urlopen(request).read() + data = compat_urllib_request.urlopen(request).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err)) return api_response = json.loads(data)['data'] + if not 'items' in api_response: + self._downloader.trouble(u'[youtube] No video results') + return + new_ids = list(video['id'] for video in api_response['items']) video_ids += new_ids @@ -1470,7 +1538,7 @@ class GoogleSearchIE(InfoExtractor): self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query)) return elif n > self._max_google_results: - self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n)) + self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n)) n = self._max_google_results self._download_n_results(query, n) return @@ -1507,2816 +1575,225 @@ class GoogleSearchIE(InfoExtractor): if re.search(self._MORE_PAGES_INDICATOR, page) is None: for id in video_ids: - self._downloader#!/usr/bin/env python -# -*- coding: utf-8 -*- + self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id]) + return -from __future__ import absolute_import + pagenum = pagenum + 1 -import datetime -import netrc -import os -import re -import socket -import time -import email.utils -import xml.etree.ElementTree -import random -import math -from .utils import * +class YahooSearchIE(InfoExtractor): + """Information Extractor for Yahoo! Video search queries.""" + _WORKING = False + _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+' + _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s' + _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"' + _MORE_PAGES_INDICATOR = r'\s*Next' + _max_yahoo_results = 1000 + IE_NAME = u'video.yahoo:search' -class InfoExtractor(object): - """Information Extractor class. + def __init__(self, downloader=None): + InfoExtractor.__init__(self, downloader) - Information extractors are the classes that, given a URL, extract - information about the video (or videos) the URL refers to. This - information includes the real video URL, the video title, author and - others. The information is stored in a dictionary which is then - passed to the FileDownloader. The FileDownloader processes this - information possibly downloading the video to the file system, among - other possible outcomes. + def report_download_page(self, query, pagenum): + """Report attempt to download playlist page with given number.""" + query = query.decode(preferredencoding()) + self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum)) - The dictionaries must include the following fields: + def _real_extract(self, query): + mobj = re.match(self._VALID_URL, query) + if mobj is None: + self._downloader.trouble(u'ERROR: invalid search query "%s"' % query) + return - id: Video identifier. - url: Final video URL. - title: Video title, unescaped. - ext: Video filename extension. - uploader: Full name of the video uploader. - upload_date: Video upload date (YYYYMMDD). + prefix, query = query.split(':') + prefix = prefix[8:] + query = query.encode('utf-8') + if prefix == '': + self._download_n_results(query, 1) + return + elif prefix == 'all': + self._download_n_results(query, self._max_yahoo_results) + return + else: + try: + n = int(prefix) + if n <= 0: + self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query)) + return + elif n > self._max_yahoo_results: + self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n)) + n = self._max_yahoo_results + self._download_n_results(query, n) + return + except ValueError: # parsing prefix as integer fails + self._download_n_results(query, 1) + return - The following fields are optional: + def _download_n_results(self, query, n): + """Downloads a specified number of results for a query""" - format: The video format, defaults to ext (used for --get-format) - thumbnail: Full URL to a video thumbnail image. - description: One-line video description. - uploader_id: Nickname or id of the video uploader. - player_url: SWF Player URL (used for rtmpdump). - subtitles: The .srt file contents. - urlhandle: [internal] The urlHandle to be used to download the file, - like returned by urllib.request.urlopen + video_ids = [] + already_seen = set() + pagenum = 1 - The fields should all be Unicode strings. + while True: + self.report_download_page(query, pagenum) + result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum) + request = compat_urllib_request.Request(result_url) + try: + page = compat_urllib_request.urlopen(request).read() + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err)) + return - Subclasses of this one should re-define the _real_initialize() and - _real_extract() methods and define a _VALID_URL regexp. - Probably, they should also be added to the list of extractors. + # Extract video identifiers + for mobj in re.finditer(self._VIDEO_INDICATOR, page): + video_id = mobj.group(1) + if video_id not in already_seen: + video_ids.append(video_id) + already_seen.add(video_id) + if len(video_ids) == n: + # Specified n videos reached + for id in video_ids: + self._downloader.download(['http://video.yahoo.com/watch/%s' % id]) + return - _real_extract() must return a *list* of information dictionaries as - described above. + if re.search(self._MORE_PAGES_INDICATOR, page) is None: + for id in video_ids: + self._downloader.download(['http://video.yahoo.com/watch/%s' % id]) + return - Finally, the _WORKING attribute should be set to False for broken IEs - in order to warn the users and skip the tests. - """ + pagenum = pagenum + 1 - _ready = False - _downloader = None - _WORKING = True + +class YoutubePlaylistIE(InfoExtractor): + """Information Extractor for YouTube playlists.""" + + _VALID_URL = r"""(?: + (?:https?://)? + (?:\w+\.)? + youtube\.com/ + (?: + (?:course|view_play_list|my_playlists|artist|playlist|watch) + \? (?:.*?&)*? (?:p|a|list)= + | user/.*?/user/ + | p/ + | user/.*?#[pg]/c/ + ) + ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,}) + .* + | + ((?:PL|EC|UU)[0-9A-Za-z-_]{10,}) + )""" + _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json' + _MAX_RESULTS = 50 + IE_NAME = u'youtube:playlist' def __init__(self, downloader=None): - """Constructor. Receives an optional downloader.""" - self._ready = False - self.set_downloader(downloader) + InfoExtractor.__init__(self, downloader) - def suitable(self, url): + @classmethod + def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - return re.match(self._VALID_URL, url) is not None + return re.match(cls._VALID_URL, url, re.VERBOSE) is not None - def working(self): - """Getter method for _WORKING.""" - return self._WORKING + def report_download_page(self, playlist_id, pagenum): + """Report attempt to download playlist page with given number.""" + self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum)) - def initialize(self): - """Initializes an instance (authentication, etc).""" - if not self._ready: - self._real_initialize() - self._ready = True + def _real_extract(self, url): + # Extract playlist id + mobj = re.match(self._VALID_URL, url, re.VERBOSE) + if mobj is None: + self._downloader.trouble(u'ERROR: invalid url: %s' % url) + return - def extract(self, url): - """Extracts URL information and returns it in list of dicts.""" - self.initialize() - return self._real_extract(url) + # Download playlist videos from API + playlist_id = mobj.group(1) or mobj.group(2) + page_num = 1 + videos = [] - def set_downloader(self, downloader): - """Sets the downloader for this IE.""" - self._downloader = downloader + while True: + self.report_download_page(playlist_id, page_num) - def _real_initialize(self): - """Real initialization process. Redefine in subclasses.""" - pass + url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1) + try: + page = compat_urllib_request.urlopen(url).read().decode('utf8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err)) + return + + try: + response = json.loads(page) + except ValueError as err: + self._downloader.trouble(u'ERROR: Invalid JSON in API response: ' + compat_str(err)) + return + + if not 'feed' in response or not 'entry' in response['feed']: + self._downloader.trouble(u'ERROR: Got a malformed response from YouTube API') + return + videos += [ (entry['yt$position']['$t'], entry['content']['src']) + for entry in response['feed']['entry'] + if 'content' in entry ] + + if len(response['feed']['entry']) < self._MAX_RESULTS: + break + page_num += 1 + + videos = [v[1] for v in sorted(videos)] + total = len(videos) + + playliststart = self._downloader.params.get('playliststart', 1) - 1 + playlistend = self._downloader.params.get('playlistend', -1) + if playlistend == -1: + videos = videos[playliststart:] + else: + videos = videos[playliststart:playlistend] + + if len(videos) == total: + self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total)) + else: + self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos))) + + for video in videos: + self._downloader.download([video]) + return + + +class YoutubeChannelIE(InfoExtractor): + """Information Extractor for YouTube channels.""" + + _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$" + _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en' + _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}" + IE_NAME = u'youtube:channel' + + def report_download_page(self, channel_id, pagenum): + """Report attempt to download channel page with given number.""" + self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum)) def _real_extract(self, url): - """Real extraction process. Redefine in subclasses.""" - pass + # Extract channel id + mobj = re.match(self._VALID_URL, url) + if mobj is None: + self._downloader.trouble(u'ERROR: invalid url: %s' % url) + return - @property - def IE_NAME(self): - return type(self).__name__[:-2] + # Download channel pages + channel_id = mobj.group(1) + video_ids = [] + pagenum = 1 -class YoutubeIE(InfoExtractor): - """Information extractor for youtube.com.""" - - _VALID_URL = r"""^ - ( - (?:https?://)? # http(s):// (optional) - (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/| - tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains - (?:.*?\#/)? # handle anchor (#/) redirect urls - (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs - (?: # the various things that can precede the ID: - (?:(?:v|embed|e)/) # v/ or embed/ or e/ - |(?: # or the v= param in all its forms - (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) - (?:\?|\#!?) # the params delimiter ? or # or #! - (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx) - v= - ) - )? # optional -> youtube.com/xxxx is OK - )? # all until now is optional -> you can pass the naked ID - ([0-9A-Za-z_-]+) # here is it! the YouTube video ID - (?(1).+)? # if we found the ID, everything can follow - $""" - _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' - _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en' - _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' - _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' - _NETRC_MACHINE = 'youtube' - # Listed in order of quality - _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13'] - _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13'] - _video_extensions = { - '13': '3gp', - '17': 'mp4', - '18': 'mp4', - '22': 'mp4', - '37': 'mp4', - '38': 'video', # You actually don't know if this will be MOV, AVI or whatever - '43': 'webm', - '44': 'webm', - '45': 'webm', - '46': 'webm', - } - _video_dimensions = { - '5': '240x400', - '6': '???', - '13': '???', - '17': '144x176', - '18': '360x640', - '22': '720x1280', - '34': '360x640', - '35': '480x854', - '37': '1080x1920', - '38': '3072x4096', - '43': '360x640', - '44': '480x854', - '45': '720x1280', - '46': '1080x1920', - } - IE_NAME = u'youtube' - - def suitable(self, url): - """Receives a URL and returns True if suitable for this IE.""" - return re.match(self._VALID_URL, url, re.VERBOSE) is not None - - def report_lang(self): - """Report attempt to set language.""" - self._downloader.to_screen(u'[youtube] Setting language') - - def report_login(self): - """Report attempt to log in.""" - self._downloader.to_screen(u'[youtube] Logging in') - - def report_age_confirmation(self): - """Report attempt to confirm age.""" - self._downloader.to_screen(u'[youtube] Confirming age') - - def report_video_webpage_download(self, video_id): - """Report attempt to download video webpage.""" - self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id) - - def report_video_info_webpage_download(self, video_id): - """Report attempt to download video info webpage.""" - self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id) - - def report_video_subtitles_download(self, video_id): - """Report attempt to download video info webpage.""" - self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id) - - def report_information_extraction(self, video_id): - """Report attempt to extract video information.""" - self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id) - - def report_unavailable_format(self, video_id, format): - """Report extracted video URL.""" - self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format)) - - def report_rtmp_download(self): - """Indicate the download will use the RTMP protocol.""" - self._downloader.to_screen(u'[youtube] RTMP download detected') - - def _closed_captions_xml_to_srt(self, xml_string): - srt = '' - texts = re.findall(r'([^<]+)', xml_string, re.MULTILINE) - # TODO parse xml instead of regex - for n, (start, dur_tag, dur, caption) in enumerate(texts): - if not dur: dur = '4' - start = float(start) - end = start + float(dur) - start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000) - end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000) - caption = unescapeHTML(caption) - caption = unescapeHTML(caption) # double cycle, intentional - srt += str(n+1) + '\n' - srt += start + ' --> ' + end + '\n' - srt += caption + '\n\n' - return srt - - def _extract_subtitles(self, video_id): - self.report_video_subtitles_download(video_id) - request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) - try: - srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None) - srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list) - srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list) - if not srt_lang_list: - return (u'WARNING: video has no closed captions', None) - if self._downloader.params.get('subtitleslang', False): - srt_lang = self._downloader.params.get('subtitleslang') - elif 'en' in srt_lang_list: - srt_lang = 'en' - else: - srt_lang = list(srt_lang_list.keys())[0] - if not srt_lang in srt_lang_list: - return (u'WARNING: no closed captions found in the specified language', None) - request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id)) - try: - srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None) - if not srt_xml: - return (u'WARNING: unable to download video subtitles', None) - return (None, self._closed_captions_xml_to_srt(srt_xml)) - - def _print_formats(self, formats): - print('Available formats:') - for x in formats: - print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))) - - def _real_initialize(self): - if self._downloader is None: - return - - username = None - password = None - downloader_params = self._downloader.params - - # Attempt to use provided username and password or .netrc data - if downloader_params.get('username', None) is not None: - username = downloader_params['username'] - password = downloader_params['password'] - elif downloader_params.get('usenetrc', False): - try: - info = netrc.netrc().authenticators(self._NETRC_MACHINE) - if info is not None: - username = info[0] - password = info[2] - else: - raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) - except (IOError, netrc.NetrcParseError) as err: - self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err)) - return - - # Set language - request = compat_urllib_request.Request(self._LANG_URL) - try: - self.report_lang() - compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err)) - return - - # No authentication to be performed - if username is None: - return - - # Log in - login_form = { - 'current_form': 'loginForm', - 'next': '/', - 'action_login': 'Log In', - 'username': username, - 'password': password, - } - request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form)) - try: - self.report_login() - login_results = compat_urllib_request.urlopen(request).read().decode('utf-8') - if re.search(r'(?i)]* name="loginForm"', login_results) is not None: - self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password') - return - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err)) - return - - # Confirm age - age_form = { - 'next_url': '/', - 'action_confirm': 'Confirm', - } - request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form)) - try: - self.report_age_confirmation() - age_results = compat_urllib_request.urlopen(request).read().decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err)) - return - - def _extract_id(self, url): - mobj = re.match(self._VALID_URL, url, re.VERBOSE) - if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) - return - video_id = mobj.group(2) - return video_id - - def _real_extract(self, url): - # Extract original video URL from URL with redirection, like age verification, using next_url parameter - mobj = re.search(self._NEXT_URL_RE, url) - if mobj: - url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/') - video_id = self._extract_id(url) - - # Get video webpage - self.report_video_webpage_download(video_id) - url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id - request = compat_urllib_request.Request(url) - try: - video_webpage_bytes = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err)) - return - - video_webpage = video_webpage_bytes.decode('utf-8', 'ignore') - - # Attempt to extract SWF player URL - mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) - if mobj is not None: - player_url = re.sub(r'\\(.)', r'\1', mobj.group(1)) - else: - player_url = None - - # Get video info - self.report_video_info_webpage_download(video_id) - for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']: - video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' - % (video_id, el_type)) - request = compat_urllib_request.Request(video_info_url) - try: - video_info_webpage_bytes = compat_urllib_request.urlopen(request).read() - video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore') - video_info = compat_parse_qs(video_info_webpage) - if 'token' in video_info: - break - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err)) - return - if 'token' not in video_info: - if 'reason' in video_info: - self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0]) - else: - self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason') - return - - # Check for "rental" videos - if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: - self._downloader.trouble(u'ERROR: "rental" videos not supported') - return - - # Start extracting information - self.report_information_extraction(video_id) - - # uploader - if 'author' not in video_info: - self._downloader.trouble(u'ERROR: unable to extract uploader name') - return - video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0]) - - # uploader_id - video_uploader_id = None - mobj = re.search(r'', video_webpage) - if mobj is not None: - video_uploader_id = mobj.group(1) - else: - self._downloader.trouble(u'WARNING: unable to extract uploader nickname') - - # title - if 'title' not in video_info: - self._downloader.trouble(u'ERROR: unable to extract video title') - return - video_title = compat_urllib_parse.unquote_plus(video_info['title'][0]) - - # thumbnail image - if 'thumbnail_url' not in video_info: - self._downloader.trouble(u'WARNING: unable to extract video thumbnail') - video_thumbnail = '' - else: # don't panic if we can't find it - video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0]) - - # upload date - upload_date = None - mobj = re.search(r'id="eow-date.*?>(.*?)', video_webpage, re.DOTALL) - if mobj is not None: - upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) - format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y'] - for expression in format_expressions: - try: - upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d') - except: - pass - - # description - video_description = get_element_by_id("eow-description", video_webpage) - if video_description: - video_description = clean_html(video_description) - else: - video_description = '' - - # closed captions - video_subtitles = None - if self._downloader.params.get('writesubtitles', False): - (srt_error, video_subtitles) = self._extract_subtitles(video_id) - if srt_error: - self._downloader.trouble(srt_error) - - if 'length_seconds' not in video_info: - self._downloader.trouble(u'WARNING: unable to extract video duration') - video_duration = '' - else: - video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]) - - # token - video_token = compat_urllib_parse.unquote_plus(video_info['token'][0]) - - # Decide which formats to download - req_format = self._downloader.params.get('format', None) - - if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): - self.report_rtmp_download() - video_url_list = [(None, video_info['conn'][0])] - elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1: - url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',') - url_data = [compat_parse_qs(uds) for uds in url_data_strs] - url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud] - url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data) - - format_limit = self._downloader.params.get('format_limit', None) - available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats - if format_limit is not None and format_limit in available_formats: - format_list = available_formats[available_formats.index(format_limit):] - else: - format_list = available_formats - existing_formats = [x for x in format_list if x in url_map] - if len(existing_formats) == 0: - self._downloader.trouble(u'ERROR: no known formats available for video') - return - if self._downloader.params.get('listformats', None): - self._print_formats(existing_formats) - return - if req_format is None or req_format == 'best': - video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality - elif req_format == 'worst': - video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality - elif req_format in ('-1', 'all'): - video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats - else: - # Specific formats. We pick the first in a slash-delimeted sequence. - # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'. - req_formats = req_format.split('/') - video_url_list = None - for rf in req_formats: - if rf in url_map: - video_url_list = [(rf, url_map[rf])] - break - if video_url_list is None: - self._downloader.trouble(u'ERROR: requested format not available') - return - else: - self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info') - return - - results = [] - for format_param, video_real_url in video_url_list: - # Extension - video_extension = self._video_extensions.get(format_param, 'flv') - - video_format = '{0} - {1}'.format(format_param if format_param else video_extension, - self._video_dimensions.get(format_param, '???')) - - results.append({ - 'id': video_id, - 'url': video_real_url, - 'uploader': video_uploader, - 'uploader_id': video_uploader_id, - 'upload_date': upload_date, - 'title': video_title, - 'ext': video_extension, - 'format': video_format, - 'thumbnail': video_thumbnail, - 'description': video_description, - 'player_url': player_url, - 'subtitles': video_subtitles, - 'duration': video_duration - }) - return results - - -class MetacafeIE(InfoExtractor): - """Information Extractor for metacafe.com.""" - - _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*' - _DISCLAIMER = 'http://www.metacafe.com/family_filter/' - _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user' - IE_NAME = u'metacafe' - - def __init__(self, downloader=None): - InfoExtractor.__init__(self, downloader) - - def report_disclaimer(self): - """Report disclaimer retrieval.""" - self._downloader.to_screen(u'[metacafe] Retrieving disclaimer') - - def report_age_confirmation(self): - """Report attempt to confirm age.""" - self._downloader.to_screen(u'[metacafe] Confirming age') - - def report_download_webpage(self, video_id): - """Report webpage download.""" - self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id) - - def report_extraction(self, video_id): - """Report information extraction.""" - self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id) - - def _real_initialize(self): - # Retrieve disclaimer - request = compat_urllib_request.Request(self._DISCLAIMER) - try: - self.report_disclaimer() - disclaimer = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err)) - return - - # Confirm age - disclaimer_form = { - 'filters': '0', - 'submit': "Continue - I'm over 18", - } - request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form)) - try: - self.report_age_confirmation() - disclaimer = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err)) - return - - def _real_extract(self, url): - # Extract id and simplified title from URL - mobj = re.match(self._VALID_URL, url) - if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) - return - - video_id = mobj.group(1) - - # Check if video comes from YouTube - mobj2 = re.match(r'^yt-(.*)$', video_id) - if mobj2 is not None: - self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)]) - return - - # Retrieve video webpage to extract further information - request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id) - try: - self.report_download_webpage(video_id) - webpage = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err)) - return - - # Extract URL, uploader and title from webpage - self.report_extraction(video_id) - mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage) - if mobj is not None: - mediaURL = compat_urllib_parse.unquote(mobj.group(1)) - video_extension = mediaURL[-3:] - - # Extract gdaKey if available - mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage) - if mobj is None: - video_url = mediaURL - else: - gdaKey = mobj.group(1) - video_url = '%s?__gda__=%s' % (mediaURL, gdaKey) - else: - mobj = re.search(r' name="flashvars" value="(.*?)"', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract media URL') - return - vardict = compat_parse_qs(mobj.group(1)) - if 'mediaData' not in vardict: - self._downloader.trouble(u'ERROR: unable to extract media URL') - return - mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0]) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract media URL') - return - mediaURL = mobj.group(1).replace('\\/', '/') - video_extension = mediaURL[-3:] - video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2)) - - mobj = re.search(r'(?im)(.*) - Video', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract title') - return - video_title = mobj.group(1).decode('utf-8') - - mobj = re.search(r'submitter=(.*?);', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract uploader nickname') - return - video_uploader = mobj.group(1) - - return [{ - 'id': video_id.decode('utf-8'), - 'url': video_url.decode('utf-8'), - 'uploader': video_uploader.decode('utf-8'), - 'upload_date': None, - 'title': video_title, - 'ext': video_extension.decode('utf-8'), - }] - - -class DailymotionIE(InfoExtractor): - """Information Extractor for Dailymotion""" - - _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)' - IE_NAME = u'dailymotion' - - def __init__(self, downloader=None): - InfoExtractor.__init__(self, downloader) - - def report_download_webpage(self, video_id): - """Report webpage download.""" - self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id) - - def report_extraction(self, video_id): - """Report information extraction.""" - self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id) - - def _real_extract(self, url): - # Extract id and simplified title from URL - mobj = re.match(self._VALID_URL, url) - if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) - return - - video_id = mobj.group(1).split('_')[0].split('?')[0] - - video_extension = 'mp4' - - # Retrieve video webpage to extract further information - request = compat_urllib_request.Request(url) - request.add_header('Cookie', 'family_filter=off') - try: - self.report_download_webpage(video_id) - webpage_bytes = compat_urllib_request.urlopen(request).read() - webpage = webpage_bytes.decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err)) - return - - # Extract URL, uploader and title from webpage - self.report_extraction(video_id) - mobj = re.search(r'\s*var flashvars = (.*)', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract media URL') - return - flashvars = compat_urllib_parse.unquote(mobj.group(1)) - - for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']: - if key in flashvars: - max_quality = key - self._downloader.to_screen(u'[dailymotion] Using %s' % key) - break - else: - self._downloader.trouble(u'ERROR: unable to extract video URL') - return - - mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video URL') - return - - video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/') - - # TODO: support choosing qualities - - mobj = re.search(r'', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract title') - return - video_title = unescapeHTML(mobj.group('title')) - - video_uploader = None - mobj = re.search(r'(?im)[^<]+?]+?>([^<]+?)', webpage) - if mobj is None: - # lookin for official user - mobj_official = re.search(r'', webpage) - if mobj_official is None: - self._downloader.trouble(u'WARNING: unable to extract uploader nickname') - else: - video_uploader = mobj_official.group(1) - else: - video_uploader = mobj.group(1) - - video_upload_date = None - mobj = re.search(r'
([0-9]{2})-([0-9]{2})-([0-9]{4})
', webpage) - if mobj is not None: - video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1) - - return [{ - 'id': video_id, - 'url': video_url, - 'uploader': video_uploader, - 'upload_date': video_upload_date, - 'title': video_title, - 'ext': video_extension, - }] - - -class PhotobucketIE(InfoExtractor): - """Information extractor for photobucket.com.""" - - _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)' - IE_NAME = u'photobucket' - - def __init__(self, downloader=None): - InfoExtractor.__init__(self, downloader) - - def report_download_webpage(self, video_id): - """Report webpage download.""" - self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id) - - def report_extraction(self, video_id): - """Report information extraction.""" - self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id) - - def _real_extract(self, url): - # Extract id from URL - mobj = re.match(self._VALID_URL, url) - if mobj is None: - self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) - return - - video_id = mobj.group(1) - - video_extension = 'flv' - - # Retrieve video webpage to extract further information - request = compat_urllib_request.Request(url) - try: - self.report_download_webpage(video_id) - webpage = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) - return - - # Extract URL, uploader, and title from webpage - self.report_extraction(video_id) - mobj = re.search(r'', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract media URL') - return - mediaURL = compat_urllib_parse.unquote(mobj.group(1)) - - video_url = mediaURL - - mobj = re.search(r'(.*) video by (.*) - Photobucket', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract title') - return - video_title = mobj.group(1).decode('utf-8') - - video_uploader = mobj.group(2).decode('utf-8') - - return [{ - 'id': video_id.decode('utf-8'), - 'url': video_url.decode('utf-8'), - 'uploader': video_uploader, - 'upload_date': None, - 'title': video_title, - 'ext': video_extension.decode('utf-8'), - }] - - -class YahooIE(InfoExtractor): - """Information extractor for video.yahoo.com.""" - - _WORKING = False - # _VALID_URL matches all Yahoo! Video URLs - # _VPAGE_URL matches only the extractable '/watch/' URLs - _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?' - _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?' - IE_NAME = u'video.yahoo' - - def __init__(self, downloader=None): - InfoExtractor.__init__(self, downloader) - - def report_download_webpage(self, video_id): - """Report webpage download.""" - self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id) - - def report_extraction(self, video_id): - """Report information extraction.""" - self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id) - - def _real_extract(self, url, new_video=True): - # Extract ID from URL - mobj = re.match(self._VALID_URL, url) - if mobj is None: - self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) - return - - video_id = mobj.group(2) - video_extension = 'flv' - - # Rewrite valid but non-extractable URLs as - # extractable English language /watch/ URLs - if re.match(self._VPAGE_URL, url) is None: - request = compat_urllib_request.Request(url) - try: - webpage = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) - return - - mobj = re.search(r'\("id", "([0-9]+)"\);', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: Unable to extract id field') - return - yahoo_id = mobj.group(1) - - mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: Unable to extract vid field') - return - yahoo_vid = mobj.group(1) - - url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id) - return self._real_extract(url, new_video=False) - - # Retrieve video webpage to extract further information - request = compat_urllib_request.Request(url) - try: - self.report_download_webpage(video_id) - webpage = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) - return - - # Extract uploader and title from webpage - self.report_extraction(video_id) - mobj = re.search(r'', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video title') - return - video_title = mobj.group(1).decode('utf-8') - - mobj = re.search(r'

(.*)

', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video uploader') - return - video_uploader = mobj.group(1).decode('utf-8') - - # Extract video thumbnail - mobj = re.search(r'', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video thumbnail') - return - video_thumbnail = mobj.group(1).decode('utf-8') - - # Extract video description - mobj = re.search(r'', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video description') - return - video_description = mobj.group(1).decode('utf-8') - if not video_description: - video_description = 'No description available.' - - # Extract video height and width - mobj = re.search(r'', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video height') - return - yv_video_height = mobj.group(1) - - mobj = re.search(r'', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video width') - return - yv_video_width = mobj.group(1) - - # Retrieve video playlist to extract media URL - # I'm not completely sure what all these options are, but we - # seem to need most of them, otherwise the server sends a 401. - yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents - yv_bitrate = '700' # according to Wikipedia this is hard-coded - request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id + - '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height + - '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797') - try: - self.report_download_webpage(video_id) - webpage = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) - return - - # Extract media URL from playlist XML - mobj = re.search(r' 0: - video_quality = files[quality][0][2] - video_codec = files[quality][0][0] - video_extension = files[quality][0][1] - self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality)) - break - else: - self._downloader.trouble(u'ERROR: no known codec found') - return - - video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \ - %(video_id, sig, timestamp, video_quality, video_codec.upper()) - - return [{ - 'id': video_id, - 'url': video_url, - 'uploader': video_uploader, - 'uploader_id': video_uploader_id, - 'upload_date': video_upload_date, - 'title': video_title, - 'ext': video_extension, - 'thumbnail': video_thumbnail, - 'description': video_description, - }] - - -class ArteTvIE(InfoExtractor): - """arte.tv information extractor.""" - - _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*' - _LIVE_URL = r'index-[0-9]+\.html$' - - IE_NAME = u'arte.tv' - - def __init__(self, downloader=None): - InfoExtractor.__init__(self, downloader) - - def report_download_webpage(self, video_id): - """Report webpage download.""" - self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id) - - def report_extraction(self, video_id): - """Report information extraction.""" - self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id) - - def fetch_webpage(self, url): - self._downloader.incre#!/usr/bin/env python -# -*- coding: utf-8 -*- - -from __future__ import absolute_import - -import datetime -import netrc -import os -import re -import socket -import time -import email.utils -import xml.etree.ElementTree -import random -import math - -from .utils import * - - -class InfoExtractor(object): - """Information Extractor class. - - Information extractors are the classes that, given a URL, extract - information about the video (or videos) the URL refers to. This - information includes the real video URL, the video title, author and - others. The information is stored in a dictionary which is then - passed to the FileDownloader. The FileDownloader processes this - information possibly downloading the video to the file system, among - other possible outcomes. - - The dictionaries must include the following fields: - - id: Video identifier. - url: Final video URL. - title: Video title, unescaped. - ext: Video filename extension. - uploader: Full name of the video uploader. - upload_date: Video upload date (YYYYMMDD). - - The following fields are optional: - - format: The video format, defaults to ext (used for --get-format) - thumbnail: Full URL to a video thumbnail image. - description: One-line video description. - uploader_id: Nickname or id of the video uploader. - player_url: SWF Player URL (used for rtmpdump). - subtitles: The .srt file contents. - urlhandle: [internal] The urlHandle to be used to download the file, - like returned by urllib.request.urlopen - - The fields should all be Unicode strings. - - Subclasses of this one should re-define the _real_initialize() and - _real_extract() methods and define a _VALID_URL regexp. - Probably, they should also be added to the list of extractors. - - _real_extract() must return a *list* of information dictionaries as - described above. - - Finally, the _WORKING attribute should be set to False for broken IEs - in order to warn the users and skip the tests. - """ - - _ready = False - _downloader = None - _WORKING = True - - def __init__(self, downloader=None): - """Constructor. Receives an optional downloader.""" - self._ready = False - self.set_downloader(downloader) - - def suitable(self, url): - """Receives a URL and returns True if suitable for this IE.""" - return re.match(self._VALID_URL, url) is not None - - def working(self): - """Getter method for _WORKING.""" - return self._WORKING - - def initialize(self): - """Initializes an instance (authentication, etc).""" - if not self._ready: - self._real_initialize() - self._ready = True - - def extract(self, url): - """Extracts URL information and returns it in list of dicts.""" - self.initialize() - return self._real_extract(url) - - def set_downloader(self, downloader): - """Sets the downloader for this IE.""" - self._downloader = downloader - - def _real_initialize(self): - """Real initialization process. Redefine in subclasses.""" - pass - - def _real_extract(self, url): - """Real extraction process. Redefine in subclasses.""" - pass - - @property - def IE_NAME(self): - return type(self).__name__[:-2] - -class YoutubeIE(InfoExtractor): - """Information extractor for youtube.com.""" - - _VALID_URL = r"""^ - ( - (?:https?://)? # http(s):// (optional) - (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/| - tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains - (?:.*?\#/)? # handle anchor (#/) redirect urls - (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs - (?: # the various things that can precede the ID: - (?:(?:v|embed|e)/) # v/ or embed/ or e/ - |(?: # or the v= param in all its forms - (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) - (?:\?|\#!?) # the params delimiter ? or # or #! - (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx) - v= - ) - )? # optional -> youtube.com/xxxx is OK - )? # all until now is optional -> you can pass the naked ID - ([0-9A-Za-z_-]+) # here is it! the YouTube video ID - (?(1).+)? # if we found the ID, everything can follow - $""" - _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' - _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en' - _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' - _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' - _NETRC_MACHINE = 'youtube' - # Listed in order of quality - _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13'] - _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13'] - _video_extensions = { - '13': '3gp', - '17': 'mp4', - '18': 'mp4', - '22': 'mp4', - '37': 'mp4', - '38': 'video', # You actually don't know if this will be MOV, AVI or whatever - '43': 'webm', - '44': 'webm', - '45': 'webm', - '46': 'webm', - } - _video_dimensions = { - '5': '240x400', - '6': '???', - '13': '???', - '17': '144x176', - '18': '360x640', - '22': '720x1280', - '34': '360x640', - '35': '480x854', - '37': '1080x1920', - '38': '3072x4096', - '43': '360x640', - '44': '480x854', - '45': '720x1280', - '46': '1080x1920', - } - IE_NAME = u'youtube' - - def suitable(self, url): - """Receives a URL and returns True if suitable for this IE.""" - return re.match(self._VALID_URL, url, re.VERBOSE) is not None - - def report_lang(self): - """Report attempt to set language.""" - self._downloader.to_screen(u'[youtube] Setting language') - - def report_login(self): - """Report attempt to log in.""" - self._downloader.to_screen(u'[youtube] Logging in') - - def report_age_confirmation(self): - """Report attempt to confirm age.""" - self._downloader.to_screen(u'[youtube] Confirming age') - - def report_video_webpage_download(self, video_id): - """Report attempt to download video webpage.""" - self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id) - - def report_video_info_webpage_download(self, video_id): - """Report attempt to download video info webpage.""" - self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id) - - def report_video_subtitles_download(self, video_id): - """Report attempt to download video info webpage.""" - self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id) - - def report_information_extraction(self, video_id): - """Report attempt to extract video information.""" - self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id) - - def report_unavailable_format(self, video_id, format): - """Report extracted video URL.""" - self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format)) - - def report_rtmp_download(self): - """Indicate the download will use the RTMP protocol.""" - self._downloader.to_screen(u'[youtube] RTMP download detected') - - def _closed_captions_xml_to_srt(self, xml_string): - srt = '' - texts = re.findall(r'([^<]+)', xml_string, re.MULTILINE) - # TODO parse xml instead of regex - for n, (start, dur_tag, dur, caption) in enumerate(texts): - if not dur: dur = '4' - start = float(start) - end = start + float(dur) - start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000) - end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000) - caption = unescapeHTML(caption) - caption = unescapeHTML(caption) # double cycle, intentional - srt += str(n+1) + '\n' - srt += start + ' --> ' + end + '\n' - srt += caption + '\n\n' - return srt - - def _extract_subtitles(self, video_id): - self.report_video_subtitles_download(video_id) - request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) - try: - srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None) - srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list) - srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list) - if not srt_lang_list: - return (u'WARNING: video has no closed captions', None) - if self._downloader.params.get('subtitleslang', False): - srt_lang = self._downloader.params.get('subtitleslang') - elif 'en' in srt_lang_list: - srt_lang = 'en' - else: - srt_lang = list(srt_lang_list.keys())[0] - if not srt_lang in srt_lang_list: - return (u'WARNING: no closed captions found in the specified language', None) - request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id)) - try: - srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None) - if not srt_xml: - return (u'WARNING: unable to download video subtitles', None) - return (None, self._closed_captions_xml_to_srt(srt_xml)) - - def _print_formats(self, formats): - print('Available formats:') - for x in formats: - print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))) - - def _real_initialize(self): - if self._downloader is None: - return - - username = None - password = None - downloader_params = self._downloader.params - - # Attempt to use provided username and password or .netrc data - if downloader_params.get('username', None) is not None: - username = downloader_params['username'] - password = downloader_params['password'] - elif downloader_params.get('usenetrc', False): - try: - info = netrc.netrc().authenticators(self._NETRC_MACHINE) - if info is not None: - username = info[0] - password = info[2] - else: - raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) - except (IOError, netrc.NetrcParseError) as err: - self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err)) - return - - # Set language - request = compat_urllib_request.Request(self._LANG_URL) - try: - self.report_lang() - compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err)) - return - - # No authentication to be performed - if username is None: - return - - # Log in - login_form = { - 'current_form': 'loginForm', - 'next': '/', - 'action_login': 'Log In', - 'username': username, - 'password': password, - } - request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form)) - try: - self.report_login() - login_results = compat_urllib_request.urlopen(request).read().decode('utf-8') - if re.search(r'(?i)]* name="loginForm"', login_results) is not None: - self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password') - return - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err)) - return - - # Confirm age - age_form = { - 'next_url': '/', - 'action_confirm': 'Confirm', - } - request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form)) - try: - self.report_age_confirmation() - age_results = compat_urllib_request.urlopen(request).read().decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err)) - return - - def _extract_id(self, url): - mobj = re.match(self._VALID_URL, url, re.VERBOSE) - if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) - return - video_id = mobj.group(2) - return video_id - - def _real_extract(self, url): - # Extract original video URL from URL with redirection, like age verification, using next_url parameter - mobj = re.search(self._NEXT_URL_RE, url) - if mobj: - url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/') - video_id = self._extract_id(url) - - # Get video webpage - self.report_video_webpage_download(video_id) - url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id - request = compat_urllib_request.Request(url) - try: - video_webpage_bytes = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err)) - return - - video_webpage = video_webpage_bytes.decode('utf-8', 'ignore') - - # Attempt to extract SWF player URL - mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) - if mobj is not None: - player_url = re.sub(r'\\(.)', r'\1', mobj.group(1)) - else: - player_url = None - - # Get video info - self.report_video_info_webpage_download(video_id) - for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']: - video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' - % (video_id, el_type)) - request = compat_urllib_request.Request(video_info_url) - try: - video_info_webpage_bytes = compat_urllib_request.urlopen(request).read() - video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore') - video_info = compat_parse_qs(video_info_webpage) - if 'token' in video_info: - break - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err)) - return - if 'token' not in video_info: - if 'reason' in video_info: - self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0]) - else: - self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason') - return - - # Check for "rental" videos - if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: - self._downloader.trouble(u'ERROR: "rental" videos not supported') - return - - # Start extracting information - self.report_information_extraction(video_id) - - # uploader - if 'author' not in video_info: - self._downloader.trouble(u'ERROR: unable to extract uploader name') - return - video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0]) - - # uploader_id - video_uploader_id = None - mobj = re.search(r'', video_webpage) - if mobj is not None: - video_uploader_id = mobj.group(1) - else: - self._downloader.trouble(u'WARNING: unable to extract uploader nickname') - - # title - if 'title' not in video_info: - self._downloader.trouble(u'ERROR: unable to extract video title') - return - video_title = compat_urllib_parse.unquote_plus(video_info['title'][0]) - - # thumbnail image - if 'thumbnail_url' not in video_info: - self._downloader.trouble(u'WARNING: unable to extract video thumbnail') - video_thumbnail = '' - else: # don't panic if we can't find it - video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0]) - - # upload date - upload_date = None - mobj = re.search(r'id="eow-date.*?>(.*?)
', video_webpage, re.DOTALL) - if mobj is not None: - upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) - format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y'] - for expression in format_expressions: - try: - upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d') - except: - pass - - # description - video_description = get_element_by_id("eow-description", video_webpage) - if video_description: - video_description = clean_html(video_description) - else: - video_description = '' - - # closed captions - video_subtitles = None - if self._downloader.params.get('writesubtitles', False): - (srt_error, video_subtitles) = self._extract_subtitles(video_id) - if srt_error: - self._downloader.trouble(srt_error) - - if 'length_seconds' not in video_info: - self._downloader.trouble(u'WARNING: unable to extract video duration') - video_duration = '' - else: - video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]) - - # token - video_token = compat_urllib_parse.unquote_plus(video_info['token'][0]) - - # Decide which formats to download - req_format = self._downloader.params.get('format', None) - - if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): - self.report_rtmp_download() - video_url_list = [(None, video_info['conn'][0])] - elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1: - url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',') - url_data = [compat_parse_qs(uds) for uds in url_data_strs] - url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud] - url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data) - - format_limit = self._downloader.params.get('format_limit', None) - available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats - if format_limit is not None and format_limit in available_formats: - format_list = available_formats[available_formats.index(format_limit):] - else: - format_list = available_formats - existing_formats = [x for x in format_list if x in url_map] - if len(existing_formats) == 0: - self._downloader.trouble(u'ERROR: no known formats available for video') - return - if self._downloader.params.get('listformats', None): - self._print_formats(existing_formats) - return - if req_format is None or req_format == 'best': - video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality - elif req_format == 'worst': - video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality - elif req_format in ('-1', 'all'): - video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats - else: - # Specific formats. We pick the first in a slash-delimeted sequence. - # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'. - req_formats = req_format.split('/') - video_url_list = None - for rf in req_formats: - if rf in url_map: - video_url_list = [(rf, url_map[rf])] - break - if video_url_list is None: - self._downloader.trouble(u'ERROR: requested format not available') - return - else: - self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info') - return - - results = [] - for format_param, video_real_url in video_url_list: - # Extension - video_extension = self._video_extensions.get(format_param, 'flv') - - video_format = '{0} - {1}'.format(format_param if format_param else video_extension, - self._video_dimensions.get(format_param, '???')) - - results.append({ - 'id': video_id, - 'url': video_real_url, - 'uploader': video_uploader, - 'uploader_id': video_uploader_id, - 'upload_date': upload_date, - 'title': video_title, - 'ext': video_extension, - 'format': video_format, - 'thumbnail': video_thumbnail, - 'description': video_description, - 'player_url': player_url, - 'subtitles': video_subtitles, - 'duration': video_duration - }) - return results - - -class MetacafeIE(InfoExtractor): - """Information Extractor for metacafe.com.""" - - _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*' - _DISCLAIMER = 'http://www.metacafe.com/family_filter/' - _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user' - IE_NAME = u'metacafe' - - def __init__(self, downloader=None): - InfoExtractor.__init__(self, downloader) - - def report_disclaimer(self): - """Report disclaimer retrieval.""" - self._downloader.to_screen(u'[metacafe] Retrieving disclaimer') - - def report_age_confirmation(self): - """Report attempt to confirm age.""" - self._downloader.to_screen(u'[metacafe] Confirming age') - - def report_download_webpage(self, video_id): - """Report webpage download.""" - self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id) - - def report_extraction(self, video_id): - """Report information extraction.""" - self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id) - - def _real_initialize(self): - # Retrieve disclaimer - request = compat_urllib_request.Request(self._DISCLAIMER) - try: - self.report_disclaimer() - disclaimer = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err)) - return - - # Confirm age - disclaimer_form = { - 'filters': '0', - 'submit': "Continue - I'm over 18", - } - request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form)) - try: - self.report_age_confirmation() - disclaimer = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err)) - return - - def _real_extract(self, url): - # Extract id and simplified title from URL - mobj = re.match(self._VALID_URL, url) - if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) - return - - video_id = mobj.group(1) - - # Check if video comes from YouTube - mobj2 = re.match(r'^yt-(.*)$', video_id) - if mobj2 is not None: - self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)]) - return - - # Retrieve video webpage to extract further information - request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id) - try: - self.report_download_webpage(video_id) - webpage = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err)) - return - - # Extract URL, uploader and title from webpage - self.report_extraction(video_id) - mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage) - if mobj is not None: - mediaURL = compat_urllib_parse.unquote(mobj.group(1)) - video_extension = mediaURL[-3:] - - # Extract gdaKey if available - mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage) - if mobj is None: - video_url = mediaURL - else: - gdaKey = mobj.group(1) - video_url = '%s?__gda__=%s' % (mediaURL, gdaKey) - else: - mobj = re.search(r' name="flashvars" value="(.*?)"', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract media URL') - return - vardict = compat_parse_qs(mobj.group(1)) - if 'mediaData' not in vardict: - self._downloader.trouble(u'ERROR: unable to extract media URL') - return - mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0]) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract media URL') - return - mediaURL = mobj.group(1).replace('\\/', '/') - video_extension = mediaURL[-3:] - video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2)) - - mobj = re.search(r'(?im)(.*) - Video', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract title') - return - video_title = mobj.group(1).decode('utf-8') - - mobj = re.search(r'submitter=(.*?);', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract uploader nickname') - return - video_uploader = mobj.group(1) - - return [{ - 'id': video_id.decode('utf-8'), - 'url': video_url.decode('utf-8'), - 'uploader': video_uploader.decode('utf-8'), - 'upload_date': None, - 'title': video_title, - 'ext': video_extension.decode('utf-8'), - }] - - -class DailymotionIE(InfoExtractor): - """Information Extractor for Dailymotion""" - - _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)' - IE_NAME = u'dailymotion' - - def __init__(self, downloader=None): - InfoExtractor.__init__(self, downloader) - - def report_download_webpage(self, video_id): - """Report webpage download.""" - self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id) - - def report_extraction(self, video_id): - """Report information extraction.""" - self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id) - - def _real_extract(self, url): - # Extract id and simplified title from URL - mobj = re.match(self._VALID_URL, url) - if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) - return - - video_id = mobj.group(1).split('_')[0].split('?')[0] - - video_extension = 'mp4' - - # Retrieve video webpage to extract further information - request = compat_urllib_request.Request(url) - request.add_header('Cookie', 'family_filter=off') - try: - self.report_download_webpage(video_id) - webpage_bytes = compat_urllib_request.urlopen(request).read() - webpage = webpage_bytes.decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err)) - return - - # Extract URL, uploader and title from webpage - self.report_extraction(video_id) - mobj = re.search(r'\s*var flashvars = (.*)', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract media URL') - return - flashvars = compat_urllib_parse.unquote(mobj.group(1)) - - for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']: - if key in flashvars: - max_quality = key - self._downloader.to_screen(u'[dailymotion] Using %s' % key) - break - else: - self._downloader.trouble(u'ERROR: unable to extract video URL') - return - - mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video URL') - return - - video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/') - - # TODO: support choosing qualities - - mobj = re.search(r'', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract title') - return - video_title = unescapeHTML(mobj.group('title')) - - video_uploader = None - mobj = re.search(r'(?im)[^<]+?]+?>([^<]+?)', webpage) - if mobj is None: - # lookin for official user - mobj_official = re.search(r'', webpage) - if mobj_official is None: - self._downloader.trouble(u'WARNING: unable to extract uploader nickname') - else: - video_uploader = mobj_official.group(1) - else: - video_uploader = mobj.group(1) - - video_upload_date = None - mobj = re.search(r'
([0-9]{2})-([0-9]{2})-([0-9]{4})
', webpage) - if mobj is not None: - video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1) - - return [{ - 'id': video_id, - 'url': video_url, - 'uploader': video_uploader, - 'upload_date': video_upload_date, - 'title': video_title, - 'ext': video_extension, - }] - - -class PhotobucketIE(InfoExtractor): - """Information extractor for photobucket.com.""" - - _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)' - IE_NAME = u'photobucket' - - def __init__(self, downloader=None): - InfoExtractor.__init__(self, downloader) - - def report_download_webpage(self, video_id): - """Report webpage download.""" - self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id) - - def report_extraction(self, video_id): - """Report information extraction.""" - self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id) - - def _real_extract(self, url): - # Extract id from URL - mobj = re.match(self._VALID_URL, url) - if mobj is None: - self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) - return - - video_id = mobj.group(1) - - video_extension = 'flv' - - # Retrieve video webpage to extract further information - request = compat_urllib_request.Request(url) - try: - self.report_download_webpage(video_id) - webpage = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) - return - - # Extract URL, uploader, and title from webpage - self.report_extraction(video_id) - mobj = re.search(r'', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract media URL') - return - mediaURL = compat_urllib_parse.unquote(mobj.group(1)) - - video_url = mediaURL - - mobj = re.search(r'(.*) video by (.*) - Photobucket', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract title') - return - video_title = mobj.group(1).decode('utf-8') - - video_uploader = mobj.group(2).decode('utf-8') - - return [{ - 'id': video_id.decode('utf-8'), - 'url': video_url.decode('utf-8'), - 'uploader': video_uploader, - 'upload_date': None, - 'title': video_title, - 'ext': video_extension.decode('utf-8'), - }] - - -class YahooIE(InfoExtractor): - """Information extractor for video.yahoo.com.""" - - _WORKING = False - # _VALID_URL matches all Yahoo! Video URLs - # _VPAGE_URL matches only the extractable '/watch/' URLs - _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?' - _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?' - IE_NAME = u'video.yahoo' - - def __init__(self, downloader=None): - InfoExtractor.__init__(self, downloader) - - def report_download_webpage(self, video_id): - """Report webpage download.""" - self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id) - - def report_extraction(self, video_id): - """Report information extraction.""" - self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id) - - def _real_extract(self, url, new_video=True): - # Extract ID from URL - mobj = re.match(self._VALID_URL, url) - if mobj is None: - self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) - return - - video_id = mobj.group(2) - video_extension = 'flv' - - # Rewrite valid but non-extractable URLs as - # extractable English language /watch/ URLs - if re.match(self._VPAGE_URL, url) is None: - request = compat_urllib_request.Request(url) - try: - webpage = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) - return - - mobj = re.search(r'\("id", "([0-9]+)"\);', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: Unable to extract id field') - return - yahoo_id = mobj.group(1) - - mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: Unable to extract vid field') - return - yahoo_vid = mobj.group(1) - - url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id) - return self._real_extract(url, new_video=False) - - # Retrieve video webpage to extract further information - request = compat_urllib_request.Request(url) - try: - self.report_download_webpage(video_id) - webpage = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) - return - - # Extract uploader and title from webpage - self.report_extraction(video_id) - mobj = re.search(r'', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video title') - return - video_title = mobj.group(1).decode('utf-8') - - mobj = re.search(r'

(.*)

', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video uploader') - return - video_uploader = mobj.group(1).decode('utf-8') - - # Extract video thumbnail - mobj = re.search(r'', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video thumbnail') - return - video_thumbnail = mobj.group(1).decode('utf-8') - - # Extract video description - mobj = re.search(r'', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video description') - return - video_description = mobj.group(1).decode('utf-8') - if not video_description: - video_description = 'No description available.' - - # Extract video height and width - mobj = re.search(r'', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video height') - return - yv_video_height = mobj.group(1) - - mobj = re.search(r'', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video width') - return - yv_video_width = mobj.group(1) - - # Retrieve video playlist to extract media URL - # I'm not completely sure what all these options are, but we - # seem to need most of them, otherwise the server sends a 401. - yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents - yv_bitrate = '700' # according to Wikipedia this is hard-coded - request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id + - '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height + - '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797') - try: - self.report_download_webpage(video_id) - webpage = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) - return - - # Extract media URL from playlist XML - mobj = re.search(r' 0: - video_quality = files[quality][0][2] - video_codec = files[quality][0][0] - video_extension = files[quality][0][1] - self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality)) - break - else: - self._downloader.trouble(u'ERROR: no known codec found') - return - - video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \ - %(video_id, sig, timestamp, video_quality, video_codec.upper()) - - return [{ - 'id': video_id, - 'url': video_url, - 'uploader': video_uploader, - 'uploader_id': video_uploader_id, - 'upload_date': video_upload_date, - 'title': video_title, - 'ext': video_extension, - 'thumbnail': video_thumbnail, - 'description': video_description, - }] - - -class ArteTvIE(InfoExtractor): - """arte.tv information extractor.""" - - _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*' - _LIVE_URL = r'index-[0-9]+\.html$' - - IE_NAME = u'arte.tv' - - def __init__(self, downloader=None): - InfoExtractor.__init__(self, downloader) - - def report_download_webpage(self, video_id): - """Report webpage download.""" - self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id) - - def report_extraction(self, video_id): - """Report information extraction.""" - self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id) - - def fetch_webpage(self, url): - self._downloader.increment_downloads() - request = compat_urllib_request.Request(url) - try: - self.report_download_webpage(url) - webpage = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) - return - except ValueError as err: - self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) - return - return webpage - - def grep_webpage(self, url, regex, regexFlags, matchTuples): - page = self.fetch_webpage(url) - mobj = re.search(regex, page, regexFlags) - info = {} - - if mobj is None: - self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) - return - - for (i, key, err) in matchTuples: - if mobj.group(i) is None: - self._downloader.trouble(err) - return - else: - info[key] = mobj.group(i) - - return info - - def extractLiveStream(self, url): - video_lang = url.split('/')[-4] - info = self.grep_webpage( - url, - r'src="(.*?/videothek_js.*?\.js)', - 0, - [ - (1, 'url', u'ERROR: Invalid URL: %s' % url) - ] - ) - http_host = url.split('/')[2] - next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url'))) - info = self.grep_webpage( - next_url, - r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' + - '(http://.*?\.swf).*?' + - '(rtmp://.*?)\'', - re.DOTALL, - [ - (1, 'path', u'ERROR: could not extract video path: %s' % url), - (2, 'player', u'ERROR: could not extract video player: %s' % url), - (3, 'url', u'ERROR: could not extract video url: %s' % url) - ] - ) - video_url = u'%s/%s' % (info.get('url'), info.get('path')) - - def extractPlus7Stream(self, url): - video_lang = url.split('/')[-3] - info = self.grep_webpage( - url, - r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)', - 0, - [ - (1, 'url', u'ERROR: Invalid URL: %s' % url) - ] - ) - next_url = compat_urllib_parse.unquote(info.get('url')) - info = self.grep_webpage( - next_url, - r'
""" + mobj = re.search(_title, webpage_src) + if mobj is not None: + title = mobj.group(1) + thumbnail = None + + m = re.match(self._VALID_URL, url) + video_id = m.group('id') + + results = [{ + 'id': video_id, + 'url' : video_url, + 'title' : title, + 'thumbnail' : thumbnail, + 'ext' : ext, + }] + return results + +class RBMARadioIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P[^/]+)$' + + def _real_extract(self, url): + m = re.match(self._VALID_URL, url) + video_id = m.group('videoID') + + webpage = self._download_webpage(url, video_id) + m = re.search(r'', webpage) + if not m: + raise ExtractorError(u'Cannot find metadata') + json_data = m.group(1) + + try: + data = json.loads(json_data) + except ValueError as e: + raise ExtractorError(u'Invalid JSON: ' + str(e)) + + video_url = data['akamai_url'] + '&cbr=256' + url_parts = compat_urllib_parse_urlparse(video_url) + video_ext = url_parts.path.rpartition('.')[2] + info = { + 'id': video_id, + 'url': video_url, + 'ext': video_ext, + 'title': data['title'], + 'description': data.get('teaser_text'), + 'location': data.get('country_of_origin'), + 'uploader': data.get('host', {}).get('name'), + 'uploader_id': data.get('host', {}).get('slug'), + 'thumbnail': data.get('image', {}).get('large_url_2x'), + 'duration': data.get('duration'), + } + return [info] + + +class YouPornIE(InfoExtractor): + """Information extractor for youporn.com.""" + _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P[0-9]+)/(?P[^/]+)' + + def _print_formats(self, formats): + """Print all available formats""" + print(u'Available formats:') + print(u'ext\t\tformat') + print(u'---------------------------------') + for format in formats: + print(u'%s\t\t%s' % (format['ext'], format['format'])) + + def _specific(self, req_format, formats): + for x in formats: + if(x["format"]==req_format): + return x + return None def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -6315,49 +3770,436 @@ class TweetReelIE(InfoExtractor): self._downloader.trouble(u'ERROR: invalid URL: %s' % url) return - video_id = mobj.group('id') - self.report_extraction(video_id) - try: - urlh = compat_urllib_request.urlopen(url) - webpage_bytes = urlh.read() - webpage = webpage_bytes.decode('utf-8', 'ignore') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err)) + video_id = mobj.group('videoid') + + req = compat_urllib_request.Request(url) + req.add_header('Cookie', 'age_verified=1') + webpage = self._download_webpage(req, video_id) + + # Get the video title + result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage) + if result is None: + raise ExtractorError(u'Unable to extract video title') + video_title = result.group('title').strip() + + # Get the video date + result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage) + if result is None: + self._downloader.report_warning(u'unable to extract video date') + upload_date = None + else: + upload_date = result.group('date').strip() + + # Get the video uploader + result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage) + if result is None: + self._downloader.report_warning(u'unable to extract uploader') + video_uploader = None + else: + video_uploader = result.group('uploader').strip() + video_uploader = clean_html( video_uploader ) + + # Get all of the formats available + DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>' + result = re.search(DOWNLOAD_LIST_RE, webpage) + if result is None: + raise ExtractorError(u'Unable to extract download list') + download_list_html = result.group('download_list').strip() + + # Get all of the links from the page + LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">' + links = re.findall(LINK_RE, download_list_html) + if(len(links) == 0): + raise ExtractorError(u'ERROR: no known formats available for video') + + self._downloader.to_screen(u'[youporn] Links found: %d' % len(links)) + + formats = [] + for link in links: + + # A link looks like this: + # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0 + # A path looks like this: + # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4 + video_url = unescapeHTML( link ) + path = compat_urllib_parse_urlparse( video_url ).path + extension = os.path.splitext( path )[1][1:] + format = path.split('/')[4].split('_')[:2] + size = format[0] + bitrate = format[1] + format = "-".join( format ) + title = u'%s-%s-%s' % (video_title, size, bitrate) + + formats.append({ + 'id': video_id, + 'url': video_url, + 'uploader': video_uploader, + 'upload_date': upload_date, + 'title': title, + 'ext': extension, + 'format': format, + 'thumbnail': None, + 'description': None, + 'player_url': None + }) + + if self._downloader.params.get('listformats', None): + self._print_formats(formats) return - m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage) - if not m: - self._downloader.trouble(u'ERROR: Cannot find status ID') - status_id = m.group(1) + req_format = self._downloader.params.get('format', None) + self._downloader.to_screen(u'[youporn] Format: %s' % req_format) - m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL) - if not m: - self._downloader.trouble(u'WARNING: Cannot find description') - desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip() + if req_format is None or req_format == 'best': + return [formats[0]] + elif req_format == 'worst': + return [formats[-1]] + elif req_format in ('-1', 'all'): + return formats + else: + format = self._specific( req_format, formats ) + if result is None: + self._downloader.trouble(u'ERROR: requested format not available') + return + return [format] + + + +class PornotubeIE(InfoExtractor): + """Information extractor for pornotube.com.""" + _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + return + + video_id = mobj.group('videoid') + video_title = mobj.group('title') + + # Get webpage content + webpage = self._download_webpage(url, video_id) + + # Get the video URL + VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",' + result = re.search(VIDEO_URL_RE, webpage) + if result is None: + self._downloader.trouble(u'ERROR: unable to extract video url') + return + video_url = compat_urllib_parse.unquote(result.group('url')) + + #Get the uploaded date + VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by' + result = re.search(VIDEO_UPLOADED_RE, webpage) + if result is None: + self._downloader.trouble(u'ERROR: unable to extract video title') + return + upload_date = result.group('date') + + info = {'id': video_id, + 'url': video_url, + 'uploader': None, + 'upload_date': upload_date, + 'title': video_title, + 'ext': 'flv', + 'format': 'flv'} + + return [info] + +class YouJizzIE(InfoExtractor): + """Information extractor for youjizz.com.""" + _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$' - m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL) + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + return + + video_id = mobj.group('videoid') + + # Get webpage content + webpage = self._download_webpage(url, video_id) + + # Get the video title + result = re.search(r'<title>(?P<title>.*)', webpage) + if result is None: + raise ExtractorError(u'ERROR: unable to extract video title') + video_title = result.group('title').strip() + + # Get the embed page + result = re.search(r'https?://www.youjizz.com/videos/embed/(?P[0-9]+)', webpage) + if result is None: + raise ExtractorError(u'ERROR: unable to extract embed page') + + embed_page_url = result.group(0).strip() + video_id = result.group('videoid') + + webpage = self._download_webpage(embed_page_url, video_id) + + # Get the video URL + result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P[^"]+)"\)\);', webpage) + if result is None: + raise ExtractorError(u'ERROR: unable to extract video url') + video_url = result.group('source') + + info = {'id': video_id, + 'url': video_url, + 'title': video_title, + 'ext': 'flv', + 'format': 'flv', + 'player_url': embed_page_url} + + return [info] + +class EightTracksIE(InfoExtractor): + IE_NAME = '8tracks' + _VALID_URL = r'https?://8tracks.com/(?P[^/]+)/(?P[^/#]+)(?:#.*)?$' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError(u'Invalid URL: %s' % url) + playlist_id = mobj.group('id') + + webpage = self._download_webpage(url, playlist_id) + + m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL) if not m: - self._downloader.trouble(u'ERROR: Cannot find uploader') + raise ExtractorError(u'Cannot find trax information') + json_like = m.group(1) + data = json.loads(json_like) + + session = str(random.randint(0, 1000000000)) + mix_id = data['id'] + track_count = data['tracks_count'] + first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id) + next_url = first_url + res = [] + for i in itertools.count(): + api_json = self._download_webpage(next_url, playlist_id, + note=u'Downloading song information %s/%s' % (str(i+1), track_count), + errnote=u'Failed to download song information') + api_data = json.loads(api_json) + track_data = api_data[u'set']['track'] + info = { + 'id': track_data['id'], + 'url': track_data['track_file_stream_url'], + 'title': track_data['performer'] + u' - ' + track_data['name'], + 'raw_title': track_data['name'], + 'uploader_id': data['user']['login'], + 'ext': 'm4a', + } + res.append(info) + if api_data['set']['at_last_track']: + break + next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id']) + return res + +class KeekIE(InfoExtractor): + _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P\w+)' + IE_NAME = u'keek' + + def _real_extract(self, url): + m = re.match(self._VALID_URL, url) + video_id = m.group('videoID') + video_url = u'http://cdn.keek.com/keek/video/%s' % video_id + thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id + webpage = self._download_webpage(url, video_id) + m = re.search(r'[\s\n]+

(?P\w+)

', webpage) uploader = unescapeHTML(m.group('uploader')) - uploader_id = unescapeHTML(m.group('uploader_id')) + info = { + 'id':video_id, + 'url':video_url, + 'ext': 'mp4', + 'title': title, + 'thumbnail': thumbnail, + 'uploader': uploader + } + return [info] - m = re.search(r'playlists)/(?P\d+)) # We have a playlist + | + ((?Ptalks)) # We have a simple talk + ) + /(?P\w+) # Here goes the name and then ".html" + ''' + + @classmethod + def suitable(cls, url): + """Receives a URL and returns True if suitable for this IE.""" + return re.match(cls._VALID_URL, url, re.VERBOSE) is not None + + def _real_extract(self, url): + m=re.match(self._VALID_URL, url, re.VERBOSE) + if m.group('type_talk'): + return [self._talk_info(url)] + else : + playlist_id=m.group('playlist_id') + name=m.group('name') + self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name)) + return self._playlist_videos_info(url,name,playlist_id) + + def _talk_video_link(self,mediaSlug): + '''Returns the video link for that mediaSlug''' + return 'http://download.ted.com/talks/%s.mp4' % mediaSlug + + def _playlist_videos_info(self,url,name,playlist_id=0): + '''Returns the videos of the playlist''' + video_RE=r''' +
(?P.+?)

' + webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage') + m_videos=re.finditer(video_RE,webpage,re.VERBOSE) + m_names=re.finditer(video_name_RE,webpage) + info=[] + for m_video, m_name in zip(m_videos,m_names): + video_id=m_video.group('video_id') + talk_url='http://www.ted.com%s' % m_name.group('talk_url') + info.append(self._talk_info(talk_url,video_id)) + return info + + def _talk_info(self, url, video_id=0): + """Return the video for the talk in the url""" + m=re.match(self._VALID_URL, url,re.VERBOSE) + videoName=m.group('name') + webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName) + # If the url includes the language we get the title translated + title_RE=r'

(?P.*)</span></h1>' + title=re.search(title_RE, webpage).group('title') + info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?) + "id":(?P<videoID>[\d]+).*? + "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"''' + thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"' + thumb_match=re.search(thumb_RE,webpage) + info_match=re.search(info_RE,webpage,re.VERBOSE) + video_id=info_match.group('videoID') + mediaSlug=info_match.group('mediaSlug') + video_url=self._talk_video_link(mediaSlug) + info = { + 'id': video_id, + 'url': video_url, + 'ext': 'mp4', + 'title': title, + 'thumbnail': thumb_match.group('thumbnail') + } + return info - title = desc - video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov' +class MySpassIE(InfoExtractor): + _VALID_URL = r'http://www.myspass.de/.*' + def _real_extract(self, url): + META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s' + + # video id is the last path element of the URL + # usually there is a trailing slash, so also try the second but last + url_path = compat_urllib_parse_urlparse(url).path + url_parent_path, video_id = os.path.split(url_path) + if not video_id: + _, video_id = os.path.split(url_parent_path) + + # get metadata + metadata_url = META_DATA_URL_TEMPLATE % video_id + metadata_text = self._download_webpage(metadata_url, video_id) + metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8')) + + # extract values from metadata + url_flv_el = metadata.find('url_flv') + if url_flv_el is None: + self._downloader.trouble(u'ERROR: unable to extract download url') + return + video_url = url_flv_el.text + extension = os.path.splitext(video_url)[1][1:] + title_el = metadata.find('title') + if title_el is None: + self._downloader.trouble(u'ERROR: unable to extract title') + return + title = title_el.text + format_id_el = metadata.find('format_id') + if format_id_el is None: + format = ext + else: + format = format_id_el.text + description_el = metadata.find('description') + if description_el is not None: + description = description_el.text + else: + description = None + imagePreview_el = metadata.find('imagePreview') + if imagePreview_el is not None: + thumbnail = imagePreview_el.text + else: + thumbnail = None info = { 'id': video_id, 'url': video_url, - 'ext': 'mov', 'title': title, - 'description': desc, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'internal_id': status_id, - 'upload_date': upload_date + 'ext': extension, + 'format': format, + 'thumbnail': thumbnail, + 'description': description } return [info] + +def gen_extractors(): + """ Return a list of an instance of every supported extractor. + The order does matter; the first extractor matched is the one handling the URL. + """ + return [ + YoutubePlaylistIE(), + YoutubeChannelIE(), + YoutubeUserIE(), + YoutubeSearchIE(), + YoutubeIE(), + MetacafeIE(), + DailymotionIE(), + GoogleSearchIE(), + PhotobucketIE(), + YahooIE(), + YahooSearchIE(), + DepositFilesIE(), + FacebookIE(), + BlipTVUserIE(), + BlipTVIE(), + VimeoIE(), + MyVideoIE(), + ComedyCentralIE(), + EscapistIE(), + CollegeHumorIE(), + XVideosIE(), + SoundcloudIE(), + InfoQIE(), + MixcloudIE(), + StanfordOpenClassroomIE(), + MTVIE(), + YoukuIE(), + XNXXIE(), + YouJizzIE(), + PornotubeIE(), + YouPornIE(), + GooglePlusIE(), + ArteTvIE(), + NBAIE(), + WorldStarHipHopIE(), + JustinTVIE(), + FunnyOrDieIE(), + SteamIE(), + UstreamIE(), + RBMARadioIE(), + EightTracksIE(), + KeekIE(), + TEDIE(), + MySpassIE(), + GenericIE() + ] + +