X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;ds=inline;f=youtube_dl%2FInfoExtractors.py;h=5811ef0da634358ff41cd9b70492522796a34db4;hb=71e458d43792d6fb225b25e8a40dd5f1561c310b;hp=139173016a9f7139e9e3caae11dc563cf066f499;hpb=2f58b12dad1b5e19f2daf338cdba958be0b0a87c;p=youtube-dl diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 139173016..5811ef0da 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -16,6 +16,9 @@ import xml.etree.ElementTree import random import math import operator +import hashlib +import binascii +import urllib from .utils import * @@ -188,6 +191,45 @@ class InfoExtractor(object): video_info['title'] = playlist_title return video_info +class SearchInfoExtractor(InfoExtractor): + """ + Base class for paged search queries extractors. + They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query} + Instances should define _SEARCH_KEY and _MAX_RESULTS. + """ + + @classmethod + def _make_valid_url(cls): + return r'%s(?P|[1-9][0-9]*|all):(?P[\s\S]+)' % cls._SEARCH_KEY + + @classmethod + def suitable(cls, url): + return re.match(cls._make_valid_url(), url) is not None + + def _real_extract(self, query): + mobj = re.match(self._make_valid_url(), query) + if mobj is None: + raise ExtractorError(u'Invalid search query "%s"' % query) + + prefix = mobj.group('prefix') + query = mobj.group('query') + if prefix == '': + return self._get_n_results(query, 1) + elif prefix == 'all': + return self._get_n_results(query, self._MAX_RESULTS) + else: + n = int(prefix) + if n <= 0: + raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query)) + elif n > self._MAX_RESULTS: + self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n)) + n = self._MAX_RESULTS + return self._get_n_results(query, n) + + def _get_n_results(self, query, n): + """Get a specified number of results for a query""" + raise NotImplementedError("This method must be implemented by sublclasses") + class YoutubeIE(InfoExtractor): """Information extractor for youtube.com.""" @@ -334,6 +376,34 @@ class YoutubeIE(InfoExtractor): return (u'Did not fetch video subtitles', None, None) return (None, sub_lang, sub) + def _request_automatic_caption(self, video_id, webpage): + """We need the webpage for getting the captions url, pass it as an + argument to speed up the process.""" + sub_lang = self._downloader.params.get('subtitleslang') + sub_format = self._downloader.params.get('subtitlesformat') + self.to_screen(u'%s: Looking for automatic captions' % video_id) + mobj = re.search(r';ytplayer.config = ({.*?});', webpage) + err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang + if mobj is None: + return [(err_msg, None, None)] + player_config = json.loads(mobj.group(1)) + try: + args = player_config[u'args'] + caption_url = args[u'ttsurl'] + timestamp = args[u'timestamp'] + params = compat_urllib_parse.urlencode({ + 'lang': 'en', + 'tlang': sub_lang, + 'fmt': sub_format, + 'ts': timestamp, + 'kind': 'asr', + }) + subtitles_url = caption_url + '&' + params + sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions') + return [(None, sub_lang, sub)] + except KeyError: + return [(err_msg, None, None)] + def _extract_subtitle(self, video_id): """ Return a list with a tuple: @@ -581,7 +651,14 @@ class YoutubeIE(InfoExtractor): if video_subtitles: (sub_error, sub_lang, sub) = video_subtitles[0] if sub_error: - self._downloader.report_error(sub_error) + # We try with the automatic captions + video_subtitles = self._request_automatic_caption(video_id, video_webpage) + (sub_error_auto, sub_lang, sub) = video_subtitles[0] + if sub is not None: + pass + else: + # We report the original error + self._downloader.report_error(sub_error) if self._downloader.params.get('allsubtitles', False): video_subtitles = self._extract_all_subtitles(video_id) @@ -983,7 +1060,7 @@ class VimeoIE(InfoExtractor): """Information extractor for vimeo.com.""" # _VALID_URL matches Vimeo URLs - _VALID_URL = r'(?Phttps?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?Pplay_redirect_hls\?clip_id=)?(?:videos?/)?(?P[0-9]+)' + _VALID_URL = r'(?Phttps?://)?(?:(?:www|player)\.)?vimeo(?Ppro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?Pplay_redirect_hls\?clip_id=)?(?:videos?/)?(?P[0-9]+)' IE_NAME = u'vimeo' def _real_extract(self, url, new_video=True): @@ -995,7 +1072,7 @@ class VimeoIE(InfoExtractor): video_id = mobj.group('id') if not mobj.group('proto'): url = 'https://' + url - if mobj.group('direct_link'): + if mobj.group('direct_link') or mobj.group('pro'): url = 'https://vimeo.com/' + video_id # Retrieve video webpage to extract further information @@ -1265,6 +1342,8 @@ class GenericIE(InfoExtractor): opener.add_handler(handler()) response = opener.open(HeadRequest(url)) + if response is None: + raise ExtractorError(u'Invalid URL protocol') new_url = response.geturl() if url == new_url: @@ -1336,42 +1415,18 @@ class GenericIE(InfoExtractor): }] -class YoutubeSearchIE(InfoExtractor): +class YoutubeSearchIE(SearchInfoExtractor): """Information Extractor for YouTube search queries.""" - _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+' _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc' - _max_youtube_results = 1000 + _MAX_RESULTS = 1000 IE_NAME = u'youtube:search' + _SEARCH_KEY = 'ytsearch' def report_download_page(self, query, pagenum): """Report attempt to download search page with given number.""" query = query.decode(preferredencoding()) self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum)) - def _real_extract(self, query): - mobj = re.match(self._VALID_URL, query) - if mobj is None: - raise ExtractorError(u'Invalid search query "%s"' % query) - - prefix, query = query.split(':') - prefix = prefix[8:] - query = query.encode('utf-8') - if prefix == '': - return self._get_n_results(query, 1) - elif prefix == 'all': - self._get_n_results(query, self._max_youtube_results) - else: - try: - n = int(prefix) - if n <= 0: - raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query)) - elif n > self._max_youtube_results: - self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n)) - n = self._max_youtube_results - return self._get_n_results(query, n) - except ValueError: # parsing prefix as integer fails - return self._get_n_results(query, 1) - def _get_n_results(self, query, n): """Get a specified number of results for a query""" @@ -1401,33 +1456,15 @@ class YoutubeSearchIE(InfoExtractor): if len(video_ids) > n: video_ids = video_ids[:n] videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids] - return videos + return self.playlist_result(videos, query) -class GoogleSearchIE(InfoExtractor): +class GoogleSearchIE(SearchInfoExtractor): """Information Extractor for Google Video search queries.""" - _VALID_URL = r'gvsearch(?P|\d+|all):(?P[\s\S]+)' - _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"' - _max_google_results = 1000 + _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"' + _MAX_RESULTS = 1000 IE_NAME = u'video.google:search' - - def _real_extract(self, query): - mobj = re.match(self._VALID_URL, query) - - prefix = mobj.group('prefix') - query = mobj.group('query') - if prefix == '': - return self._get_n_results(query, 1) - elif prefix == 'all': - return self._get_n_results(query, self._max_google_results) - else: - n = int(prefix) - if n <= 0: - raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query)) - elif n > self._max_google_results: - self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n)) - n = self._max_google_results - return self._get_n_results(query, n) + _SEARCH_KEY = 'gvsearch' def _get_n_results(self, query, n): """Get a specified number of results for a query""" @@ -1439,7 +1476,7 @@ class GoogleSearchIE(InfoExtractor): } for pagenum in itertools.count(1): - result_url = u'http://video.google.com/videosearch?q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10) + result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10) webpage = self._download_webpage(result_url, u'gvsearch:' + query, note='Downloading result page ' + str(pagenum)) @@ -1453,84 +1490,39 @@ class GoogleSearchIE(InfoExtractor): if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage): return res -class YahooSearchIE(InfoExtractor): +class YahooSearchIE(SearchInfoExtractor): """Information Extractor for Yahoo! Video search queries.""" - _WORKING = False - _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+' - _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s' - _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"' - _MORE_PAGES_INDICATOR = r'\s*Next' - _max_yahoo_results = 1000 - IE_NAME = u'video.yahoo:search' - - def report_download_page(self, query, pagenum): - """Report attempt to download playlist page with given number.""" - query = query.decode(preferredencoding()) - self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum)) - - def _real_extract(self, query): - mobj = re.match(self._VALID_URL, query) - if mobj is None: - raise ExtractorError(u'Invalid search query "%s"' % query) + _MAX_RESULTS = 1000 + IE_NAME = u'screen.yahoo:search' + _SEARCH_KEY = 'yvsearch' - prefix, query = query.split(':') - prefix = prefix[8:] - query = query.encode('utf-8') - if prefix == '': - self._download_n_results(query, 1) - return - elif prefix == 'all': - self._download_n_results(query, self._max_yahoo_results) - return - else: - try: - n = int(prefix) - if n <= 0: - raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query)) - elif n > self._max_yahoo_results: - self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n)) - n = self._max_yahoo_results - self._download_n_results(query, n) - return - except ValueError: # parsing prefix as integer fails - self._download_n_results(query, 1) - return - - def _download_n_results(self, query, n): - """Downloads a specified number of results for a query""" - - video_ids = [] - already_seen = set() - pagenum = 1 - - while True: - self.report_download_page(query, pagenum) - result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum) - request = compat_urllib_request.Request(result_url) - try: - page = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to download webpage: %s' % compat_str(err)) + def _get_n_results(self, query, n): + """Get a specified number of results for a query""" - # Extract video identifiers - for mobj in re.finditer(self._VIDEO_INDICATOR, page): - video_id = mobj.group(1) - if video_id not in already_seen: - video_ids.append(video_id) - already_seen.add(video_id) - if len(video_ids) == n: - # Specified n videos reached - for id in video_ids: - self._downloader.download(['http://video.yahoo.com/watch/%s' % id]) - return - - if re.search(self._MORE_PAGES_INDICATOR, page) is None: - for id in video_ids: - self._downloader.download(['http://video.yahoo.com/watch/%s' % id]) - return + res = { + '_type': 'playlist', + 'id': query, + 'entries': [] + } + for pagenum in itertools.count(0): + result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30) + webpage = self._download_webpage(result_url, query, + note='Downloading results page '+str(pagenum+1)) + info = json.loads(webpage) + m = info[u'm'] + results = info[u'results'] + + for (i, r) in enumerate(results): + if (pagenum * 30) +i >= n: + break + mobj = re.search(r'(?Pscreen\.yahoo\.com/.*?-\d*?\.html)"', r) + e = self.url_result('http://' + mobj.group('url'), 'Yahoo') + res['entries'].append(e) + if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )): + break - pagenum = pagenum + 1 + return res class YoutubePlaylistIE(InfoExtractor): @@ -1927,7 +1919,7 @@ class FacebookIE(InfoExtractor): class BlipTVIE(InfoExtractor): """Information extractor for blip.tv""" - _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$' + _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$' _URL_EXT = r'^.*\.([a-z0-9]+)$' IE_NAME = u'blip.tv' @@ -1940,6 +1932,10 @@ class BlipTVIE(InfoExtractor): if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) + # See https://github.com/rg3/youtube-dl/issues/857 + api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P[\d\w]+)', url) + if api_mobj is not None: + url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id') urlp = compat_urllib_parse_urlparse(url) if urlp.path.startswith('/play/'): request = compat_urllib_request.Request(url) @@ -2025,37 +2021,158 @@ class MyVideoIE(InfoExtractor): _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*' IE_NAME = u'myvideo' + # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git + # Released into the Public Domain by Tristan Fischer on 2013-05-19 + # https://github.com/rg3/youtube-dl/pull/842 + def __rc4crypt(self,data, key): + x = 0 + box = list(range(256)) + for i in list(range(256)): + x = (x + box[i] + compat_ord(key[i % len(key)])) % 256 + box[i], box[x] = box[x], box[i] + x = 0 + y = 0 + out = '' + for char in data: + x = (x + 1) % 256 + y = (y + box[x]) % 256 + box[x], box[y] = box[y], box[x] + out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256]) + return out + + def __md5(self,s): + return hashlib.md5(s).hexdigest().encode() + def _real_extract(self,url): mobj = re.match(self._VALID_URL, url) if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) + raise ExtractorError(u'invalid URL: %s' % url) video_id = mobj.group(1) + GK = ( + b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt' + b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3' + b'TnpsbA0KTVRkbU1tSTRNdz09' + ) + # Get video webpage webpage_url = 'http://www.myvideo.de/watch/%s' % video_id webpage = self._download_webpage(webpage_url, video_id) + mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage) + if mobj is not None: + self.report_extraction(video_id) + video_url = mobj.group(1) + '.flv' + + mobj = re.search('([^<]+)', webpage) + if mobj is None: + raise ExtractorError(u'Unable to extract title') + video_title = mobj.group(1) + + mobj = re.search('[.](.+?)$', video_url) + if mobj is None: + raise ExtractorError(u'Unable to extract extention') + video_ext = mobj.group(1) + + return [{ + 'id': video_id, + 'url': video_url, + 'uploader': None, + 'upload_date': None, + 'title': video_title, + 'ext': u'flv', + }] + + # try encxml + mobj = re.search('var flashvars={(.+?)}', webpage) + if mobj is None: + raise ExtractorError(u'Unable to extract video') + + params = {} + encxml = '' + sec = mobj.group(1) + for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec): + if not a == '_encxml': + params[a] = b + else: + encxml = compat_urllib_parse.unquote(b) + if not params.get('domain'): + params['domain'] = 'www.myvideo.de' + xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params)) + if 'flash_playertype=MTV' in xmldata_url: + self._downloader.report_warning(u'avoiding MTV player') + xmldata_url = ( + 'http://www.myvideo.de/dynamic/get_player_video_xml.php' + '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes' + ) % video_id + + # get enc data + enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1] + enc_data_b = binascii.unhexlify(enc_data) + sk = self.__md5( + base64.b64decode(base64.b64decode(GK)) + + self.__md5( + str(video_id).encode('utf-8') + ) + ) + dec_data = self.__rc4crypt(enc_data_b, sk) + + # extracting infos self.report_extraction(video_id) - mobj = re.search(r'([^<]+)', webpage) + mobj = re.search('source=\'(.*?)\'', dec_data) if mobj is None: - raise ExtractorError(u'Unable to extract title') + raise ExtractorError(u'unable to extract swfobj') + video_file = compat_urllib_parse.unquote(mobj.group(1)) + + if not video_file.endswith('f4m'): + ppath, prefix = video_file.split('.') + video_playpath = '%s:%s' % (prefix, ppath) + video_hls_playlist = '' + else: + video_playpath = '' + video_hls_playlist = ( + video_filepath + video_file + ).replace('.f4m', '.m3u8') + mobj = re.search('swfobject.embedSWF\(\'(.+?)\'', webpage) + if mobj is None: + raise ExtractorError(u'unable to extract swfobj') + video_swfobj = compat_urllib_parse.unquote(mobj.group(1)) + + mobj = re.search("(.*?)", webpage) + if mobj is None: + raise ExtractorError(u'unable to extract title') video_title = mobj.group(1) return [{ - 'id': video_id, - 'url': video_url, - 'uploader': None, - 'upload_date': None, - 'title': video_title, - 'ext': u'flv', + 'id': video_id, + 'url': video_rtmpurl, + 'tc_url': video_rtmpurl, + 'uploader': None, + 'upload_date': None, + 'title': video_title, + 'ext': u'flv', + 'play_path': video_playpath, + 'video_file': video_file, + 'video_hls_playlist': video_hls_playlist, + 'player_url': video_swfobj, }] class ComedyCentralIE(InfoExtractor): @@ -3347,18 +3464,26 @@ class UstreamIE(InfoExtractor): video_id = m.group('videoID') video_url = u'http://tcdn.ustream.tv/video/%s' % video_id webpage = self._download_webpage(url, video_id) - m = re.search(r'data-title="(?P.+)"',webpage) - title = m.group('title') - m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage) - uploader = m.group('uploader') + self.report_extraction(video_id) + try: + m = re.search(r'data-title="(?P<title>.+)"',webpage) + title = m.group('title') + m = re.search(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>', + webpage, re.DOTALL) + uploader = unescapeHTML(m.group('uploader').strip()) + m = re.search(r'<link rel="image_src" href="(?P<thumb>.*?)"', webpage) + thumb = m.group('thumb') + except AttributeError: + raise ExtractorError(u'Unable to extract info') info = { 'id':video_id, 'url':video_url, 'ext': 'flv', 'title': title, - 'uploader': uploader + 'uploader': uploader, + 'thumbnail': thumb, } - return [info] + return info class WorldStarHipHopIE(InfoExtractor): _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)' @@ -3980,7 +4105,7 @@ class TumblrIE(InfoExtractor): re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id) video = re.search(re_video, webpage) if video is None: - self.to_screen("No video founded") + self.to_screen("No video found") return [] video_url = video.group('video_url') ext = video.group('ext') @@ -4079,7 +4204,7 @@ class RedTubeIE(InfoExtractor): class InaIE(InfoExtractor): """Information Extractor for Ina.fr""" - _VALID_URL = r'(?:http://)?(?:www.)?ina\.fr/video/(?P<id>I[0-9]+)/.*' + _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*' def _real_extract(self,url): mobj = re.match(self._VALID_URL, url) @@ -4106,6 +4231,258 @@ class InaIE(InfoExtractor): 'title': video_title, }] +class HowcastIE(InfoExtractor): + """Information Extractor for Howcast.com""" + _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + video_id = mobj.group('id') + webpage_url = 'http://www.howcast.com/videos/' + video_id + webpage = self._download_webpage(webpage_url, video_id) + + self.report_extraction(video_id) + + mobj = re.search(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)"', webpage) + if mobj is None: + raise ExtractorError(u'Unable to extract video URL') + video_url = mobj.group(1) + + mobj = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'', webpage) + if mobj is None: + raise ExtractorError(u'Unable to extract title') + video_title = mobj.group(1) or mobj.group(2) + + mobj = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'', webpage) + if mobj is None: + self._downloader.report_warning(u'unable to extract description') + video_description = None + else: + video_description = mobj.group(1) or mobj.group(2) + + mobj = re.search(r'<meta content=\'(.+?)\' property=\'og:image\'', webpage) + if mobj is None: + raise ExtractorError(u'Unable to extract thumbnail') + thumbnail = mobj.group(1) + + return [{ + 'id': video_id, + 'url': video_url, + 'ext': 'mp4', + 'title': video_title, + 'description': video_description, + 'thumbnail': thumbnail, + }] + +class VineIE(InfoExtractor): + """Information Extractor for Vine.co""" + _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)' + + def _real_extract(self, url): + + mobj = re.match(self._VALID_URL, url) + + video_id = mobj.group('id') + webpage_url = 'https://vine.co/v/' + video_id + webpage = self._download_webpage(webpage_url, video_id) + + self.report_extraction(video_id) + + mobj = re.search(r'<meta property="twitter:player:stream" content="(.+?)"', webpage) + if mobj is None: + raise ExtractorError(u'Unable to extract video URL') + video_url = mobj.group(1) + + mobj = re.search(r'<meta property="og:title" content="(.+?)"', webpage) + if mobj is None: + raise ExtractorError(u'Unable to extract title') + video_title = mobj.group(1) + + mobj = re.search(r'<meta property="og:image" content="(.+?)(\?.*?)?"', webpage) + if mobj is None: + raise ExtractorError(u'Unable to extract thumbnail') + thumbnail = mobj.group(1) + + mobj = re.search(r'<div class="user">.*?<h2>(.+?)</h2>', webpage, re.DOTALL) + if mobj is None: + raise ExtractorError(u'Unable to extract uploader') + uploader = mobj.group(1) + + return [{ + 'id': video_id, + 'url': video_url, + 'ext': 'mp4', + 'title': video_title, + 'thumbnail': thumbnail, + 'uploader': uploader, + }] + +class FlickrIE(InfoExtractor): + """Information Extractor for Flickr videos""" + _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + video_id = mobj.group('id') + video_uploader_id = mobj.group('uploader_id') + webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id + webpage = self._download_webpage(webpage_url, video_id) + + mobj = re.search(r"photo_secret: '(\w+)'", webpage) + if mobj is None: + raise ExtractorError(u'Unable to extract video secret') + secret = mobj.group(1) + + first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self' + first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage') + + mobj = re.search(r'<Item id="id">(\d+-\d+)</Item>', first_xml) + if mobj is None: + raise ExtractorError(u'Unable to extract node_id') + node_id = mobj.group(1) + + second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1' + second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage') + + self.report_extraction(video_id) + + mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml) + if mobj is None: + raise ExtractorError(u'Unable to extract video url') + video_url = mobj.group(1) + unescapeHTML(mobj.group(2)) + + mobj = re.search(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')', webpage) + if mobj is None: + raise ExtractorError(u'Unable to extract title') + video_title = mobj.group(1) or mobj.group(2) + + mobj = re.search(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')', webpage) + if mobj is None: + self._downloader.report_warning(u'unable to extract description') + video_description = None + else: + video_description = mobj.group(1) or mobj.group(2) + + mobj = re.search(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')', webpage) + if mobj is None: + raise ExtractorError(u'Unable to extract thumbnail') + thumbnail = mobj.group(1) or mobj.group(2) + + return [{ + 'id': video_id, + 'url': video_url, + 'ext': 'mp4', + 'title': video_title, + 'description': video_description, + 'thumbnail': thumbnail, + 'uploader_id': video_uploader_id, + }] + +class TeamcocoIE(InfoExtractor): + _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError(u'Invalid URL: %s' % url) + url_title = mobj.group('url_title') + webpage = self._download_webpage(url, url_title) + + mobj = re.search(r'<article class="video" data-id="(\d+?)"', webpage) + video_id = mobj.group(1) + + self.report_extraction(video_id) + + mobj = re.search(r'<meta property="og:title" content="(.+?)"', webpage) + if mobj is None: + raise ExtractorError(u'Unable to extract title') + video_title = mobj.group(1) + + mobj = re.search(r'<meta property="og:image" content="(.+?)"', webpage) + if mobj is None: + raise ExtractorError(u'Unable to extract thumbnail') + thumbnail = mobj.group(1) + + mobj = re.search(r'<meta property="og:description" content="(.*?)"', webpage) + if mobj is None: + raise ExtractorError(u'Unable to extract description') + description = mobj.group(1) + + data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id + data = self._download_webpage(data_url, video_id, 'Downloading data webpage') + mobj = re.search(r'<file type="high".*?>(.*?)</file>', data) + if mobj is None: + raise ExtractorError(u'Unable to extract video url') + video_url = mobj.group(1) + + return [{ + 'id': video_id, + 'url': video_url, + 'ext': 'mp4', + 'title': video_title, + 'thumbnail': thumbnail, + 'description': description, + }] + +class XHamsterIE(InfoExtractor): + """Information Extractor for xHamster""" + _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html' + + def _real_extract(self,url): + mobj = re.match(self._VALID_URL, url) + + video_id = mobj.group('id') + mrss_url='http://xhamster.com/movies/%s/.html' % video_id + webpage = self._download_webpage(mrss_url, video_id) + mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage) + if mobj is None: + raise ExtractorError(u'Unable to extract media URL') + if len(mobj.group('server')) == 0: + video_url = compat_urllib_parse.unquote(mobj.group('file')) + else: + video_url = mobj.group('server')+'/key='+mobj.group('file') + video_extension = video_url.split('.')[-1] + + mobj = re.search(r'<title>(?P<title>.+?) - xHamster\.com', webpage) + if mobj is None: + raise ExtractorError(u'Unable to extract title') + video_title = unescapeHTML(mobj.group('title')) + + mobj = re.search(r'Description: (?P[^<]+)', webpage) + if mobj is None: + video_description = u'' + else: + video_description = unescapeHTML(mobj.group('description')) + + mobj = re.search(r'hint=\'(?P[0-9]{4})-(?P[0-9]{2})-(?P[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage) + if mobj is None: + raise ExtractorError(u'Unable to extract upload date') + video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d') + + mobj = re.search(r']+>(?P[^>]+)', webpage) + if mobj is None: + video_uploader_id = u'anonymous' + else: + video_uploader_id = mobj.group('uploader_id') + + mobj = re.search(r'\'image\':\'(?P[^\']+)\'', webpage) + if mobj is None: + raise ExtractorError(u'Unable to extract thumbnail URL') + video_thumbnail = mobj.group('thumbnail') + + return [{ + 'id': video_id, + 'url': video_url, + 'ext': video_extension, + 'title': video_title, + 'description': video_description, + 'upload_date': video_upload_date, + 'uploader_id': video_uploader_id, + 'thumbnail': video_thumbnail + }] + def gen_extractors(): """ Return a list of an instance of every supported extractor. The order does matter; the first extractor matched is the one handling the URL. @@ -4124,8 +4501,8 @@ def gen_extractors(): YahooSearchIE(), DepositFilesIE(), FacebookIE(), - BlipTVUserIE(), BlipTVIE(), + BlipTVUserIE(), VimeoIE(), MyVideoIE(), ComedyCentralIE(), @@ -4163,6 +4540,11 @@ def gen_extractors(): BandcampIE(), RedTubeIE(), InaIE(), + HowcastIE(), + VineIE(), + FlickrIE(), + TeamcocoIE(), + XHamsterIE(), GenericIE() ]