X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2FInfoExtractors.py;h=c06ecbe52aad148ce036cd98c74ef1f371b27c78;hb=9f5daf0006091124799e13c834db17201ebdbbc5;hp=d950880add02538c8904b3f60dae5e99d5f6def6;hpb=c5e8d7af0ed867d70502491e3a80ee09b78ed2ce;p=youtube-dl diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index d950880ad..c06ecbe52 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -1,3254 +1,108 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -from __future__ import absolute_import - -import base64 -import datetime -import itertools -import netrc -import os -import re -import socket -import time -import email.utils -import xml.etree.ElementTree -import random -import math -import operator -import hashlib -import binascii -import urllib - -from .utils import * - - -from .extractor.common import InfoExtractor, SearchInfoExtractor -from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeUserIE, YoutubeChannelIE - - - -class MetacafeIE(InfoExtractor): - """Information Extractor for metacafe.com.""" - - _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*' - _DISCLAIMER = 'http://www.metacafe.com/family_filter/' - _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user' - IE_NAME = u'metacafe' - - def report_disclaimer(self): - """Report disclaimer retrieval.""" - self.to_screen(u'Retrieving disclaimer') - - def _real_initialize(self): - # Retrieve disclaimer - request = compat_urllib_request.Request(self._DISCLAIMER) - try: - self.report_disclaimer() - disclaimer = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err)) - - # Confirm age - disclaimer_form = { - 'filters': '0', - 'submit': "Continue - I'm over 18", - } - request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form)) - try: - self.report_age_confirmation() - disclaimer = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err)) - - def _real_extract(self, url): - # Extract id and simplified title from URL - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - - video_id = mobj.group(1) - - # Check if video comes from YouTube - mobj2 = re.match(r'^yt-(.*)$', video_id) - if mobj2 is not None: - return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')] - - # Retrieve video webpage to extract further information - webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id) - - # Extract URL, uploader and title from webpage - self.report_extraction(video_id) - mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage) - if mobj is not None: - mediaURL = compat_urllib_parse.unquote(mobj.group(1)) - video_extension = mediaURL[-3:] - - # Extract gdaKey if available - mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage) - if mobj is None: - video_url = mediaURL - else: - gdaKey = mobj.group(1) - video_url = '%s?__gda__=%s' % (mediaURL, gdaKey) - else: - mobj = re.search(r' name="flashvars" value="(.*?)"', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract media URL') - vardict = compat_parse_qs(mobj.group(1)) - if 'mediaData' not in vardict: - raise ExtractorError(u'Unable to extract media URL') - mobj = re.search(r'"mediaURL":"(?Phttp.*?)",(.*?)"key":"(?P.*?)"', vardict['mediaData'][0]) - if mobj is None: - raise ExtractorError(u'Unable to extract media URL') - mediaURL = mobj.group('mediaURL').replace('\\/', '/') - video_extension = mediaURL[-3:] - video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key')) - - mobj = re.search(r'(?im)(.*) - Video', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = mobj.group(1).decode('utf-8') - - mobj = re.search(r'submitter=(.*?);', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract uploader nickname') - video_uploader = mobj.group(1) - - return [{ - 'id': video_id.decode('utf-8'), - 'url': video_url.decode('utf-8'), - 'uploader': video_uploader.decode('utf-8'), - 'upload_date': None, - 'title': video_title, - 'ext': video_extension.decode('utf-8'), - }] - -class DailymotionIE(InfoExtractor): - """Information Extractor for Dailymotion""" - - _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)' - IE_NAME = u'dailymotion' - - def _real_extract(self, url): - # Extract id and simplified title from URL - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - - video_id = mobj.group(1).split('_')[0].split('?')[0] - - video_extension = 'mp4' - - # Retrieve video webpage to extract further information - request = compat_urllib_request.Request(url) - request.add_header('Cookie', 'family_filter=off') - webpage = self._download_webpage(request, video_id) - - # Extract URL, uploader and title from webpage - self.report_extraction(video_id) - mobj = re.search(r'\s*var flashvars = (.*)', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract media URL') - flashvars = compat_urllib_parse.unquote(mobj.group(1)) - - for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']: - if key in flashvars: - max_quality = key - self.to_screen(u'Using %s' % key) - break - else: - raise ExtractorError(u'Unable to extract video URL') - - mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars) - if mobj is None: - raise ExtractorError(u'Unable to extract video URL') - - video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/') - - # TODO: support choosing qualities - - mobj = re.search(r'', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = unescapeHTML(mobj.group('title')) - - video_uploader = None - video_uploader = self._search_regex([r'(?im)[^<]+?]+?>([^<]+?)', - # Looking for official user - r'<(?:span|a) .*?rel="author".*?>([^<]+?)([0-9]{2})-([0-9]{2})-([0-9]{4})', webpage) - if mobj is not None: - video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1) - - return [{ - 'id': video_id, - 'url': video_url, - 'uploader': video_uploader, - 'upload_date': video_upload_date, - 'title': video_title, - 'ext': video_extension, - }] - - -class PhotobucketIE(InfoExtractor): - """Information extractor for photobucket.com.""" - - # TODO: the original _VALID_URL was: - # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)' - # Check if it's necessary to keep the old extracion process - _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P.*)\.(?P(flv)|(mp4))' - IE_NAME = u'photobucket' - - def _real_extract(self, url): - # Extract id from URL - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - - video_id = mobj.group('id') - - video_extension = mobj.group('ext') - - # Retrieve video webpage to extract further information - webpage = self._download_webpage(url, video_id) - - # Extract URL, uploader, and title from webpage - self.report_extraction(video_id) - # We try first by looking the javascript code: - mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P.*?)\);', webpage) - if mobj is not None: - info = json.loads(mobj.group('json')) - return [{ - 'id': video_id, - 'url': info[u'downloadUrl'], - 'uploader': info[u'username'], - 'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'), - 'title': info[u'title'], - 'ext': video_extension, - 'thumbnail': info[u'thumbUrl'], - }] - - # We try looking in other parts of the webpage - video_url = self._search_regex(r'', - webpage, u'video URL') - - mobj = re.search(r'(.*) video by (.*) - Photobucket', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = mobj.group(1).decode('utf-8') - video_uploader = mobj.group(2).decode('utf-8') - - return [{ - 'id': video_id.decode('utf-8'), - 'url': video_url.decode('utf-8'), - 'uploader': video_uploader, - 'upload_date': None, - 'title': video_title, - 'ext': video_extension.decode('utf-8'), - }] - - -class YahooIE(InfoExtractor): - """Information extractor for screen.yahoo.com.""" - _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P\d*?)\.html' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) - m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P.+?)";', webpage) - - if m_id is None: - # TODO: Check which url parameters are required - info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id - webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage') - info_re = r'''<!\[CDATA\[(?P<title>.*?)\]\]>.* - .*?)\]\]>.* - .*?)\ .*\]\]>.* - https?://)?(?:(?:www|player)\.)?vimeo(?Ppro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?Pplay_redirect_hls\?clip_id=)?(?:videos?/)?(?P[0-9]+)' - IE_NAME = u'vimeo' - - def _verify_video_password(self, url, video_id, webpage): - password = self._downloader.params.get('password', None) - if password is None: - raise ExtractorError(u'This video is protected by a password, use the --password option') - token = re.search(r'xsrft: \'(.*?)\'', webpage).group(1) - data = compat_urllib_parse.urlencode({'password': password, - 'token': token}) - # I didn't manage to use the password with https - if url.startswith('https'): - pass_url = url.replace('https','http') - else: - pass_url = url - password_request = compat_urllib_request.Request(pass_url+'/password', data) - password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') - password_request.add_header('Cookie', 'xsrft=%s' % token) - pass_web = self._download_webpage(password_request, video_id, - u'Verifying the password', - u'Wrong password') - - def _real_extract(self, url, new_video=True): - # Extract ID from URL - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - - video_id = mobj.group('id') - if not mobj.group('proto'): - url = 'https://' + url - if mobj.group('direct_link') or mobj.group('pro'): - url = 'https://vimeo.com/' + video_id - - # Retrieve video webpage to extract further information - request = compat_urllib_request.Request(url, None, std_headers) - webpage = self._download_webpage(request, video_id) - - # Now we begin extracting as much information as we can from what we - # retrieved. First we extract the information common to all extractors, - # and latter we extract those that are Vimeo specific. - self.report_extraction(video_id) - - # Extract the config JSON - try: - config = webpage.split(' = {config:')[1].split(',assets:')[0] - config = json.loads(config) - except: - if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage): - raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option') - - if re.search('If so please provide the correct password.', webpage): - self._verify_video_password(url, video_id, webpage) - return self._real_extract(url) - else: - raise ExtractorError(u'Unable to extract info section') - - # Extract title - video_title = config["video"]["title"] - - # Extract uploader and uploader_id - video_uploader = config["video"]["owner"]["name"] - video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None - - # Extract video thumbnail - video_thumbnail = config["video"]["thumbnail"] - - # Extract video description - video_description = get_element_by_attribute("itemprop", "description", webpage) - if video_description: video_description = clean_html(video_description) - else: video_description = u'' - - # Extract upload date - video_upload_date = None - mobj = re.search(r' 0: - video_quality = files[quality][0][2] - video_codec = files[quality][0][0] - video_extension = files[quality][0][1] - self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality)) - break - else: - raise ExtractorError(u'No known codec found') - - video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \ - %(video_id, sig, timestamp, video_quality, video_codec.upper()) - - return [{ - 'id': video_id, - 'url': video_url, - 'uploader': video_uploader, - 'uploader_id': video_uploader_id, - 'upload_date': video_upload_date, - 'title': video_title, - 'ext': video_extension, - 'thumbnail': video_thumbnail, - 'description': video_description, - }] - - -class ArteTvIE(InfoExtractor): - """arte.tv information extractor.""" - - _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*' - _LIVE_URL = r'index-[0-9]+\.html$' - - IE_NAME = u'arte.tv' - - def fetch_webpage(self, url): - request = compat_urllib_request.Request(url) - try: - self.report_download_webpage(url) - webpage = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err)) - except ValueError as err: - raise ExtractorError(u'Invalid URL: %s' % url) - return webpage - - def grep_webpage(self, url, regex, regexFlags, matchTuples): - page = self.fetch_webpage(url) - mobj = re.search(regex, page, regexFlags) - info = {} - - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - - for (i, key, err) in matchTuples: - if mobj.group(i) is None: - raise ExtractorError(err) - else: - info[key] = mobj.group(i) - - return info - - def extractLiveStream(self, url): - video_lang = url.split('/')[-4] - info = self.grep_webpage( - url, - r'src="(.*?/videothek_js.*?\.js)', - 0, - [ - (1, 'url', u'Invalid URL: %s' % url) - ] - ) - http_host = url.split('/')[2] - next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url'))) - info = self.grep_webpage( - next_url, - r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' + - '(http://.*?\.swf).*?' + - '(rtmp://.*?)\'', - re.DOTALL, - [ - (1, 'path', u'could not extract video path: %s' % url), - (2, 'player', u'could not extract video player: %s' % url), - (3, 'url', u'could not extract video url: %s' % url) - ] - ) - video_url = u'%s/%s' % (info.get('url'), info.get('path')) - - def extractPlus7Stream(self, url): - video_lang = url.split('/')[-3] - info = self.grep_webpage( - url, - r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)', - 0, - [ - (1, 'url', u'Invalid URL: %s' % url) - ] - ) - next_url = compat_urllib_parse.unquote(info.get('url')) - info = self.grep_webpage( - next_url, - r'""" - mobj = re.search(_title, webpage_src) - if mobj is not None: - video_title = mobj.group(1) - - results = [{ - 'id': video_id, - 'url' : video_url, - 'title' : video_title, - 'thumbnail' : thumbnail, - 'ext' : ext, - }] - return results - -class RBMARadioIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P[^/]+)$' - - def _real_extract(self, url): - m = re.match(self._VALID_URL, url) - video_id = m.group('videoID') - - webpage = self._download_webpage(url, video_id) - - json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$', - webpage, u'json data', flags=re.MULTILINE) - - try: - data = json.loads(json_data) - except ValueError as e: - raise ExtractorError(u'Invalid JSON: ' + str(e)) - - video_url = data['akamai_url'] + '&cbr=256' - url_parts = compat_urllib_parse_urlparse(video_url) - video_ext = url_parts.path.rpartition('.')[2] - info = { - 'id': video_id, - 'url': video_url, - 'ext': video_ext, - 'title': data['title'], - 'description': data.get('teaser_text'), - 'location': data.get('country_of_origin'), - 'uploader': data.get('host', {}).get('name'), - 'uploader_id': data.get('host', {}).get('slug'), - 'thumbnail': data.get('image', {}).get('large_url_2x'), - 'duration': data.get('duration'), - } - return [info] - - -class YouPornIE(InfoExtractor): - """Information extractor for youporn.com.""" - _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P[0-9]+)/(?P[^/]+)' - - def _print_formats(self, formats): - """Print all available formats""" - print(u'Available formats:') - print(u'ext\t\tformat') - print(u'---------------------------------') - for format in formats: - print(u'%s\t\t%s' % (format['ext'], format['format'])) - - def _specific(self, req_format, formats): - for x in formats: - if(x["format"]==req_format): - return x - return None - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - video_id = mobj.group('videoid') - - req = compat_urllib_request.Request(url) - req.add_header('Cookie', 'age_verified=1') - webpage = self._download_webpage(req, video_id) - - # Get JSON parameters - json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters') - try: - params = json.loads(json_params) - except: - raise ExtractorError(u'Invalid JSON') - - self.report_extraction(video_id) - try: - video_title = params['title'] - upload_date = unified_strdate(params['release_date_f']) - video_description = params['description'] - video_uploader = params['submitted_by'] - thumbnail = params['thumbnails'][0]['image'] - except KeyError: - raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1]) - - # Get all of the formats available - DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>' - download_list_html = self._search_regex(DOWNLOAD_LIST_RE, - webpage, u'download list').strip() - - # Get all of the links from the page - LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">' - links = re.findall(LINK_RE, download_list_html) - if(len(links) == 0): - raise ExtractorError(u'ERROR: no known formats available for video') - - self.to_screen(u'Links found: %d' % len(links)) - - formats = [] - for link in links: - - # A link looks like this: - # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0 - # A path looks like this: - # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4 - video_url = unescapeHTML( link ) - path = compat_urllib_parse_urlparse( video_url ).path - extension = os.path.splitext( path )[1][1:] - format = path.split('/')[4].split('_')[:2] - size = format[0] - bitrate = format[1] - format = "-".join( format ) - # title = u'%s-%s-%s' % (video_title, size, bitrate) - - formats.append({ - 'id': video_id, - 'url': video_url, - 'uploader': video_uploader, - 'upload_date': upload_date, - 'title': video_title, - 'ext': extension, - 'format': format, - 'thumbnail': thumbnail, - 'description': video_description - }) - - if self._downloader.params.get('listformats', None): - self._print_formats(formats) - return - - req_format = self._downloader.params.get('format', None) - self.to_screen(u'Format: %s' % req_format) - - if req_format is None or req_format == 'best': - return [formats[0]] - elif req_format == 'worst': - return [formats[-1]] - elif req_format in ('-1', 'all'): - return formats - else: - format = self._specific( req_format, formats ) - if result is None: - raise ExtractorError(u'Requested format not available') - return [format] - - - -class PornotubeIE(InfoExtractor): - """Information extractor for pornotube.com.""" - _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - - video_id = mobj.group('videoid') - video_title = mobj.group('title') - - # Get webpage content - webpage = self._download_webpage(url, video_id) - - # Get the video URL - VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",' - video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url') - video_url = compat_urllib_parse.unquote(video_url) - - #Get the uploaded date - VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by' - upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False) - if upload_date: upload_date = unified_strdate(upload_date) - - info = {'id': video_id, - 'url': video_url, - 'uploader': None, - 'upload_date': upload_date, - 'title': video_title, - 'ext': 'flv', - 'format': 'flv'} - - return [info] - -class YouJizzIE(InfoExtractor): - """Information extractor for youjizz.com.""" - _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - - video_id = mobj.group('videoid') - - # Get webpage content - webpage = self._download_webpage(url, video_id) - - # Get the video title - video_title = self._html_search_regex(r'<title>(?P<title>.*)', - webpage, u'title').strip() - - # Get the embed page - result = re.search(r'https?://www.youjizz.com/videos/embed/(?P[0-9]+)', webpage) - if result is None: - raise ExtractorError(u'ERROR: unable to extract embed page') - - embed_page_url = result.group(0).strip() - video_id = result.group('videoid') - - webpage = self._download_webpage(embed_page_url, video_id) - - # Get the video URL - video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P[^"]+)"\)\);', - webpage, u'video URL') - - info = {'id': video_id, - 'url': video_url, - 'title': video_title, - 'ext': 'flv', - 'format': 'flv', - 'player_url': embed_page_url} - - return [info] - -class EightTracksIE(InfoExtractor): - IE_NAME = '8tracks' - _VALID_URL = r'https?://8tracks.com/(?P[^/]+)/(?P[^/#]+)(?:#.*)?$' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - playlist_id = mobj.group('id') - - webpage = self._download_webpage(url, playlist_id) - - json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL) - data = json.loads(json_like) - - session = str(random.randint(0, 1000000000)) - mix_id = data['id'] - track_count = data['tracks_count'] - first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id) - next_url = first_url - res = [] - for i in itertools.count(): - api_json = self._download_webpage(next_url, playlist_id, - note=u'Downloading song information %s/%s' % (str(i+1), track_count), - errnote=u'Failed to download song information') - api_data = json.loads(api_json) - track_data = api_data[u'set']['track'] - info = { - 'id': track_data['id'], - 'url': track_data['track_file_stream_url'], - 'title': track_data['performer'] + u' - ' + track_data['name'], - 'raw_title': track_data['name'], - 'uploader_id': data['user']['login'], - 'ext': 'm4a', - } - res.append(info) - if api_data['set']['at_last_track']: - break - next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id']) - return res - -class KeekIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P\w+)' - IE_NAME = u'keek' - - def _real_extract(self, url): - m = re.match(self._VALID_URL, url) - video_id = m.group('videoID') - - video_url = u'http://cdn.keek.com/keek/video/%s' % video_id - thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id - webpage = self._download_webpage(url, video_id) - - video_title = self._html_search_regex(r'[\S\s]+?

(?P.+?)

', - webpage, u'uploader', fatal=False) - - info = { - 'id': video_id, - 'url': video_url, - 'ext': 'mp4', - 'title': video_title, - 'thumbnail': thumbnail, - 'uploader': uploader - } - return [info] - -class TEDIE(InfoExtractor): - _VALID_URL=r'''http://www\.ted\.com/ - ( - ((?Pplaylists)/(?P\d+)) # We have a playlist - | - ((?Ptalks)) # We have a simple talk - ) - (/lang/(.*?))? # The url may contain the language - /(?P\w+) # Here goes the name and then ".html" - ''' - - @classmethod - def suitable(cls, url): - """Receives a URL and returns True if suitable for this IE.""" - return re.match(cls._VALID_URL, url, re.VERBOSE) is not None - - def _real_extract(self, url): - m=re.match(self._VALID_URL, url, re.VERBOSE) - if m.group('type_talk'): - return [self._talk_info(url)] - else : - playlist_id=m.group('playlist_id') - name=m.group('name') - self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name)) - return [self._playlist_videos_info(url,name,playlist_id)] - - def _playlist_videos_info(self,url,name,playlist_id=0): - '''Returns the videos of the playlist''' - video_RE=r''' -
(?P.+?)

' - webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage') - m_videos=re.finditer(video_RE,webpage,re.VERBOSE) - m_names=re.finditer(video_name_RE,webpage) - - playlist_title = self._html_search_regex(r'div class="headline">\s*?

\s*?(.*?)', - webpage, 'playlist title') - - playlist_entries = [] - for m_video, m_name in zip(m_videos,m_names): - video_id=m_video.group('video_id') - talk_url='http://www.ted.com%s' % m_name.group('talk_url') - playlist_entries.append(self.url_result(talk_url, 'TED')) - return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title) - - def _talk_info(self, url, video_id=0): - """Return the video for the talk in the url""" - m = re.match(self._VALID_URL, url,re.VERBOSE) - video_name = m.group('name') - webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name) - self.report_extraction(video_name) - # If the url includes the language we get the title translated - title = self._html_search_regex(r'(?P.*)</span>', - webpage, 'title') - json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>', - webpage, 'json data') - info = json.loads(json_data) - desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>', - webpage, 'description', flags = re.DOTALL) - - thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"', - webpage, 'thumbnail') - info = { - 'id': info['id'], - 'url': info['htmlStreams'][-1]['file'], - 'ext': 'mp4', - 'title': title, - 'thumbnail': thumbnail, - 'description': desc, - } - return info - -class MySpassIE(InfoExtractor): - _VALID_URL = r'http://www.myspass.de/.*' +import base64 +import datetime +import itertools +import netrc +import os +import re +import socket +import time +import email.utils +import xml.etree.ElementTree +import random +import math +import operator +import hashlib +import binascii +import urllib - def _real_extract(self, url): - META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s' - - # video id is the last path element of the URL - # usually there is a trailing slash, so also try the second but last - url_path = compat_urllib_parse_urlparse(url).path - url_parent_path, video_id = os.path.split(url_path) - if not video_id: - _, video_id = os.path.split(url_parent_path) - - # get metadata - metadata_url = META_DATA_URL_TEMPLATE % video_id - metadata_text = self._download_webpage(metadata_url, video_id) - metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8')) - - # extract values from metadata - url_flv_el = metadata.find('url_flv') - if url_flv_el is None: - raise ExtractorError(u'Unable to extract download url') - video_url = url_flv_el.text - extension = os.path.splitext(video_url)[1][1:] - title_el = metadata.find('title') - if title_el is None: - raise ExtractorError(u'Unable to extract title') - title = title_el.text - format_id_el = metadata.find('format_id') - if format_id_el is None: - format = ext - else: - format = format_id_el.text - description_el = metadata.find('description') - if description_el is not None: - description = description_el.text - else: - description = None - imagePreview_el = metadata.find('imagePreview') - if imagePreview_el is not None: - thumbnail = imagePreview_el.text - else: - thumbnail = None - info = { - 'id': video_id, - 'url': video_url, - 'title': title, - 'ext': extension, - 'format': format, - 'thumbnail': thumbnail, - 'description': description - } - return [info] +from .utils import * +from .extractor.common import InfoExtractor, SearchInfoExtractor -class SpiegelIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$' +from .extractor.ard import ARDIE +from .extractor.arte import ArteTvIE +from .extractor.bandcamp import BandcampIE +from .extractor.bliptv import BlipTVIE, BlipTVUserIE +from .extractor.comedycentral import ComedyCentralIE +from .extractor.collegehumor import CollegeHumorIE +from .extractor.dailymotion import DailymotionIE +from .extractor.depositfiles import DepositFilesIE +from .extractor.eighttracks import EightTracksIE +from .extractor.escapist import EscapistIE +from .extractor.facebook import FacebookIE +from .extractor.funnyordie import FunnyOrDieIE +from .extractor.gametrailers import GametrailersIE +from .extractor.generic import GenericIE +from .extractor.googleplus import GooglePlusIE +from .extractor.googlesearch import GoogleSearchIE +from .extractor.infoq import InfoQIE +from .extractor.justintv import JustinTVIE +from .extractor.keek import KeekIE +from .extractor.liveleak import LiveLeakIE +from .extractor.metacafe import MetacafeIE +from .extractor.mixcloud import MixcloudIE +from .extractor.mtv import MTVIE +from .extractor.myspass import MySpassIE +from .extractor.myvideo import MyVideoIE +from .extractor.nba import NBAIE +from .extractor.statigram import StatigramIE +from .extractor.photobucket import PhotobucketIE +from .extractor.pornotube import PornotubeIE +from .extractor.rbmaradio import RBMARadioIE +from .extractor.redtube import RedTubeIE +from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE +from .extractor.spiegel import SpiegelIE +from .extractor.stanfordoc import StanfordOpenClassroomIE +from .extractor.steam import SteamIE +from .extractor.ted import TEDIE +from .extractor.tumblr import TumblrIE +from .extractor.ustream import UstreamIE +from .extractor.vbox7 import Vbox7IE +from .extractor.vimeo import VimeoIE +from .extractor.vine import VineIE +from .extractor.worldstarhiphop import WorldStarHipHopIE +from .extractor.xnxx import XNXXIE +from .extractor.xvideos import XVideosIE +from .extractor.yahoo import YahooIE, YahooSearchIE +from .extractor.youjizz import YouJizzIE +from .extractor.youku import YoukuIE +from .extractor.youporn import YouPornIE +from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE +from .extractor.zdf import ZDFIE - def _real_extract(self, url): - m = re.match(self._VALID_URL, url) - video_id = m.group('videoID') - webpage = self._download_webpage(url, video_id) - video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>', - webpage, u'title') - xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml' - xml_code = self._download_webpage(xml_url, video_id, - note=u'Downloading XML', errnote=u'Failed to download XML') - idoc = xml.etree.ElementTree.fromstring(xml_code) - last_type = idoc[-1] - filename = last_type.findall('./filename')[0].text - duration = float(last_type.findall('./duration')[0].text) - video_url = 'http://video2.spiegel.de/flash/' + filename - video_ext = filename.rpartition('.')[2] - info = { - 'id': video_id, - 'url': video_url, - 'ext': video_ext, - 'title': video_title, - 'duration': duration, - } - return [info] -class LiveLeakIE(InfoExtractor): - _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)' - IE_NAME = u'liveleak' - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - video_id = mobj.group('video_id') - webpage = self._download_webpage(url, video_id) - video_url = self._search_regex(r'file: "(.*?)",', - webpage, u'video URL') - video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"', - webpage, u'title').replace('LiveLeak.com -', '').strip() - video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"', - webpage, u'description', fatal=False) - video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>', - webpage, u'uploader', fatal=False) - info = { - 'id': video_id, - 'url': video_url, - 'ext': 'mp4', - 'title': video_title, - 'description': video_description, - 'uploader': video_uploader - } - return [info] -class ARDIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?' - _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>' - _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)' - def _real_extract(self, url): - # determine video id from url - m = re.match(self._VALID_URL, url) - numid = re.search(r'documentId=([0-9]+)', url) - if numid: - video_id = numid.group(1) - else: - video_id = m.group('video_id') - - # determine title and media streams from webpage - html = self._download_webpage(url, video_id) - title = re.search(self._TITLE, html).group('title') - streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)] - if not streams: - assert '"fsk"' in html - raise ExtractorError(u'This video is only available after 8:00 pm') - - # choose default media type and highest quality for now - stream = max([s for s in streams if int(s["media_type"]) == 0], - key=lambda s: int(s["quality"])) - - # there's two possibilities: RTMP stream or HTTP download - info = {'id': video_id, 'title': title, 'ext': 'mp4'} - if stream['rtmp_url']: - self.to_screen(u'RTMP download detected') - assert stream['video_url'].startswith('mp4:') - info["url"] = stream["rtmp_url"] - info["play_path"] = stream['video_url'] - else: - assert stream["video_url"].endswith('.mp4') - info["url"] = stream["video_url"] - return [info] -class ZDFIE(InfoExtractor): - _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?' - _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>' - _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>' - _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"' - _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)' - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - video_id = mobj.group('video_id') - - html = self._download_webpage(url, video_id) - streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)] - if streams is None: - raise ExtractorError(u'No media url found.') - - # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url - # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url - # choose first/default media type and highest quality for now - for s in streams: #find 300 - dsl1000mbit - if s['quality'] == '300' and s['media_type'] == 'wstreaming': - stream_=s - break - for s in streams: #find veryhigh - dsl2000mbit - if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working - stream_=s - break - if stream_ is None: - raise ExtractorError(u'No stream found.') - - media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL') - self.report_extraction(video_id) - mobj = re.search(self._TITLE, html) - if mobj is None: - raise ExtractorError(u'Cannot extract title') - title = unescapeHTML(mobj.group('title')) - mobj = re.search(self._MMS_STREAM, media_link) - if mobj is None: - mobj = re.search(self._RTSP_STREAM, media_link) - if mobj is None: - raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL') - mms_url = mobj.group('video_url') - mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url) - if mobj is None: - raise ExtractorError(u'Cannot extract extention') - ext = mobj.group('ext') - return [{'id': video_id, - 'url': mms_url, - 'title': title, - 'ext': ext - }] -class TumblrIE(InfoExtractor): - _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)' - def _real_extract(self, url): - m_url = re.match(self._VALID_URL, url) - video_id = m_url.group('id') - blog = m_url.group('blog_name') - - url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id) - webpage = self._download_webpage(url, video_id) - - re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id) - video = re.search(re_video, webpage) - if video is None: - raise ExtractorError(u'Unable to extract video') - video_url = video.group('video_url') - ext = video.group('ext') - - video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22', - webpage, u'thumbnail', fatal=False) # We pick the first poster - if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '') - - # The only place where you can get a title, it's not complete, - # but searching in other places doesn't work for all videos - video_title = self._html_search_regex(r'<title>(?P<title>.*?)', - webpage, u'title', flags=re.DOTALL) - - return [{'id': video_id, - 'url': video_url, - 'title': video_title, - 'thumbnail': video_thumbnail, - 'ext': ext - }] - -class BandcampIE(InfoExtractor): - _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P.*)' - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - title = mobj.group('title') - webpage = self._download_webpage(url, title) - # We get the link to the free download page - m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage) - if m_download is None: - raise ExtractorError(u'No free songs found') - - download_link = m_download.group(1) - id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$', - webpage, re.MULTILINE|re.DOTALL).group('id') - - download_webpage = self._download_webpage(download_link, id, - 'Downloading free downloads page') - # We get the dictionary of the track from some javascrip code - info = re.search(r'items: (.*?),$', - download_webpage, re.MULTILINE).group(1) - info = json.loads(info)[0] - # We pick mp3-320 for now, until format selection can be easily implemented. - mp3_info = info[u'downloads'][u'mp3-320'] - # If we try to use this url it says the link has expired - initial_url = mp3_info[u'url'] - re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$' - m_url = re.match(re_url, initial_url) - #We build the url we will use to get the final track url - # This url is build in Bandcamp in the script download_bunde_*.js - request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts')) - final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url') - # If we could correctly generate the .rand field the url would be - #in the "download_url" key - final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1) - - track_info = {'id':id, - 'title' : info[u'title'], - 'ext' : 'mp3', - 'url' : final_url, - 'thumbnail' : info[u'thumb_url'], - 'uploader' : info[u'artist'] - } - - return [track_info] - -class RedTubeIE(InfoExtractor): - """Information Extractor for redtube""" - _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)' - def _real_extract(self,url): - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - video_id = mobj.group('id') - video_extension = 'mp4' - webpage = self._download_webpage(url, video_id) - self.report_extraction(video_id) - video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">', - webpage, u'video URL') - video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>', - webpage, u'title') - return [{ - 'id': video_id, - 'url': video_url, - 'ext': video_extension, - 'title': video_title, - }] - class InaIE(InfoExtractor): """Information Extractor for Ina.fr""" _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*' @@ -3310,39 +164,6 @@ class HowcastIE(InfoExtractor): 'thumbnail': thumbnail, }] -class VineIE(InfoExtractor): - """Information Extractor for Vine.co""" - _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - - video_id = mobj.group('id') - webpage_url = 'https://vine.co/v/' + video_id - webpage = self._download_webpage(webpage_url, video_id) - - self.report_extraction(video_id) - - video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"', - webpage, u'video URL') - - video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"', - webpage, u'title') - - thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"', - webpage, u'thumbnail', fatal=False) - - uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>', - webpage, u'uploader', fatal=False, flags=re.DOTALL) - - return [{ - 'id': video_id, - 'url': video_url, - 'ext': 'mp4', - 'title': video_title, - 'thumbnail': thumbnail, - 'uploader': uploader, - }] class FlickrIE(InfoExtractor): """Information Extractor for Flickr videos""" @@ -3534,123 +355,7 @@ class HypemIE(InfoExtractor): 'artist': artist, }] -class Vbox7IE(InfoExtractor): - """Information Extractor for Vbox7""" - _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)' - - def _real_extract(self,url): - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - video_id = mobj.group(1) - - redirect_page, urlh = self._download_webpage_handle(url, video_id) - new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location') - redirect_url = urlh.geturl() + new_location - webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page') - - title = self._html_search_regex(r'<title>(.*)', - webpage, u'title').split('/')[0].strip() - - ext = "flv" - info_url = "http://vbox7.com/play/magare.do" - data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id}) - info_request = compat_urllib_request.Request(info_url, data) - info_request.add_header('Content-Type', 'application/x-www-form-urlencoded') - info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage') - if info_response is None: - raise ExtractorError(u'Unable to extract the media url') - (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&')) - - return [{ - 'id': video_id, - 'url': final_url, - 'ext': ext, - 'title': title, - 'thumbnail': thumbnail_url, - }] - -class GametrailersIE(InfoExtractor): - _VALID_URL = r'http://www.gametrailers.com/(?Pvideos|reviews|full-episodes)/(?P.*?)/(?P.*)' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - video_id = mobj.group('id') - video_type = mobj.group('type') - webpage = self._download_webpage(url, video_id) - if video_type == 'full-episodes': - mgid_re = r'data-video="(?P<mgid>mgid:.*?)"' - else: - mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\'' - mgid = self._search_regex(mgid_re, webpage, u'mgid') - data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'}) - - info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data, - video_id, u'Downloading video info') - links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data, - video_id, u'Downloading video urls info') - - self.report_extraction(video_id) - info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]>.* - .*?)\]\]>.* - .* - (?P.*?).* - ''' - - m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL) - if m_info is None: - raise ExtractorError(u'Unable to extract video info') - video_title = m_info.group('title') - video_description = m_info.group('description') - video_thumb = m_info.group('thumb') - - m_urls = list(re.finditer(r'(?P.*)', links_webpage)) - if m_urls is None or len(m_urls) == 0: - raise ExtractError(u'Unable to extrat video url') - # They are sorted from worst to best quality - video_url = m_urls[-1].group('url') - - return {'url': video_url, - 'id': video_id, - 'title': video_title, - # Videos are actually flv not mp4 - 'ext': 'flv', - 'thumbnail': video_thumb, - 'description': video_description, - } - -class StatigramIE(InfoExtractor): - _VALID_URL = r'(?:http://)?(?:www\.)?statigr\.am/p/([^/]+)' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - - video_id = mobj.group(1) - webpage = self._download_webpage(url, video_id) - video_url = self._html_search_regex( - r'', - webpage, u'video URL') - thumbnail_url = self._html_search_regex( - r'', - webpage, u'thumbnail URL', fatal=False) - html_title = self._html_search_regex( - r'(.+?)', - webpage, u'title') - title = html_title.rpartition(u' | Statigram')[0] - uploader_id = self._html_search_regex( - r'@([^ ]+)', title, u'uploader name', fatal=False) - ext = 'mp4' - return [{ - 'id': video_id, - 'url': video_url, - 'ext': ext, - 'title': title, - 'thumbnail': thumbnail_url, - 'uploader_id' : uploader_id - }] def gen_extractors(): """ Return a list of an instance of every supported extractor.