from .utils import *
-class InfoExtractor(object):
- """Information Extractor class.
-
- Information extractors are the classes that, given a URL, extract
- information about the video (or videos) the URL refers to. This
- information includes the real video URL, the video title, author and
- others. The information is stored in a dictionary which is then
- passed to the FileDownloader. The FileDownloader processes this
- information possibly downloading the video to the file system, among
- other possible outcomes.
-
- The dictionaries must include the following fields:
-
- id: Video identifier.
- url: Final video URL.
- title: Video title, unescaped.
- ext: Video filename extension.
-
- The following fields are optional:
-
- format: The video format, defaults to ext (used for --get-format)
- thumbnail: Full URL to a video thumbnail image.
- description: One-line video description.
- uploader: Full name of the video uploader.
- upload_date: Video upload date (YYYYMMDD).
- uploader_id: Nickname or id of the video uploader.
- location: Physical location of the video.
- player_url: SWF Player URL (used for rtmpdump).
- subtitles: The subtitle file contents.
- urlhandle: [internal] The urlHandle to be used to download the file,
- like returned by urllib.request.urlopen
-
- The fields should all be Unicode strings.
-
- Subclasses of this one should re-define the _real_initialize() and
- _real_extract() methods and define a _VALID_URL regexp.
- Probably, they should also be added to the list of extractors.
-
- _real_extract() must return a *list* of information dictionaries as
- described above.
-
- Finally, the _WORKING attribute should be set to False for broken IEs
- in order to warn the users and skip the tests.
- """
-
- _ready = False
- _downloader = None
- _WORKING = True
-
- def __init__(self, downloader=None):
- """Constructor. Receives an optional downloader."""
- self._ready = False
- self.set_downloader(downloader)
-
- @classmethod
- def suitable(cls, url):
- """Receives a URL and returns True if suitable for this IE."""
- return re.match(cls._VALID_URL, url) is not None
-
- @classmethod
- def working(cls):
- """Getter method for _WORKING."""
- return cls._WORKING
-
- def initialize(self):
- """Initializes an instance (authentication, etc)."""
- if not self._ready:
- self._real_initialize()
- self._ready = True
-
- def extract(self, url):
- """Extracts URL information and returns it in list of dicts."""
- self.initialize()
- return self._real_extract(url)
-
- def set_downloader(self, downloader):
- """Sets the downloader for this IE."""
- self._downloader = downloader
-
- def _real_initialize(self):
- """Real initialization process. Redefine in subclasses."""
- pass
-
- def _real_extract(self, url):
- """Real extraction process. Redefine in subclasses."""
- pass
-
- @property
- def IE_NAME(self):
- return type(self).__name__[:-2]
-
- def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
- """ Returns the response handle """
- if note is None:
- self.report_download_webpage(video_id)
- elif note is not False:
- self.to_screen(u'%s: %s' % (video_id, note))
- try:
- return compat_urllib_request.urlopen(url_or_request)
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- if errnote is None:
- errnote = u'Unable to download webpage'
- raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
-
- def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
- """ Returns a tuple (page content as string, URL handle) """
- urlh = self._request_webpage(url_or_request, video_id, note, errnote)
- content_type = urlh.headers.get('Content-Type', '')
- m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
- if m:
- encoding = m.group(1)
- else:
- encoding = 'utf-8'
- webpage_bytes = urlh.read()
- if self._downloader.params.get('dump_intermediate_pages', False):
- try:
- url = url_or_request.get_full_url()
- except AttributeError:
- url = url_or_request
- self.to_screen(u'Dumping request to ' + url)
- dump = base64.b64encode(webpage_bytes).decode('ascii')
- self._downloader.to_screen(dump)
- content = webpage_bytes.decode(encoding, 'replace')
- return (content, urlh)
-
- def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
- """ Returns the data of the page as a string """
- return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
-
- def to_screen(self, msg):
- """Print msg to screen, prefixing it with '[ie_name]'"""
- self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
-
- def report_extraction(self, id_or_name):
- """Report information extraction."""
- self.to_screen(u'%s: Extracting information' % id_or_name)
-
- def report_download_webpage(self, video_id):
- """Report webpage download."""
- self.to_screen(u'%s: Downloading webpage' % video_id)
-
- def report_age_confirmation(self):
- """Report attempt to confirm age."""
- self.to_screen(u'Confirming age')
-
- #Methods for following #608
- #They set the correct value of the '_type' key
- def video_result(self, video_info):
- """Returns a video"""
- video_info['_type'] = 'video'
- return video_info
- def url_result(self, url, ie=None):
- """Returns a url that points to a page that should be processed"""
- #TODO: ie should be the class used for getting the info
- video_info = {'_type': 'url',
- 'url': url,
- 'ie_key': ie}
- return video_info
- def playlist_result(self, entries, playlist_id=None, playlist_title=None):
- """Returns a playlist"""
- video_info = {'_type': 'playlist',
- 'entries': entries}
- if playlist_id:
- video_info['id'] = playlist_id
- if playlist_title:
- video_info['title'] = playlist_title
- return video_info
-
- def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
- """
- Perform a regex search on the given string, using a single or a list of
- patterns returning the first matching group.
- In case of failure return a default value or raise a WARNING or a
- ExtractorError, depending on fatal, specifying the field name.
- """
- if isinstance(pattern, (str, compat_str, compiled_regex_type)):
- mobj = re.search(pattern, string, flags)
- else:
- for p in pattern:
- mobj = re.search(p, string, flags)
- if mobj: break
-
- if sys.stderr.isatty() and os.name != 'nt':
- _name = u'\033[0;34m%s\033[0m' % name
- else:
- _name = name
-
- if mobj:
- # return the first matching group
- return next(g for g in mobj.groups() if g is not None)
- elif default is not None:
- return default
- elif fatal:
- raise ExtractorError(u'Unable to extract %s' % _name)
- else:
- self._downloader.report_warning(u'unable to extract %s; '
- u'please report this issue on GitHub.' % _name)
- return None
-
-class SearchInfoExtractor(InfoExtractor):
- """
- Base class for paged search queries extractors.
- They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
- Instances should define _SEARCH_KEY and _MAX_RESULTS.
- """
-
- @classmethod
- def _make_valid_url(cls):
- return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
-
- @classmethod
- def suitable(cls, url):
- return re.match(cls._make_valid_url(), url) is not None
-
- def _real_extract(self, query):
- mobj = re.match(self._make_valid_url(), query)
- if mobj is None:
- raise ExtractorError(u'Invalid search query "%s"' % query)
-
- prefix = mobj.group('prefix')
- query = mobj.group('query')
- if prefix == '':
- return self._get_n_results(query, 1)
- elif prefix == 'all':
- return self._get_n_results(query, self._MAX_RESULTS)
- else:
- n = int(prefix)
- if n <= 0:
- raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
- elif n > self._MAX_RESULTS:
- self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
- n = self._MAX_RESULTS
- return self._get_n_results(query, n)
-
- def _get_n_results(self, query, n):
- """Get a specified number of results for a query"""
- raise NotImplementedError("This method must be implemented by sublclasses")
+from .extractor.common import InfoExtractor, SearchInfoExtractor
class YoutubeIE(InfoExtractor):
"""Indicate the download will use the RTMP protocol."""
self.to_screen(u'RTMP download detected')
+ @staticmethod
+ def _decrypt_signature(s):
+ """Decrypt the key the two subkeys must have a length of 43"""
+ (a,b) = s.split('.')
+ if len(a) != 43 or len(b) != 43:
+ raise ExtractorError(u'Unable to decrypt signature, subkeys lengths not valid')
+ b = ''.join([b[:8],a[0],b[9:18],b[-4],b[19:39], b[18]])[0:40]
+ a = a[-40:]
+ s_dec = '.'.join((a,b))[::-1]
+ return s_dec
+
def _get_available_subtitles(self, video_id):
self.report_video_subtitles_download(video_id)
request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
def _request_automatic_caption(self, video_id, webpage):
"""We need the webpage for getting the captions url, pass it as an
argument to speed up the process."""
- sub_lang = self._downloader.params.get('subtitleslang')
+ sub_lang = self._downloader.params.get('subtitleslang') or 'en'
sub_format = self._downloader.params.get('subtitlesformat')
self.to_screen(u'%s: Looking for automatic captions' % video_id)
mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
pass
else:
# We report the original error
- self._downloader.report_error(sub_error)
+ self._downloader.report_warning(sub_error)
if self._downloader.params.get('allsubtitles', False):
video_subtitles = self._extract_all_subtitles(video_id)
for video_subtitle in video_subtitles:
(sub_error, sub_lang, sub) = video_subtitle
if sub_error:
- self._downloader.report_error(sub_error)
+ self._downloader.report_warning(sub_error)
if self._downloader.params.get('listsubtitles', False):
sub_lang_list = self._list_available_subtitles(video_id)
# Decide which formats to download
req_format = self._downloader.params.get('format', None)
+ try:
+ mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
+ info = json.loads(mobj.group(1))
+ args = info['args']
+ if args.get('ptk','') == 'vevo' or 'dashmpd':
+ # Vevo videos with encrypted signatures
+ self.to_screen(u'%s: Vevo video detected.' % video_id)
+ video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
+ except ValueError:
+ pass
+
if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
self.report_rtmp_download()
video_url_list = [(None, video_info['conn'][0])]
for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
url_data = compat_parse_qs(url_data_str)
if 'itag' in url_data and 'url' in url_data:
- url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
- if not 'ratebypass' in url: url += '&ratebypass=yes'
+ url = url_data['url'][0]
+ if 'sig' in url_data:
+ url += '&signature=' + url_data['sig'][0]
+ elif 's' in url_data:
+ signature = self._decrypt_signature(url_data['s'][0])
+ url += '&signature=' + signature
+ if 'ratebypass' not in url:
+ url += '&ratebypass=yes'
url_map[url_data['itag'][0]] = url
format_limit = self._downloader.params.get('format_limit', None)
video_title = unescapeHTML(mobj.group('title'))
video_uploader = None
- mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
- if mobj is None:
- # lookin for official user
- mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
- if mobj_official is None:
- self._downloader.report_warning(u'unable to extract uploader nickname')
- else:
- video_uploader = mobj_official.group(1)
- else:
- video_uploader = mobj.group(1)
+ video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
+ # Looking for official user
+ r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
+ webpage, 'video uploader')
video_upload_date = None
mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
_VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
IE_NAME = u'vimeo'
+ def _verify_video_password(self, url, video_id, webpage):
+ password = self._downloader.params.get('password', None)
+ if password is None:
+ raise ExtractorError(u'This video is protected by a password, use the --password option')
+ token = re.search(r'xsrft: \'(.*?)\'', webpage).group(1)
+ data = compat_urllib_parse.urlencode({'password': password,
+ 'token': token})
+ # I didn't manage to use the password with https
+ if url.startswith('https'):
+ pass_url = url.replace('https','http')
+ else:
+ pass_url = url
+ password_request = compat_urllib_request.Request(pass_url+'/password', data)
+ password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+ password_request.add_header('Cookie', 'xsrft=%s' % token)
+ pass_web = self._download_webpage(password_request, video_id,
+ u'Verifying the password',
+ u'Wrong password')
+
def _real_extract(self, url, new_video=True):
# Extract ID from URL
mobj = re.match(self._VALID_URL, url)
except:
if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
+
+ if re.search('If so please provide the correct password.', webpage):
+ self._verify_video_password(url, video_id, webpage)
+ return self._real_extract(url)
else:
raise ExtractorError(u'Unable to extract info section')
if mobj is None:
# Broaden the search a little bit: JWPlayer JS loader
mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
+ if mobj is None:
+ # Try to find twitter cards info
+ mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
+ if mobj is None:
+ # We look for Open Graph info:
+ # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
+ m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
+ # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
+ if m_video_type is not None:
+ mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
if mobj is None:
raise ExtractorError(u'Invalid URL: %s' % url)
# Site Name | Video Title
# Video Title - Tagline | Site Name
# and so on and so forth; it's just not practical
- mobj = re.search(r'<title>(.*)</title>', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract title')
- video_title = mobj.group(1)
+ video_title = self._html_search_regex(r'<title>(.*)</title>',
+ webpage, u'video title')
# video uploader is domain name
- mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
- if mobj is None:
- raise ExtractorError(u'Unable to extract title')
- video_uploader = mobj.group(1)
+ video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
+ url, u'video uploader')
return [{
'id': video_id,
def report_download_page(self, query, pagenum):
"""Report attempt to download search page with given number."""
- query = query.decode(preferredencoding())
self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
def _get_n_results(self, query, n):
|
((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
)"""
- _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
+ _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
_MAX_RESULTS = 50
IE_NAME = u'youtube:playlist'
# Number of videos is a multiple of self._MAX_RESULTS
break
- videos += [ (entry['yt$position']['$t'], entry['content']['src'])
- for entry in response['feed']['entry']
- if 'content' in entry ]
+ for entry in response['feed']['entry']:
+ index = entry['yt$position']['$t']
+ if 'media$group' in entry and 'media$player' in entry['media$group']:
+ videos.append((index, entry['media$group']['media$player']['url']))
if len(response['feed']['entry']) < self._MAX_RESULTS:
break
video_duration = int(video_data['video_duration'])
thumbnail = video_data['thumbnail_src']
- video_title = self._search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
+ video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
webpage, u'title')
- video_title = unescapeHTML(video_title)
info = {
'id': video_id,
self.report_extraction(video_id)
video_url = mobj.group(1) + '.flv'
- video_title = self._search_regex('<title>([^<]+)</title>',
+ video_title = self._html_search_regex('<title>([^<]+)</title>',
webpage, u'title')
video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
video_swfobj = compat_urllib_parse.unquote(video_swfobj)
- video_title = self._search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
+ video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
webpage, u'title')
return [{
showName = mobj.group('showname')
videoId = mobj.group('episode')
- self.report_extraction(showName)
- webpage = self._download_webpage(url, showName)
+ self.report_extraction(videoId)
+ webpage = self._download_webpage(url, videoId)
- videoDesc = self._search_regex('<meta name="description" content="([^"]*)"',
+ videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
webpage, u'description', fatal=False)
- if videoDesc: videoDesc = unescapeHTML(videoDesc)
- imgUrl = self._search_regex('<meta property="og:image" content="([^"]*)"',
+ imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
webpage, u'thumbnail', fatal=False)
- if imgUrl: imgUrl = unescapeHTML(imgUrl)
- playerUrl = self._search_regex('<meta property="og:video" content="([^"]*)"',
+ playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
webpage, u'player url')
- playerUrl = unescapeHTML(playerUrl)
+
+ title = self._html_search_regex('<meta name="title" content="([^"]*)"',
+ webpage, u'player url').split(' : ')[-1]
configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
configUrl = compat_urllib_parse.unquote(configUrl)
- configJSON = self._download_webpage(configUrl, showName,
+ configJSON = self._download_webpage(configUrl, videoId,
u'Downloading configuration',
u'unable to download configuration')
'url': videoUrl,
'uploader': showName,
'upload_date': None,
- 'title': showName,
+ 'title': title,
'ext': 'mp4',
'thumbnail': imgUrl,
'description': videoDesc,
webpage, u'video URL'))
# Extract title
- video_title = self._search_regex(r'<title>(.*?)\s+-\s+XVID',
+ video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
webpage, u'title')
# Extract video thumbnail
webpage, u'title')
# Extract description
- video_description = self._search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
+ video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
webpage, u'description', fatal=False)
video_filename = video_url.split('/')[-1]
note='Downloading course info page',
errnote='Unable to download course info page')
- info['title'] = self._search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
- info['title'] = unescapeHTML(info['title'])
+ info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
- info['description'] = self._search_regex('<description>([^<]+)</description>',
+ info['description'] = self._html_search_regex('<description>([^<]+)</description>',
coursepage, u'description', fatal=False)
- if info['description']: info['description'] = unescapeHTML(info['description'])
links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
info['list'] = [
webpage = self._download_webpage(url, video_id)
- song_name = self._search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
+ song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
webpage, u'song name', fatal=False)
- if song_name: song_name = unescapeHTML(song_name)
- video_title = self._search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
+ video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
webpage, u'title')
- video_title = unescapeHTML(video_title)
- mtvn_uri = self._search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
+ mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
webpage, u'mtvn_uri', fatal=False)
content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
webpage, u'video URL')
video_url = compat_urllib_parse.unquote(video_url)
- video_title = self._search_regex(self.VIDEO_TITLE_RE,
+ video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
webpage, u'title')
video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
self.report_extraction(video_id)
# Extract update date
- upload_date = self._search_regex('title="Timestamp">(.*?)</a>',
+ upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
webpage, u'upload date', fatal=False)
if upload_date:
# Convert timestring to a format suitable for filename
upload_date = upload_date.strftime('%Y%m%d')
# Extract uploader
- uploader = self._search_regex(r'rel\="author".*?>(.*?)</a>',
+ uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
webpage, u'uploader', fatal=False)
# Extract title
# Get the first line for title
- video_title = self._search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
+ video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
webpage, 'title', default=u'NA')
# Step 2, Stimulate clicking the image box to launch video
video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
shortened_video_id = video_id.rpartition('/')[2]
- title = self._search_regex(r'<meta property="og:title" content="(.*?)"',
+ title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
# It isn't there in the HTML it returns to us
- # uploader_date = self._search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
+ # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
- description = self._search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
+ description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
info = {
'id': shortened_video_id,
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
- video_url = self._search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
+ video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
webpage, u'video URL', flags=re.DOTALL)
- video_url = unescapeHTML(video_url)
- title = self._search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
+ title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
- title = clean_html(title)
- video_description = self._search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
+ video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
webpage, u'description', fatal=False, flags=re.DOTALL)
- if video_description: video_description = unescapeHTML(video_description)
info = {
'id': video_id,
(?P<gameID>\d+)/?
(?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
"""
+ _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
+ _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
@classmethod
def suitable(cls, url):
def _real_extract(self, url):
m = re.match(self._VALID_URL, url, re.VERBOSE)
gameID = m.group('gameID')
- videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
- self.report_age_confirmation()
+
+ videourl = self._VIDEO_PAGE_TEMPLATE % gameID
webpage = self._download_webpage(videourl, gameID)
- game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
-
+
+ if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
+ videourl = self._AGECHECK_TEMPLATE % gameID
+ self.report_age_confirmation()
+ webpage = self._download_webpage(videourl, gameID)
+
+ self.report_extraction(gameID)
+ game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
+ webpage, 'game title')
+
urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
mweb = re.finditer(urlRE, webpage)
namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
self.report_extraction(video_id)
- video_title = self._search_regex(r'data-title="(?P<title>.+)"',
+ video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
webpage, u'title')
- uploader = self._search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
+ uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
webpage, u'uploader', fatal=False, flags=re.DOTALL)
- if uploader: uploader = unescapeHTML(uploader.strip())
- thumbnail = self._search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
+ thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
webpage, u'thumbnail', fatal=False)
info = {
else:
ext = 'flv'
- video_title = self._search_regex(r"<title>(.*)</title>",
+ video_title = self._html_search_regex(r"<title>(.*)</title>",
webpage_src, u'title')
# Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
- thumbnail = self._search_regex(r'rel="image_src" href="(.*)" />',
+ thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
webpage_src, u'thumbnail', fatal=False)
if not thumbnail:
webpage = self._download_webpage(url, video_id)
- json_data = self._search_regex(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>',
- webpage, u'json data')
+ json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
+ webpage, u'json data', flags=re.MULTILINE)
try:
data = json.loads(json_data)
size = format[0]
bitrate = format[1]
format = "-".join( format )
- title = u'%s-%s-%s' % (video_title, size, bitrate)
+ # title = u'%s-%s-%s' % (video_title, size, bitrate)
formats.append({
'id': video_id,
'url': video_url,
'uploader': video_uploader,
'upload_date': upload_date,
- 'title': title,
+ 'title': video_title,
'ext': extension,
'format': format,
'thumbnail': thumbnail,
#Get the uploaded date
VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
- upload_date = self._search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
+ upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
if upload_date: upload_date = unified_strdate(upload_date)
info = {'id': video_id,
webpage = self._download_webpage(url, video_id)
# Get the video title
- video_title = self._search_regex(r'<title>(?P<title>.*)</title>',
+ video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
webpage, u'title').strip()
# Get the embed page
thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
webpage = self._download_webpage(url, video_id)
- video_title = self._search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
+ video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
webpage, u'title')
- video_title = unescapeHTML(video_title)
- uploader = self._search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
+ uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
webpage, u'uploader', fatal=False)
- if uploader: uploader = clean_html(uploader)
info = {
'id': video_id,
self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
return [self._playlist_videos_info(url,name,playlist_id)]
- def _talk_video_link(self,mediaSlug):
- '''Returns the video link for that mediaSlug'''
- return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
-
def _playlist_videos_info(self,url,name,playlist_id=0):
'''Returns the videos of the playlist'''
video_RE=r'''
m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
m_names=re.finditer(video_name_RE,webpage)
- playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
- m_playlist = re.search(playlist_RE, webpage)
- playlist_title = m_playlist.group('playlist_title')
+ playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
+ webpage, 'playlist title')
playlist_entries = []
for m_video, m_name in zip(m_videos,m_names):
def _talk_info(self, url, video_id=0):
"""Return the video for the talk in the url"""
- m=re.match(self._VALID_URL, url,re.VERBOSE)
- videoName=m.group('name')
- webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
+ m = re.match(self._VALID_URL, url,re.VERBOSE)
+ video_name = m.group('name')
+ webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
+ self.report_extraction(video_name)
# If the url includes the language we get the title translated
- title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
- title=re.search(title_RE, webpage).group('title')
- info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
- "id":(?P<videoID>[\d]+).*?
- "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
- thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
- thumb_match=re.search(thumb_RE,webpage)
- info_match=re.search(info_RE,webpage,re.VERBOSE)
- video_id=info_match.group('videoID')
- mediaSlug=info_match.group('mediaSlug')
- video_url=self._talk_video_link(mediaSlug)
+ title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
+ webpage, 'title')
+ json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
+ webpage, 'json data')
+ info = json.loads(json_data)
+ desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
+ webpage, 'description', flags = re.DOTALL)
+
+ thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
+ webpage, 'thumbnail')
info = {
- 'id': video_id,
- 'url': video_url,
+ 'id': info['id'],
+ 'url': info['htmlStreams'][-1]['file'],
'ext': 'mp4',
'title': title,
- 'thumbnail': thumb_match.group('thumbnail')
+ 'thumbnail': thumbnail,
+ 'description': desc,
}
return info
webpage = self._download_webpage(url, video_id)
- video_title = self._search_regex(r'<div class="module-title">(.*?)</div>',
+ video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
webpage, u'title')
- video_title = unescapeHTML(video_title)
xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
xml_code = self._download_webpage(xml_url, video_id,
video_url = self._search_regex(r'file: "(.*?)",',
webpage, u'video URL')
- video_title = self._search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
- webpage, u'title')
- video_title = unescapeHTML(video_title).replace('LiveLeak.com -', '').strip()
+ video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
+ webpage, u'title').replace('LiveLeak.com -', '').strip()
- video_description = self._search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
+ video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
webpage, u'description', fatal=False)
- if video_description: video_description = unescapeHTML(video_description)
- video_uploader = self._search_regex(r'By:.*?(\w+)</a>',
+ video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
webpage, u'uploader', fatal=False)
info = {
info["url"] = stream["video_url"]
return [info]
+class ZDFIE(InfoExtractor):
+ _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
+ _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
+ _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
+ _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
+ _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ if mobj is None:
+ raise ExtractorError(u'Invalid URL: %s' % url)
+ video_id = mobj.group('video_id')
+
+ html = self._download_webpage(url, video_id)
+ streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
+ if streams is None:
+ raise ExtractorError(u'No media url found.')
+
+ # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
+ # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
+ # choose first/default media type and highest quality for now
+ for s in streams: #find 300 - dsl1000mbit
+ if s['quality'] == '300' and s['media_type'] == 'wstreaming':
+ stream_=s
+ break
+ for s in streams: #find veryhigh - dsl2000mbit
+ if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
+ stream_=s
+ break
+ if stream_ is None:
+ raise ExtractorError(u'No stream found.')
+
+ media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')
+
+ self.report_extraction(video_id)
+ mobj = re.search(self._TITLE, html)
+ if mobj is None:
+ raise ExtractorError(u'Cannot extract title')
+ title = unescapeHTML(mobj.group('title'))
+
+ mobj = re.search(self._MMS_STREAM, media_link)
+ if mobj is None:
+ mobj = re.search(self._RTSP_STREAM, media_link)
+ if mobj is None:
+ raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
+ mms_url = mobj.group('video_url')
+
+ mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
+ if mobj is None:
+ raise ExtractorError(u'Cannot extract extention')
+ ext = mobj.group('ext')
+
+ return [{'id': video_id,
+ 'url': mms_url,
+ 'title': title,
+ 'ext': ext
+ }]
+
class TumblrIE(InfoExtractor):
_VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
# The only place where you can get a title, it's not complete,
# but searching in other places doesn't work for all videos
- video_title = self._search_regex(r'<title>(?P<title>.*?)</title>',
+ video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
webpage, u'title', flags=re.DOTALL)
- video_title = unescapeHTML(video_title)
return [{'id': video_id,
'url': video_url,
self.report_extraction(video_id)
- video_url = self._search_regex(r'<source src="(.+?)" type="video/mp4">',
+ video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
webpage, u'video URL')
- video_title = self._search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
+ video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
webpage, u'title')
return [{
self.report_extraction(video_id)
- video_url = self._search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
+ video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
webpage, u'video URL')
video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
webpage, u'video URL')
- video_title = self._search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
+ video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
webpage, u'title')
- video_description = self._search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
+ video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
webpage, u'description', fatal=False)
- thumbnail = self._search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
+ thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
webpage, u'thumbnail', fatal=False)
return [{
self.report_extraction(video_id)
- video_url = self._search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
+ video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
webpage, u'video URL')
- video_title = self._search_regex(r'<meta property="og:title" content="(.+?)"',
+ video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
webpage, u'title')
- thumbnail = self._search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
+ thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
webpage, u'thumbnail', fatal=False)
- uploader = self._search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
+ uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
webpage, u'uploader', fatal=False, flags=re.DOTALL)
return [{
first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
- node_id = self._search_regex(r'<Item id="id">(\d+-\d+)</Item>',
+ node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
first_xml, u'node_id')
second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
raise ExtractorError(u'Unable to extract video url')
video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
- video_title = self._search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
+ video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
webpage, u'video title')
- video_description = self._search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
+ video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
webpage, u'description', fatal=False)
- thumbnail = self._search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
+ thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
webpage, u'thumbnail', fatal=False)
return [{
url_title = mobj.group('url_title')
webpage = self._download_webpage(url, url_title)
- video_id = self._search_regex(r'<article class="video" data-id="(\d+?)"',
+ video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
webpage, u'video id')
self.report_extraction(video_id)
- video_title = self._search_regex(r'<meta property="og:title" content="(.+?)"',
+ video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
webpage, u'title')
- thumbnail = self._search_regex(r'<meta property="og:image" content="(.+?)"',
+ thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
webpage, u'thumbnail', fatal=False)
- video_description = self._search_regex(r'<meta property="og:description" content="(.*?)"',
+ video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
webpage, u'description', fatal=False)
data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
- video_url = self._search_regex(r'<file type="high".*?>(.*?)</file>',
+ video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
data, u'video URL')
return [{
video_url = mobj.group('server')+'/key='+mobj.group('file')
video_extension = video_url.split('.')[-1]
- video_title = self._search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
+ video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
webpage, u'title')
- video_title = unescapeHTML(video_title)
# Can't see the description anywhere in the UI
- # video_description = self._search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
+ # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
# webpage, u'description', fatal=False)
# if video_description: video_description = unescapeHTML(video_description)
video_upload_date = None
self._downloader.report_warning(u'Unable to extract upload date')
- video_uploader_id = self._search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^>]+)',
+ video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
webpage, u'uploader id', default=u'anonymous')
video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
self.report_extraction(track_id)
- html_tracks = self._search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
+ html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
try:
track_list = json.loads(html_tracks)
'artist': artist,
}]
+class Vbox7IE(InfoExtractor):
+ """Information Extractor for Vbox7"""
+ _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
+
+ def _real_extract(self,url):
+ mobj = re.match(self._VALID_URL, url)
+ if mobj is None:
+ raise ExtractorError(u'Invalid URL: %s' % url)
+ video_id = mobj.group(1)
+
+ redirect_page, urlh = self._download_webpage_handle(url, video_id)
+ new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
+ redirect_url = urlh.geturl() + new_location
+ webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
+
+ title = self._html_search_regex(r'<title>(.*)</title>',
+ webpage, u'title').split('/')[0].strip()
+
+ ext = "flv"
+ info_url = "http://vbox7.com/play/magare.do"
+ data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
+ info_request = compat_urllib_request.Request(info_url, data)
+ info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+ info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
+ if info_response is None:
+ raise ExtractorError(u'Unable to extract the media url')
+ (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
+
+ return [{
+ 'id': video_id,
+ 'url': final_url,
+ 'ext': ext,
+ 'title': title,
+ 'thumbnail': thumbnail_url,
+ }]
+
+class GametrailersIE(InfoExtractor):
+ _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ if mobj is None:
+ raise ExtractorError(u'Invalid URL: %s' % url)
+ video_id = mobj.group('id')
+ video_type = mobj.group('type')
+ webpage = self._download_webpage(url, video_id)
+ if video_type == 'full-episodes':
+ mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
+ else:
+ mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
+ mgid = self._search_regex(mgid_re, webpage, u'mgid')
+ data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})
+
+ info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
+ video_id, u'Downloading video info')
+ links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
+ video_id, u'Downloading video urls info')
+
+ self.report_extraction(video_id)
+ info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
+ <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
+ <image>.*
+ <url>(?P<thumb>.*?)</url>.*
+ </image>'''
+
+ m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
+ if m_info is None:
+ raise ExtractorError(u'Unable to extract video info')
+ video_title = m_info.group('title')
+ video_description = m_info.group('description')
+ video_thumb = m_info.group('thumb')
+
+ m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
+ if m_urls is None or len(m_urls) == 0:
+ raise ExtractError(u'Unable to extrat video url')
+ # They are sorted from worst to best quality
+ video_url = m_urls[-1].group('url')
+
+ return {'url': video_url,
+ 'id': video_id,
+ 'title': video_title,
+ # Videos are actually flv not mp4
+ 'ext': 'flv',
+ 'thumbnail': video_thumb,
+ 'description': video_description,
+ }
+
+class StatigramIE(InfoExtractor):
+ _VALID_URL = r'(?:http://)?(?:www\.)?statigr\.am/p/([^/]+)'
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+
+ video_id = mobj.group(1)
+ webpage = self._download_webpage(url, video_id)
+ video_url = self._html_search_regex(
+ r'<meta property="og:video:secure_url" content="(.+?)">',
+ webpage, u'video URL')
+ thumbnail_url = self._html_search_regex(
+ r'<meta property="og:image" content="(.+?)" />',
+ webpage, u'thumbnail URL', fatal=False)
+ html_title = self._html_search_regex(
+ r'<title>(.+?)</title>',
+ webpage, u'title')
+ title = html_title.rpartition(u' | Statigram')[0]
+ uploader_id = self._html_search_regex(
+ r'@([^ ]+)', title, u'uploader name', fatal=False)
+ ext = 'mp4'
+
+ return [{
+ 'id': video_id,
+ 'url': video_url,
+ 'ext': ext,
+ 'title': title,
+ 'thumbnail': thumbnail_url,
+ 'uploader_id' : uploader_id
+ }]
def gen_extractors():
""" Return a list of an instance of every supported extractor.
SpiegelIE(),
LiveLeakIE(),
ARDIE(),
+ ZDFIE(),
TumblrIE(),
BandcampIE(),
RedTubeIE(),
TeamcocoIE(),
XHamsterIE(),
HypemIE(),
+ Vbox7IE(),
+ GametrailersIE(),
+ StatigramIE(),
GenericIE()
]