Move Vimeo into its own file
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19 import hashlib
20 import binascii
21 import urllib
22
23 from .utils import *
24
25
26 from .extractor.common import InfoExtractor, SearchInfoExtractor
27 from .extractor.dailymotion import DailymotionIE
28 from .extractor.metacafe import MetacafeIE
29 from .extractor.statigram import StatigramIE
30 from .extractor.photobucket import PhotobucketIE
31 from .extractor.vimeo import VimeoIE
32 from .extractor.yahoo import YahooIE
33 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeUserIE, YoutubeChannelIE
34
35
36
37
38
39
40
41
42
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Handles two kinds of URLs on videos.arte.tv:
      * live streams (paths matching _LIVE_URL), resolved through the
        videothek javascript to an rtmp URL, and
      * regular "+7" catch-up videos, resolved through two levels of
        intermediate XML documents.
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return the raw page body.

        Raises ExtractorError on network failure or an invalid URL.
        """
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError:
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex* and map match groups to a dict.

        matchTuples is a list of (group_index, key, error_message); each
        listed group must have matched, otherwise ExtractorError is raised
        with the corresponding message.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)

        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        info = {}
        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve a live-stream page to its rtmp URL and return an info dict."""
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            # raw strings: the original relied on '\'' surviving as a literal
            # quote, which emits invalid-escape warnings on newer Pythons
            r'(s_artestras_scst_geoFRDE_' + video_lang + r'.*?)\'.*?' +
                r'(http://.*?\.swf).*?' +
                r'(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))
        # BUG FIX: the original computed video_url and then returned None,
        # so live URLs never produced any result.  Return a real info dict.
        return {
            'id':           info.get('path'),
            'url':          video_url,
            'uploader':     u'arte.tv',
            'upload_date':  None,
            'title':        info.get('path'),
            'ext':          u'flv',
            'format':       u'NA',
            'player_url':   info.get('player'),
        }

    def extractPlus7Stream(self, url):
        """Resolve a "+7" catch-up page through its XML documents to an info dict."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                r'<name>(.*?)</name>.*?' +
                r'<dateVideo>(.*?)</dateVideo>.*?' +
                r'<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  unified_strdate(info.get('date')),
            # NOTE(review): .decode assumes the page was fetched as bytes
            # (Python 2 str); this whole extractor is py2-only as written.
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # BUG FIX: the original discarded the live-stream result and
            # returned None here; propagate the info dict instead.
            info = self.extractLiveStream(url)
        else:
            info = self.extractPlus7Stream(url)

        return [info]
162
163
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Used when no specific extractor matches: first follows URL-shortener
    redirects via a HEAD request, then scrapes the page for a direct video
    URL using a sequence of increasingly permissive regex heuristics.
    """

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn the user (except in test mode) that we are guessing.
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # Request subclass that issues HEAD instead of GET, so we can
            # discover redirects without downloading page bodies.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Re-issue the redirect target as a HEAD request, dropping
                    # body-related headers which no longer apply.
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # Plain Request => GET; re-open through the same opener chain.
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        # No redirect happened: report False so the caller scrapes `url` itself.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        """Extract a direct video URL from an arbitrary page.

        Returns a one-element list with the info dict, or delegates via
        url_result() when the URL is a redirect.  Raises ExtractorError
        when no video URL can be found.
        """
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # The following heuristics are tried strictly in order, from most
        # to least specific.
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            # Try to find twitter cards info
            mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
        if mobj is None:
            # We look for Open Graph info:
            # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
            m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
            # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
            if m_video_type is not None:
                mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        video_title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'video title')

        # video uploader is domain name
        video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
            url, u'video uploader')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
303
304
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries."""
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        collected_ids = []
        page_index = 0
        limit = n

        # The GData API serves at most 50 results per page; keep paging
        # until the effective limit is reached.
        while 50 * page_index < limit:
            self.report_download_page(query, page_index + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), 50 * page_index + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            collected_ids.extend(video['id'] for video in api_response['items'])

            # The service may report fewer total items than were requested.
            limit = min(n, api_response['totalItems'])
            page_index += 1

        # Never hand back more than the n requested results.
        del collected_ids[n:]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                  for video_id in collected_ids]
        return self.playlist_result(videos, query)
346
347
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _MAX_RESULTS = 1000
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        playlist = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }

        for page_number in itertools.count(1):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), page_number*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(page_number))

            # Collect every result link found on this page.
            playlist['entries'].extend(
                {'_type': 'url', 'url': match.group(1)}
                for match in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage))

            # Stop once enough results were fetched or no next page exists.
            if page_number * 10 > n or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                return playlist
378
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the Yahoo JSON search endpoint (30 results per page)
        until n results were collected or the service reports the last page.
        """
        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            m = info[u'm']
            results = info[u'results']

            # BUG FIX: the original reused the loop variable `i` after the
            # loop, raising NameError when `results` was empty, and
            # dereferenced an unchecked regex match (AttributeError).
            reached_limit = False
            for (i, r) in enumerate(results):
                if (pagenum * 30) + i >= n:
                    reached_limit = True
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                if mobj is None:
                    # Skip results whose markup we cannot parse.
                    continue
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            # Stop when enough entries were collected or the service reports
            # this was the last page of results.
            if reached_limit or (m[u'last'] >= (m[u'total'] - 1)):
                break

        return res
412
413
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Return a playlist of every video posted by a blip.tv user."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        # BUG FIX: the original dereferenced mobj without checking for a
        # failed match, raising AttributeError instead of a clean error.
        if mobj is None:
            raise ExtractorError(u'Unable to extract users_id')
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            # Use a distinct name so the `url` parameter is not shadowed.
            page_url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(page_url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                video_id = unescapeHTML(mobj.group(1))
                # BUG FIX: deduplicate on the unescaped id; the original
                # compared the raw match but stored the unescaped form, so
                # duplicates differing only in HTML escaping slipped through.
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(video_url, 'BlipTV') for video_url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
472
473
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Resolve a depositfiles.com page to its direct download URL."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if mobj is None or mobj.group(1) is None:
            # Try to figure out reason of the error.
            reason_mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if reason_mobj is not None and reason_mobj.group(1) is not None:
                restriction_message = re.sub('\s+', ' ', reason_mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        info = {
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }
        return [info]
518
519
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in to Facebook if credentials were supplied.

        Credentials come either from the --username/--password options or,
        with --netrc, from the 'facebook' machine entry in ~/.netrc.
        Failures are only reported as warnings; extraction proceeds
        anonymously afterwards.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means the credentials were rejected.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                # BUG FIX: corrected the misspelling "exceded" -> "exceeded"
                # in this user-facing warning.
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract video info from a Facebook video page.

        Returns a one-element list with the info dict; prefers the HD
        source ('hd_src') over the SD one ('sd_src').  Raises
        ExtractorError when the page data cannot be parsed.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters live in a JSON snippet between these two
        # javascript markers.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
            webpage, u'title')

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
614
615
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Handles regular video pages, /play/ embed URLs (resolved by following
    the redirect and re-running extraction) and direct media downloads.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract video info; returns a one-element list with the info dict."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # See https://github.com/rg3/youtube-dl/issues/857
        # api.swf fragment URLs are rewritten to /play/ URLs first.
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ embeds redirect to a URL whose fragment carries the
            # file path; rebuild the canonical URL and recurse once.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        # Ask the same URL for its JSON metadata representation.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): str.decode only exists on Python 2 byte
                # strings; on Python 3 this line would fail — confirm the
                # supported interpreter before restructuring.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Pass the open handle on so the download reuses it.
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
713
714
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Handles three delivery variants found on the watch page:
      1. a plain ``<source src='...'>`` flash video (returned as flv),
      2. an RTMP(E) stream described by RC4-encrypted player XML,
      3. an f4m/HLS playlist, likewise taken from the encrypted XML.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self, data, key):
        """RC4-decrypt ``data`` (bytes) with ``key`` (bytes); returns a str."""
        x = 0
        box = list(range(256))
        # Key-scheduling algorithm (KSA)
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
        x = 0
        y = 0
        out = ''
        # Pseudo-random generation + XOR
        for char in data:
            x = (x + 1) % 256
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
        return out

    def __md5(self, s):
        """Return the hex MD5 digest of ``s`` (bytes), as bytes."""
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Doubly base64-encoded site secret used to derive the RC4 key.
        GK = (
          b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
          b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
          b'TnpsbA0KTVRkbU1tSTRNdz09'
        )

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Variant 1: plain flash video referenced directly in the page.
        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            video_title = self._html_search_regex('<title>([^<]+)</title>',
                webpage, u'title')

            return [{
                'id':       video_id,
                'url':      video_url,
                'uploader': None,
                'upload_date':  None,
                'title':    video_title,
                'ext':      u'flv',
            }]

        # try encxml (variants 2/3: encrypted player XML)
        mobj = re.search('var flashvars={(.+?)}', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video')

        params = {}
        encxml = ''
        sec = mobj.group(1)
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                params[a] = b
            else:
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            # The MTV player XML lacks the stream data we need; request the
            # default player instead.
            self._downloader.report_warning(u'avoiding MTV player')
            xmldata_url = (
                'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
            ) % video_id

        # get enc data; RC4 key = md5(b64dec(b64dec(GK)) + md5(video_id))
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        sk = self.__md5(
            base64.b64decode(base64.b64decode(GK)) +
            self.__md5(
                str(video_id).encode('utf-8')
            )
        )
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        video_url = None
        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        if mobj:
            video_url = compat_urllib_parse.unquote(mobj.group(1))
            if 'myvideo2flash' in video_url:
                self._downloader.report_warning(u'forcing RTMPT ...')
                video_url = video_url.replace('rtmpe://', 'rtmpt://')

        if not video_url:
            # extract non rtmp videos
            mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
            if mobj is None:
                raise ExtractorError(u'unable to extract url')
            video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
        video_file = compat_urllib_parse.unquote(video_file)

        if not video_file.endswith('f4m'):
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
        else:
            # BUGFIX: this branch referenced an undefined name
            # (`video_filepath`), raising NameError for every f4m stream.
            # The base path lives in the same decrypted XML, in the
            # path='...' attribute that precedes source='...' (see the
            # non-RTMP fallback above).
            video_filepath = self._search_regex(
                'path=\'(.*?)\'', dec_data, u'video path')
            video_playpath = ''
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
        video_swfobj = compat_urllib_parse.unquote(video_swfobj)

        video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
            webpage, u'title')

        return [{
            'id':                 video_id,
            'url':                video_url,
            'tc_url':             video_url,
            'uploader':           None,
            'upload_date':        None,
            'title':              video_title,
            'ext':                u'flv',
            'play_path':          video_playpath,
            'video_file':         video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url':         video_swfobj,
        }]
863
864
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    # NOTE: this pattern uses re.VERBOSE, so every match/suitable call below
    # must pass that flag explicitly.
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrate identifiers, as they appear in the mediagen feed.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Lookup tables used by _print_formats (--list-formats output).
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the re.VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        """Print bitrate/extension/resolution for each format id (``--list-formats``)."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract one info dict per part of the requested episode or clip.

        Flow: resolve shortname/newest-episode URLs, locate the mgid URI in
        the page, download the MRSS show index, then one mediagen config per
        part, pick a bitrate, and rewrite the RTMP URL to an HTTP mirror.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # ":tds"-style abbreviations point at the show's full-episodes page,
        # which redirects to the newest episode.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # A bare full-episodes URL means "download the newest episode";
            # the redirect below resolves it to a concrete one.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # Re-match against the URL we were redirected to, which should
            # now name a specific episode.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a data-mgid
            # attribute without a URL prefix; so extract the alternate
            # reference and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        # mgid URI identifying the episode; the MRSS index lists its parts.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        # One <item> per part of the episode; each part is extracted
        # independently and appended to results.
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            # Collect (bitrate, rtmp-url) pairs from the mediagen config.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # The RTMP stream is not downloadable directly; rewrite it to
            # the equivalent HTTP mp4 mirror on llnwd.net.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
1031
1032
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Extract the stream for an Escapist video page.

        Reads the og:video player URL from the page, follows its
        ``config=`` parameter to a JSON-ish playlist, and returns the
        second playlist entry's URL as the video stream.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(videoId)
        webpage = self._download_webpage(url, videoId)

        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)

        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)

        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
            webpage, u'player url')

        # BUGFIX: the field name here was copy-pasted as u'player url',
        # which mislabelled "unable to extract" errors for the title.
        # The <meta name="title"> value is "Show : Episode"; keep the
        # last segment.
        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
            webpage, u'title').split(' : ')[-1]

        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
        configUrl = compat_urllib_parse.unquote(configUrl)

        configJSON = self._download_webpage(configUrl, videoId,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # Entry 0 is an ad/intro; entry 1 is the actual video.
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': title,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': videoDesc,
            'player_url': playerUrl,
        }

        return [info]
1092
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Resolve a collegehumor video via its moogaloop XML and f4m manifest."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        # First request: metadata XML describing the video and its manifest.
        meta_url = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            meta_xml = compat_urllib_request.urlopen(meta_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        meta_doc = xml.etree.ElementTree.fromstring(meta_xml)
        try:
            video_node = meta_doc.findall('./video')[0]
            info['description'] = video_node.findall('./description')[0].text
            info['title'] = video_node.findall('./caption')[0].text
            info['thumbnail'] = video_node.findall('./thumbnail')[0].text
            manifest_url = video_node.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        # Second request: the f4m manifest named by the metadata.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifest_xml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        manifest_doc = xml.etree.ElementTree.fromstring(manifest_xml)
        try:
            media_node = manifest_doc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = manifest_doc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid manifest file')

        # Build the fragment URL from the manifest's host plus the media id.
        parsed = compat_urllib_parse_urlparse(manifest_url)
        info['url'] = '%s://%s/z%s/%sSeg1-Frag1' % (
            parsed.scheme, parsed.netloc, video_id[:-2], node_id)
        info['ext'] = 'f4f'
        return [info]
1154
1155
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Return a one-element list with the video's info dict."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The flv URL is URL-encoded inside the player flashvars.
        video_url = self._search_regex(r'flv_url=(.+?)&', webpage, u'video URL')
        video_url = compat_urllib_parse.unquote(video_url)

        # The page title carries the video title before a " - XVID" suffix.
        video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
            webpage, u'title')

        # Thumbnail is optional; extraction failure is non-fatal.
        video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
1196
1197
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve a track URL via the Soundcloud API and return its mp3 stream.

        Two API calls: resolve.json maps the pretty URL to track info
        (including the numeric id), then /streams yields the media URLs.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # uploader + slug of song title, used for progress reporting
        slug_title =  mobj.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # NOTE: client_id is the API key embedded in Soundcloud's own player.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
1254
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve a set URL via the Soundcloud API; return one dict per track.

        Resolves the set with resolve.json, then fetches the /streams
        endpoint for each track in it.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # uploader + slug of set title, used for progress reporting
        slug_title =  mobj.group(2)
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # NOTE: client_id is the API key embedded in Soundcloud's own player.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            # Best-effort: report each API error, then give up on the set.
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
1317
1318
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Return a one-element list describing the rtmpe stream of a talk."""
        if re.match(self._VALID_URL, url) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real media id is base64-encoded in a JS variable on the page.
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        video_title = self._search_regex(r'contentTitle = "(.*?)";',
            webpage, u'title')

        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        # Derive id and extension from the final path component of the URL.
        video_id, extension = video_url.split('/')[-1].split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
1361
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        'best' (or an unknown bitrate) selects the highest available one;
        formats without bitrate sub-dicts are returned as-is.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none respond."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                pass  # dead mirror; try the next candidate

        return None

    def _print_formats(self, formats):
        """Print format/bitrate/extension triples (``--list-formats``)."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Fetch the cloudcast JSON and pick the first reachable media URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        # BUGFIX: these values are already text strings; the previous
        # .decode('utf-8') calls raised AttributeError on Python 3.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe each format until one of its mirrors responds.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
1466
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three URL shapes: a specific video page, a course page
    (treated as a playlist of its videos), and the site root (treated
    as a playlist of courses).
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        # Dispatch on which named groups matched: video, course, or root.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            # Each video has a sidecar XML file holding its title and file name.
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            # Collect the course's video page links (deduplicated, order kept)
            # and recursively extract each referenced page via self.extract().
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Same pattern as course pages, one level up: every course page
            # is extracted recursively and the results are concatenated.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
1562
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com video pages.

    Downloads the page, reads the player metadata tags, queries the
    mediaGen endpoint for renditions, and returns the highest-quality
    one. Raises ExtractorError on invalid URLs or missing metadata.
    """

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # mtv_an carries the artist/performer name, which the page also
        # uses as the title. (This was previously referenced through an
        # undefined `performer` variable, causing a NameError.)
        performer = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'title')
        video_title = performer

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        # Both values are required to build the mediaGen URL; the lookups
        # above are non-fatal, so fail explicitly rather than concatenating
        # None into the URL (which would raise TypeError).
        if mtvn_uri is None or content_id is None:
            raise ExtractorError(u'Unable to extract mtvn_uri or content id')

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        # Guard the empty case: renditions[-1] on [] would raise IndexError.
        if not renditions:
            raise ExtractorError(u'Unable to find any renditions')

        # For now, always pick the highest quality (last listed rendition).
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
1623
1624
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com videos."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a pseudo-random session id from the clock and two randoms."""
        now_ms = int(time.time() * 1000)
        return "%d%d%d" % (now_ms,
                           random.randint(1000, 1998),
                           random.randint(1000, 9999))

    def _get_file_ID_mix_string(self, seed):
        """Shuffle Youku's fixed charset with an LCG keyed by *seed*."""
        charset = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        shuffled = []
        state = float(seed)
        # Draw characters one at a time until the pool is exhausted.
        while charset:
            state = (state * 211 + 30031) % 65536
            pos = int(math.floor(state / 65536 * len(charset)))
            shuffled.append(charset.pop(pos))
        return shuffled

    def _get_file_id(self, fileId, seed):
        """Decode the '*'-separated obfuscated file id via the mix string."""
        mixed = self._get_file_ID_mix_string(seed)
        return ''.join(mixed[int(token)] for token in fileId.split('*') if token)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)
            entry = config['data'][0]

            video_title = entry['title']
            seed = entry['seed']

            requested = self._downloader.params.get('format', None)
            available = list(entry['streamfileids'].keys())

            # Translate the user's format request into Youku's stream names.
            if requested is None or requested == 'best':
                format = 'hd2' if 'hd2' in available else 'flv'
                ext = u'flv'
            elif requested == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = entry['streamfileids'][format]
            keys = [seg['k'] for seg in entry['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters 8-9 of the decoded file id carry the segment number;
        # substitute them per segment when building each download URL.
        files_info = []
        for index, key in enumerate(keys):
            seg_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, seg_fileid, key)

            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
1717
1718
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Return metadata for a single xnxx.com video page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)

        # The flv URL is percent-encoded inside the player parameters.
        flv_url = compat_urllib_parse.unquote(
            self._search_regex(self.VIDEO_URL_RE, webpage, u'video URL'))

        title = self._html_search_regex(self.VIDEO_TITLE_RE, webpage, u'title')

        thumbnail = self._search_regex(self.VIDEO_THUMB_RE, webpage,
            u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': flv_url,
            'uploader': None,
            'upload_date': None,
            'title': title,
            'ext': 'flv',
            'thumbnail': thumbnail,
            'description': None,
        }
        return [info]
1757
1758
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com video posts."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        # Extract update date (non-fatal: may legitimately be absent)
        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)
        if upload_date:
            # Convert timestring to a format suitable for filename (YYYYMMDD)
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')

        # Extract uploader
        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # Extract title
        # Get the first line for title
        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
            webpage, 'title', default=u'NA')

        # Step 2, Stimulate clicking the image box to launch video
        video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Sort the (resolution, url) tuples ascending by resolution
        links = sorted(mobj)

        # Take the last entry of the ascending sort, i.e. the highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        # (py2 str has .decode; py3 str does not, hence the AttributeError path)
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
1832
class NBAIE(InfoExtractor):
    """Information extractor for NBA.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = match.group(1)
        webpage = self._download_webpage(url, video_id)

        # The 720p MP4 lives at a predictable path on Turner's CDN.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        shortened_video_id = video_id.rpartition('/')[2]
        title = self._html_search_regex(
            r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')

        # The upload date is not present in the HTML served to us.

        description = self._html_search_regex(
            r'<meta name="description" (?:content|value)="(.*?)" />',
            webpage, 'description', fatal=False)

        return [{
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
        }]
1866
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Matches a whole channel, a single broadcast (/b/), or a chapter (/c/).
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    # Page size used when paging through a channel's archive listing.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        # Fetch one page of the JSON clip listing and convert every clip
        # that actually has a downloadable file URL into an info dict.
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        # On failure the API answers with an error object, not a list.
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time starts with YYYY-MM-DD; strip dashes -> YYYYMMDD
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Whole channel: page through its archives in the loop below.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # for/else: the else branch runs only when no archive matched.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            # Additional metadata comes from the Twitch kraken API.
            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means the end of the archive has been reached.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
1999
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = match.group('id')
        webpage = self._download_webpage(url, video_id)

        video_url = self._html_search_regex(
            r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Prefer the player page heading; fall back to the <title> tag.
        title = self._html_search_regex(
            (r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
             r'<title>(?P<title>[^<]+?)</title>'),
            webpage, 'title', flags=re.DOTALL)

        description = self._html_search_regex(
            r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
        }]
2028
class SteamIE(InfoExtractor):
    """Information extractor for Steam store trailer pages."""

    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose pattern, so match with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = match.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        # Some games sit behind an age gate; retry via the pre-filled
        # agecheck URL when the prompt is detected.
        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'

        # The three iterators walk the page in lockstep: one movie entry,
        # one title span, one thumbnail per trailer.
        movie_matches = re.finditer(urlRE, webpage)
        title_matches = re.finditer(namesRE, webpage)
        thumb_matches = re.finditer(thumbsRE, webpage)

        videos = []
        for movie, name, thumb in zip(movie_matches, title_matches, thumb_matches):
            video_id = movie.group('videoID')
            video_url = movie.group('videoURL')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(name.group('videoName')),
                'thumbnail': thumb.group('thumbnail'),
            })
        return [self.playlist_result(videos, gameID, game_title)]
2083
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Recorded streams live at a predictable CDN path.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        title = self._html_search_regex(r'data-title="(?P<title>.+)"',
            webpage, u'title')

        uploader = self._html_search_regex(
            r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(
            r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        # NOTE: this extractor returns a single info dict, not a list.
        return {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
2115
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""

    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        # Pick the container extension from the player's file URL.
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)

        if not thumbnail:
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if candy_match is not None:
                video_title = candy_match.group(1)

        return [{
            'id': video_id,
            'url' : video_url,
            'title' : video_title,
            'thumbnail' : thumbnail,
            'ext' : ext,
        }]
2155
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        # Show metadata is embedded as a JSON assignment on window.gon.
        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
2189
2190
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first entry of formats whose 'format' equals req_format, else None."""
        for x in formats:
            if x["format"] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The site gates content behind an age check; this cookie bypasses it
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            # Was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError as e:
            # str(e) is required: concatenating str + exception object raises TypeError
            raise ExtractorError('Missing JSON parameter: ' + str(e))

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The 5th path component encodes "<size>_<bitrate>_<id>"; keep size+bitrate
            format = path.split('/')[4].split('_')[:2]
            format = "-".join(format)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            # BUG FIX: the original tested the undefined name `result`, which
            # raised NameError instead of the intended ExtractorError.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
2295
2296
2297
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Fetch the page that embeds the player configuration
        webpage = self._download_webpage(url, video_id)

        # The flv URL sits in the player setup and is percent-encoded
        video_url = self._search_regex(
            r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        # The upload date is optional; normalize it when present
        upload_date = self._html_search_regex(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        return [{'id': video_id,
                 'url': video_url,
                 'uploader': None,
                 'upload_date': upload_date,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv'}]
2332
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The watch page only carries the title; the stream URL is on an embed page
        webpage = self._download_webpage(url, video_id)
        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The embed page hands the file to the Flash player via addVariable
        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
2373
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (a mix is a playlist of songs)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded in the page as a JS assignment
        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # The play API walks the mix one track at a time within a random session
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        entries = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
2414
class KeekIE(InfoExtractor):
    """Information extractor for keek.com short videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Media and thumbnail URLs follow a fixed CDN scheme keyed by the video id
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')
        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        return [{
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': video_title,
                'thumbnail': thumbnail,
                'uploader': uploader
        }]
2442
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    # Verbose regex: distinguishes playlist URLs from single-talk URLs and
    # tolerates an optional /lang/<code> path segment.
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL must be matched with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on URL type: a single talk vs. a playlist of talks.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Matches each talk's <li> entry to recover its numeric id and media slug.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        # Matches each talk-title anchor to recover the talk's page URL.
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        # Each entry is delegated back to this extractor via url_result('TED').
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
                                        webpage, 'title')
        # Stream URLs live in an inline talkDetails JS object.
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                    webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                                       webpage, 'description', flags = re.DOTALL)
        
        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                                       webpage, 'thumbnail')
        # NOTE: `info` is rebound here — the parsed JSON dict is replaced by
        # the result dict; the last htmlStreams entry is taken as best quality.
        info = {
                'id': info['id'],
                'url': info['htmlStreams'][-1]['file'],
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumbnail,
                'description': desc,
                }
        return info
2517
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de; metadata comes from an XML API."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: the original assigned the undefined name `ext` here,
            # raising NameError; fall back to the file extension instead.
            format = extension
        else:
            format = format_id_el.text
        # Description and preview image are optional in the metadata document.
        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None
        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
2571
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos; streams are listed in an XML doc."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            webpage, u'title')

        # A per-video XML document lists the available encodings
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')
        idoc = xml.etree.ElementTree.fromstring(xml_code)

        # The last <type> entry carries the preferred encoding
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
2603
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # og:title carries a site prefix which is stripped off here
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        return [{
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }]
2640
class ARDIE(InfoExtractor):
    """Information extractor for the ARD Mediathek."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # Prefer the numeric documentId query parameter when present,
        # otherwise fall back to the last path element.
        m = re.match(self._VALID_URL, url)
        numid = re.search(r'documentId=([0-9]+)', url)
        video_id = numid.group(1) if numid else m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
2679
class ZDFIE(InfoExtractor):
    """Information extractor for the ZDF Mediathek."""
    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
    _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
    _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        html = self._download_webpage(url, video_id)
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        # BUG FIX: the comprehension always yields a list (possibly empty),
        # never None — test for emptiness instead of identity with None.
        if not streams:
            raise ExtractorError(u'No media url found.')

        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
        # choose first/default media type and highest quality for now
        # BUG FIX: stream_ must be initialized; otherwise the check below
        # raised NameError whenever neither loop found a match.
        stream_ = None
        for s in streams:        #find 300 - dsl1000mbit
            if s['quality'] == '300' and s['media_type'] == 'wstreaming':
                stream_ = s
                break
        for s in streams:        #find veryhigh - dsl2000mbit
            if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
                stream_ = s
                break
        if stream_ is None:
            raise ExtractorError(u'No stream found.')

        media_link = self._download_webpage(stream_['video_url'], video_id, 'Get stream URL')

        self.report_extraction(video_id)
        mobj = re.search(self._TITLE, html)
        if mobj is None:
            raise ExtractorError(u'Cannot extract title')
        title = unescapeHTML(mobj.group('title'))

        # Prefer the mms:// URL; fall back to rtsp:// if it is absent
        mobj = re.search(self._MMS_STREAM, media_link)
        if mobj is None:
            mobj = re.search(self._RTSP_STREAM, media_link)
            if mobj is None:
                raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
        mms_url = mobj.group('video_url')

        mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
        if mobj is None:
            # (typo "extention" fixed in the error message)
            raise ExtractorError(u'Cannot extract extension')
        ext = mobj.group('ext')

        return [{'id': video_id,
                 'url': mms_url,
                 'title': title,
                 'ext': ext
                 }]
2737
class TumblrIE(InfoExtractor):
    """Information extractor for videos posted on Tumblr blogs."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        blog = mobj.group('blog_name')

        # Normalize to the canonical post URL before downloading
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The player markup is JS-escaped in the page source (\x22 == '"')
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        m_video = re.search(re_video, webpage)
        if m_video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = m_video.group('video_url')
        ext = m_video.group('ext')

        # We pick the first poster
        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext
                 }]
2771
class BandcampIE(InfoExtractor):
    """Information extractor for free Bandcamp tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        title = re.match(self._VALID_URL, url).group('title')
        webpage = self._download_webpage(url, title)

        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')
        download_link = m_download.group(1)

        # Track id comes from the inline TralbumData JS object
        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascrip code
        info = json.loads(re.search(r'items: (.*?),$',
                                    download_webpage, re.MULTILINE).group(1))[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        return [{'id': track_id,
                 'title': info[u'title'],
                 'ext': 'mp3',
                 'url': final_url,
                 'thumbnail': info[u'thumb_url'],
                 'uploader': info[u'artist']
                 }]
2817
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Direct mp4 URL sits in the HTML5 <source> tag
        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')
        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id':    video_id,
            'url':   video_url,
            'ext':   'mp4',
            'title': video_title,
        }]
2845         
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Metadata lives in an MRSS feed keyed by the video id
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')
        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id':    video_id,
            'url':   video_url,
            'ext':   'mp4',
            'title': video_title,
        }]
2872
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Canonicalize the URL before fetching
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')
        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')
        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
        }]
2906
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Canonicalize the URL before fetching
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')
        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
2940
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        uploader_id = mobj.group('uploader_id')

        page_url = 'http://www.flickr.com/photos/%s/%s' % (uploader_id, video_id)
        webpage = self._download_webpage(page_url, video_id)

        # The per-photo secret is required by both XML endpoints below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_xml = self._download_webpage(
            'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=%s&secret=%s&bitrate=700&target=_self' % (video_id, secret),
            video_id, 'Downloading first data webpage')
        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_xml = self._download_webpage(
            'https://secure.flickr.com/video_playlist.gne?node_id=%s&tech=flash&mode=playlist&bitrate=700&secret=%s&rd=video.yahoo.com&noad=1' % (node_id, secret),
            video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        # The stream URL is split into an app prefix and an HTML-escaped path.
        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = stream.group(1) + unescapeHTML(stream.group(2))

        title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')
        description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       title,
            'description': description,
            'thumbnail':   thumbnail,
            'uploader_id': uploader_id,
        }]
2989
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id is only exposed in the article markup, not the URL.
        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)
        description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # The CVP feed for the id carries the actual media URLs.
        data = self._download_webpage('http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')
        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       title,
            'thumbnail':   thumbnail,
            'description': description,
        }]
3028
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # FIX: escape the dot in "www." — the previous `(?:www.)?` let the dot
    # match any character, which was broader than intended.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # FIX: guard against a non-matching URL like the sibling extractors do,
        # instead of failing with an AttributeError on None.
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Only the numeric id matters; the title part of the path is ignored.
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        # The player config embeds either a full file URL (empty 'srv') or a
        # server + file pair that must be joined with '/key='.
        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        # Upload date comes from a tooltip hint like "2013-01-01 00:00:00 UTC".
        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
3080
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # The 'ax'/'ts' query parameters mimic the site's own requests; the
        # response sets a session cookie that the serve endpoint below requires.
        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        # Track metadata is embedded in the page as a JSON <script> block.
        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]  # only the first track of the page is extracted
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        # The serve endpoint returns the final stream URL as JSON; it needs
        # the session cookie captured above, and the empty body makes this a POST.
        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
3130
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play page only contains a JS redirect to the real video page.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        webpage = self._download_webpage(urlh.geturl() + new_location, video_id, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        # POST to the "magare" endpoint for the media and thumbnail URLs.
        info_request = compat_urllib_request.Request(
            "http://vbox7.com/play/magare.do",
            compat_urllib_parse.urlencode({'as3':'1','vid':video_id}))
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # Response is a two-field query string: <key>=<media url>&<key>=<thumb url>.
        final_url, thumbnail_url = [field.split('=')[1] for field in info_response.split('&')]

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       "flv",
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
3166
class GametrailersIE(InfoExtractor):
    """Information Extractor for gametrailers.com videos, reviews and full episodes."""
    # FIX: escape the host dots — the bare dots previously matched any character.
    _VALID_URL = r'http://www\.gametrailers\.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        video_type = mobj.group('type')
        webpage = self._download_webpage(url, video_id)
        # Full episodes embed the mgid in a different attribute than clips/reviews.
        if video_type == 'full-episodes':
            mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
        else:
            mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
        mgid = self._search_regex(mgid_re, webpage, u'mgid')
        data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})

        info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
                                           video_id, u'Downloading video info')
        links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
                                               video_id, u'Downloading video urls info')

        self.report_extraction(video_id)
        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                      <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                      <image>.*
                        <url>(?P<thumb>.*?)</url>.*
                      </image>'''

        m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
        if m_info is None:
            raise ExtractorError(u'Unable to extract video info')
        video_title = m_info.group('title')
        video_description = m_info.group('description')
        video_thumb = m_info.group('thumb')

        m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
        if not m_urls:
            # FIX: this previously raised a NameError via the misspelled
            # `ExtractError` (and the message said "extrat"), masking the
            # real extraction failure.
            raise ExtractorError(u'Unable to extract video url')
        # They are sorted from worst to best quality
        video_url = m_urls[-1].group('url')

        return {'url':         video_url,
                'id':          video_id,
                'title':       video_title,
                # Videos are actually flv not mp4
                'ext':         'flv',
                'thumbnail':   video_thumb,
                'description': video_description,
                }
3217
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Keep this tuple ordered from most to least specific matcher; GenericIE
    # must stay last as the catch-all.
    ie_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    return [ie() for ie in ie_classes]
3287
def get_info_extractor(ie_name):
    """Return the info extractor class named ``<ie_name>IE`` from this module."""
    return globals()['%sIE' % ie_name]