2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title: Video title, unescaped.
    ext: Video filename extension.

    The following fields are optional:

    format: The video format, defaults to ext (used for --get-format)
    thumbnail: Full URL to a video thumbnail image.
    description: One-line video description.
    uploader: Full name of the video uploader.
    upload_date: Video upload date (YYYYMMDD).
    uploader_id: Nickname or id of the video uploader.
    location: Physical location of the video.
    player_url: SWF Player URL (used for rtmpdump).
    subtitles: The .srt file contents.
    urlhandle: [internal] The urlHandle to be used to download the file,
    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): this listing is non-contiguous; at least one
        # initialization statement (presumably a lazy-init flag) between the
        # docstring and the call below is missing from this view -- confirm
        # against the full file.
        self.set_downloader(downloader)
77 def suitable(self, url):
78 """Receives a URL and returns True if suitable for this IE."""
79 return re.match(self._VALID_URL, url) is not None
    # NOTE(review): this listing is non-contiguous in the region below; the
    # `def` lines for the working() and initialize() accessors are missing
    # above their docstrings, as are parts of their bodies.
    """Getter method for _WORKING."""

    """Initializes an instance (authentication, etc)."""
    self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): a lazy-initialization call preceding this return
        # appears to be missing from this listing.
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

    # NOTE(review): orphan line from a missing IE_NAME property definition;
    # it derives the IE name by dropping the trailing two characters ("IE",
    # per the subclass naming convention visible in this file).
    return type(self).__name__[:-2]
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # NOTE(review): non-contiguous listing -- the `if note is None:` guard,
        # the `try:` line and the `if errnote is None:` guard are missing here.
        note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            errnote = u'Unable to download webpage'
            # Re-raise with the original traceback so the failure site is kept.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
124 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
125 """ Returns the data of the page as a string """
126 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
127 webpage_bytes = urlh.read()
128 return webpage_bytes.decode('utf-8', 'replace')
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): the `_VALID_URL = r"""^...` opening line of this verbose
    # regex is missing from the non-contiguous listing; the pattern body below
    # is reproduced verbatim (a `#` line inside a VERBOSE pattern is ignored).
    (?:https?://)? # http(s):// (optional)
    (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
    tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
    (?:.*?\#/)? # handle anchor (#/) redirect urls
    (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
    (?: # the various things that can precede the ID:
    (?:(?:v|embed|e)/) # v/ or embed/ or e/
    |(?: # or the v= param in all its forms
    (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
    (?:\?|\#!?) # the params delimiter ? or # or #!
    (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
    # NOTE(review): the `v=` capture-group lines are missing from this listing.
    )? # optional -> youtube.com/xxxx is OK
    )? # all until now is optional -> you can pass the naked ID
    ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
    (?(1).+)? # if we found the ID, everything can follow

    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension map; most entries are missing from this listing.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # NOTE(review): itag -> dimensions map; its entries and both dicts'
    # closing braces are missing from this listing.
    _video_dimensions = {
192 def suitable(self, url):
193 """Receives a URL and returns True if suitable for this IE."""
194 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
    # --- user-feedback helpers: thin wrappers over FileDownloader.to_screen ---

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        # NOTE(review): original docstring repeated the "info webpage" text;
        # corrected to match the message actually printed below.
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        # NOTE(review): original docstring said "Report extracted video URL",
        # which does not match the message printed below.
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')
232 def _closed_captions_xml_to_srt(self, xml_string):
234 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
235 # TODO parse xml instead of regex
236 for n, (start, dur_tag, dur, caption) in enumerate(texts):
237 if not dur: dur = '4'
239 end = start + float(dur)
240 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
241 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
242 caption = unescapeHTML(caption)
243 caption = unescapeHTML(caption) # double cycle, intentional
244 srt += str(n+1) + '\n'
245 srt += start + ' --> ' + end + '\n'
246 srt += caption + '\n\n'
    def _extract_subtitles(self, video_id):
        """Fetch closed captions; returns a (warning_or_None, srt_or_None) pair."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        # NOTE(review): the `try:` opening this block is missing from the listing.
        srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)  # lang_code -> track name
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            # NOTE(review): the `srt_lang = 'en'` assignment and the `else:`
            # line are missing from this listing; the line below is the
            # fallback ("pick any language") branch.
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        params = compat_urllib_parse.urlencode({
            # NOTE(review): the 'lang', 'v' and 'fmt' entries and the closing
            # brace of this dict are missing from this listing.
            'name': srt_lang_list[srt_lang].encode('utf-8'),
        url = 'http://www.youtube.com/api/timedtext?' + params
        # NOTE(review): `try:` line missing here.
        srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # NOTE(review): the `if not srt_xml:` guard for the return below is
        # missing from this listing.
        return (u'WARNING: Did not fetch video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print one line per format: itag, container extension, dimensions."""
        print('Available formats:')
        # NOTE(review): the `for x in formats:` loop line is missing here.
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, optionally log in, and confirm age on youtube.com."""
        # NOTE(review): the source listing is non-contiguous throughout this
        # method; several control-flow lines (try:, else:, return, the
        # credential assignments and two dict openers) are missing.
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        # Set language (best effort; failures only warn)
        request = compat_urllib_request.Request(self._LANG_URL)
        compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        request = compat_urllib_request.Request(self._LOGIN_URL)
        login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err))

        # Hidden anti-forgery tokens scraped from the login form
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)

        # NOTE(review): the `login_form_strs = {` opener and many entries of
        # this form dict are missing from the listing.
        u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
        u'PersistentCookie': u'yes',
        u'bgresponse': u'js_disabled',
        u'checkConnection': u'',
        u'checkedDomains': u'youtube',
        u'signIn': u'Sign in',
        u'service': u'youtube',

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # does not handle unicode values.
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        # If the login form is still present, the credentials were rejected.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

        # Confirm age; the `age_form = {` opener is missing from this listing.
        'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        self.report_age_confirmation()
        age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
391 def _extract_id(self, url):
392 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
394 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
396 video_id = mobj.group(2)
    def _real_extract(self, url):
        """Extract metadata and per-format download URLs for a YouTube video."""
        # NOTE(review): the source listing is non-contiguous; `try:` lines,
        # None-guards and `return` statements are missing at several points.
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # Unescape the JSON-escaped slashes in the SWF URL.
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: retry with several `el` values until a token appears
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
            video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            # NOTE(review): the `else:` line for the branch below is missing.
            self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (best effort, scraped from the page markup)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        video_uploader_id = mobj.group(1)
        self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else: # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalize separators, then try several date formats
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
        for expression in format_expressions:
            upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        # NOTE(review): the `else:` line for the fallback below is missing.
        video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            # NOTE(review): the `else:` line for the fallback below is missing.
            format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # Specific formats. We pick the first in a slash-delimeted sequence.
            # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            req_formats = req_format.split('/')
            video_url_list = None
            for rf in req_formats:
                # NOTE(review): the `if rf in url_map:` guard and the `break`
                # after the assignment below are missing from this listing.
                video_url_list = [(rf, url_map[rf])]
            if video_url_list is None:
                self._downloader.trouble(u'ERROR: requested format not available')
        # NOTE(review): the `else:` line for the branch below is missing.
        self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one result dict per selected format
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                    self._video_dimensions.get(format_param, '???'))

            # NOTE(review): the `results.append({` opener (and the 'id' entry)
            # for the dict below is missing from this listing.
            'url': video_real_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
            'format': video_format,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'player_url': player_url,
            'subtitles': video_subtitles,
            'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page, then POST the family-filter opt-out."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        # NOTE(review): the `try:` line is missing from this listing.
        self.report_disclaimer()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age.
        # NOTE(review): the `disclaimer_form = {` opener is missing here.
        'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        # NOTE(review): the `try:` line is missing from this listing.
        self.report_age_confirmation()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
    def _real_extract(self, url):
        """Extract media URL, title and uploader from a metacafe watch page."""
        # NOTE(review): non-contiguous listing; several None-guards, `try:`
        # lines and `return` statements are missing below.
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate "yt-" prefixed ids to the YouTube extractor.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        video_extension = mediaURL[-3:]

        # Extract gdaKey if available
        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        gdaKey = mobj.group(1)
        video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

        # Fallback path: parse the flashvars blob instead
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')

        vardict = compat_parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = mobj.group(1).replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # NOTE(review): the `return [{` opener for the result dict below is
        # missing from this listing.
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader.decode('utf-8'),
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        """Extract the best-quality media URL, title, uploader and date."""
        # NOTE(review): non-contiguous listing; None-guards, loop bodies and
        # `return` statements are missing at several points below.
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')

        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe qualities from best to worst; the body that checks each key
        # and breaks out of the loop is missing from this listing.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            self._downloader.to_screen(u'[dailymotion] Using %s' % key)

        self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        self._downloader.trouble(u'ERROR: unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        # looking for the official user
        mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
        if mobj_official is None:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
        video_uploader = mobj_official.group(1)
        video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)  # DD-MM-YYYY -> YYYYMMDD

        # NOTE(review): the `return [{` opener and the 'id'/'url' entries of
        # the result dict are missing from this listing.
        'uploader': video_uploader,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        """Extract the flv media URL, title and uploader from a photobucket page."""
        # NOTE(review): non-contiguous listing; None-guards, `try:` lines and
        # `return` statements are missing below.
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        # NOTE(review): the `video_url = mediaURL` assignment is missing here,
        # although `video_url` is referenced in the result dict below.

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # NOTE(review): the `return [{` opener for the result dict below is
        # missing from this listing.
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
    def _real_extract(self, url, new_video=True):
        """Extract media URL and metadata; rewrites non-/watch/ URLs first.

        new_video: internal recursion flag -- False on the second pass after
        the URL has been rewritten to the canonical /watch/ form.
        """
        # NOTE(review): non-contiguous listing; None-guards, `try:` lines and
        # `return` statements are missing at several points below.
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            # Recurse once with the canonical /watch/ URL.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video uploader')
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # NOTE(review): the `return [{` opener and the 'url' entry of the
        # result dict below are missing from this listing.
        'id': video_id.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
        'thumbnail': video_thumbnail.decode('utf-8'),
        'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs: optional scheme, www/player hosts,
    # group/album prefixes, and player redirect links.
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize the URL: force https when no scheme was given, and
        # resolve player redirect links to the canonical video page.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page markup.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except:
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date (YYYYMMDD) from the dateCreated meta tag.
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    # No labelled quality: take the first advertised one.
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best quality bucket that has at least one entry.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages end in index-<n>.html.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its raw body, or None on failure."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex*, and map the groups listed in
        *matchTuples* (index, key, error message) into a dict."""
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        # Language is the 4th-from-last path component of the live URL.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # NOTE(review): third group reconstructed as an rtmp URL — confirm
        # against the original regex.
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path', u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))
        # No return value: live streams are only resolved, not downloaded.

    def extractPlus7Stream(self, url):
        # Language is the 3rd-from-last path component of a +7 URL.
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date', u'ERROR: could not extract video date: %s' % url),
                (4, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )

        # NOTE(review): 'ext'/'format'/'player_url' reconstructed — confirm
        # against the original tail of this dict.
        return {
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': info.get('date'),
            'title': info.get('title').decode('utf-8'),
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so we only fetch headers.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop entity headers that no longer apply to the
                    # redirected HEAD request.
                    newheaders = dict((k, v) for k, v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k, v) for k, v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""

    # ytsearch:<q>, ytsearchN:<q>, or ytsearchall:<q>
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        # Strip the leading 'ytsearch' to leave '', 'all', or a number.
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        # 'limit' shrinks to the API's totalItems once known.
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""

    # gvsearch:<q>, gvsearchN:<q>, or gvsearchall:<q>
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        # Strip the leading 'gvsearch' to leave '', 'all', or a number.
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            # No "next" link: last results page reached.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    # yvsearch:<q>, yvsearchN:<q>, or yvsearchall:<q>
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        # Strip the leading 'yvsearch' to leave '', 'all', or a number.
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            # No "Next" link: last results page reached.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # NOTE(review): the interior of this verbose regex was partially lost in
    # transit; it is reconstructed so that group(2) is the playlist id and
    # group(3) is an optional single-video id, matching the uses in
    # _real_extract below — confirm against upstream history.
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (
                           (?:course|view_play_list|my_playlists|artist|playlist)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ([0-9A-Za-z-_]{10,})
                        (?:/.*?/([0-9A-Za-z_-]+))?
                        .*"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is required because _VALID_URL is a verbose pattern.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # URL points at a single video inside the playlist: hand it off.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist videos from API
        playlist_id = mobj.group(2)
        page_num = 1
        videos = []

        while True:
            self.report_download_page(playlist_id, page_num)

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.trouble(u'ERROR: Invalid JSON in API response: ' + compat_str(err))
                return

            # Keep (position, url) pairs so the playlist order survives paging.
            videos += [(entry['yt$position']['$t'], entry['content']['src']) for entry in response['feed']['entry']]

            # A short page means we have reached the last one.
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        videos = map(operator.itemgetter(1), sorted(videos))

        total = len(videos)

        # Apply --playlist-start / --playlist-end (1-based, -1 = no end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            videos = videos[playliststart:]
        else:
            videos = videos[playliststart:playlistend]

        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))

        for video in videos:
            self._downloader.download([video])
        return
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # The "Next »" pager link; absent on the last page.
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download channel pages
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            # No pager link means this was the last page.
            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Apply --playlist-start / --playlist-end (1-based, -1 = no end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # blip.tv's episode-list AJAX endpoint returns 12 entries per page.
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            # The numeric users_id needed by the AJAX endpoint is embedded
            # in the user's HTML page.
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Apply --playlist-start / --playlist-end (1-based, -1 = no end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
# NOTE(review): mangled extraction — fused line numbers, stripped indentation,
# missing interior lines (try:/return/dict braces). Kept byte-for-byte.
# Scrapes a depositfiles.com file page (forced to the English locale) with
# the 'Free download' form submitted, then extracts the real file URL and
# the file title from the resulting HTML.
1983 class DepositFilesIE(InfoExtractor):
1984 """Information extractor for depositfiles.com"""
1986 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1988 def report_download_webpage(self, file_id):
1989 """Report webpage download."""
1990 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1992 def report_extraction(self, file_id):
1993 """Report information extraction."""
1994 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1996 def _real_extract(self, url):
1997 file_id = url.split('/')[-1]
1998 # Rebuild url in english locale
1999 url = 'http://depositfiles.com/en/files/' + file_id
2001 # Retrieve file webpage with 'Free download' button pressed
2002 free_download_indication = { 'gateway_result' : '1' }
# POSTing gateway_result=1 emulates pressing the free-download button.
2003 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
2005 self.report_download_webpage(file_id)
2006 webpage = compat_urllib_request.urlopen(request).read()
2007 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2008 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
2011 # Search for the real file URL
2012 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2013 if (mobj is None) or (mobj.group(1) is None):
2014 # Try to figure out reason of the error.
2015 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2016 if (mobj is not None) and (mobj.group(1) is not None):
2017 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2018 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2020 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2023 file_url = mobj.group(1)
2024 file_extension = os.path.splitext(file_url)[1][1:]
2026 # Search for file title
2027 mobj = re.search(r'<b title="(.*?)">', webpage)
2029 self._downloader.trouble(u'ERROR: unable to extract title')
# NOTE(review): .decode() on these values implies `webpage` was kept as
# bytes (py2-era code) — confirm against the compat layer before porting.
2031 file_title = mobj.group(1).decode('utf-8')
2034 'id': file_id.decode('utf-8'),
2035 'url': file_url.decode('utf-8'),
2037 'upload_date': None,
2038 'title': file_title,
2039 'ext': file_extension.decode('utf-8'),
# NOTE(review): mangled extraction — fused line numbers, stripped indentation,
# missing interior lines. Kept byte-for-byte; comments only.
# Logs into Facebook (credentials from options or .netrc), fetches the video
# page, and pulls the hd_src/sd_src media URL out of an embedded JS blob.
2043 class FacebookIE(InfoExtractor):
2044 """Information Extractor for Facebook"""
2046 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2047 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2048 _NETRC_MACHINE = 'facebook'
2049 IE_NAME = u'facebook'
2051 def report_login(self):
2052 """Report attempt to log in."""
2053 self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
2055 def _real_initialize(self):
2056 if self._downloader is None:
2061 downloader_params = self._downloader.params
2063 # Attempt to use provided username and password or .netrc data
2064 if downloader_params.get('username', None) is not None:
2065 useremail = downloader_params['username']
2066 password = downloader_params['password']
2067 elif downloader_params.get('usenetrc', False):
2069 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2070 if info is not None:
2074 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2075 except (IOError, netrc.NetrcParseError) as err:
# Missing credentials are a warning, not fatal: public videos may still work.
2076 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2079 if useremail is None:
2088 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2091 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> still present in the response means authentication failed.
2092 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2093 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2095 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2096 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2099 def _real_extract(self, url):
2100 mobj = re.match(self._VALID_URL, url)
2102 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2104 video_id = mobj.group('ID')
2106 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2107 webpage = self._download_webpage(url, video_id)
# BEFORE/AFTER anchor the JSON array of flashvars inside the page's JS.
2109 BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
2110 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2111 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2113 raise ExtractorError(u'Cannot parse data')
2114 data = dict(json.loads(m.group(1)))
2115 params_raw = compat_urllib_parse.unquote(data['params'])
2116 params = json.loads(params_raw)
# Prefer the HD stream; fall back to SD (fallback branch's if/else lines
# are among the missing fused numbers 2118/2120).
2117 video_url = params['hd_src']
2119 video_url = params['sd_src']
2121 raise ExtractorError(u'Cannot find video URL')
2122 video_duration = int(params['video_duration'])
2124 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2126 raise ExtractorError(u'Cannot find title in webpage')
2127 video_title = unescapeHTML(m.group(1))
2131 'title': video_title,
2134 'duration': video_duration,
2135 'thumbnail': params['thumbnail_src'],
# NOTE(review): mangled extraction — fused line numbers, stripped indentation,
# missing interior lines (try:/if/return/dict scaffolding). Kept byte-for-byte.
# Fetches blip.tv metadata via the site's JSON skin (skin=json&version=2);
# if the server answers with a video/* Content-Type instead, treats it as a
# direct download and derives id/title/ext from the URL basename.
2140 class BlipTVIE(InfoExtractor):
2141 """Information extractor for blip.tv"""
2143 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2144 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2145 IE_NAME = u'blip.tv'
2147 def report_extraction(self, file_id):
2148 """Report information extraction."""
2149 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2151 def report_direct_download(self, title):
2152 """Report information extraction."""
2153 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2155 def _real_extract(self, url):
2156 mobj = re.match(self._VALID_URL, url)
2158 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# cchar ('?' or '&') is chosen on the missing lines 2160-2164.
2165 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2166 request = compat_urllib_request.Request(json_url)
# Spoofing iTunes is presumably needed to get the JSON skin — verify.
2167 request.add_header('User-Agent', 'iTunes/10.6.1')
2168 self.report_extraction(mobj.group(1))
2171 urlh = compat_urllib_request.urlopen(request)
2172 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2173 basename = url.split('/')[-1]
2174 title,ext = os.path.splitext(basename)
2175 title = title.decode('UTF-8')
2176 ext = ext.replace('.', '')
2177 self.report_direct_download(title)
2182 'upload_date': None,
2187 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2188 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2189 if info is None: # Regular URL
2191 json_code_bytes = urlh.read()
2192 json_code = json_code_bytes.decode('utf-8')
2193 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2194 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2198 json_data = json.loads(json_code)
2199 if 'Post' in json_data:
2200 data = json_data['Post']
# blip.tv datestamps look like '11-30-12 12:00AM'; normalized to YYYYMMDD.
2204 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2205 video_url = data['media']['url']
2206 umobj = re.match(self._URL_EXT, video_url)
2208 raise ValueError('Can not determine filename extension')
2209 ext = umobj.group(1)
2212 'id': data['item_id'],
2214 'uploader': data['display_name'],
2215 'upload_date': upload_date,
2216 'title': data['title'],
2218 'format': data['media']['mimeType'],
2219 'thumbnail': data['thumbnailUrl'],
2220 'description': data['description'],
2221 'player_url': data['embedUrl'],
# The same UA must be used for the media download as for the metadata call.
2222 'user_agent': 'iTunes/10.6.1',
2224 except (ValueError,KeyError) as err:
2225 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Matches watch-page URLs (http://www.myvideo.de/watch/<id>/<slug>),
    scrapes the watch page for the media base URL (taken from the
    image_src thumbnail link) and the <title> tag, and returns the
    single-entry list of info dictionaries that _real_extract
    implementations in this file are expected to produce.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: was `self._download.trouble(...)` — the attribute is
            # `self._downloader`; the typo raised AttributeError on an
            # invalid URL instead of reporting the error.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        # The flv sits next to the thumbnail directory on the media host.
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    None,
            'upload_date': None,
            'title':       video_title,
            'ext':         u'flv',
        }]
# NOTE(review): mangled extraction — fused line numbers, stripped indentation,
# many missing interior lines (try:/return/if-else branches, the
# _video_extensions/_video_dimensions dict bodies). Kept byte-for-byte.
# Resolves show shortcuts/episode/clip URLs to an MRSS feed, then downloads a
# per-media config XML to choose an RTMP rendition and rewrites it to an
# HTTP progressive URL.
2280 class ComedyCentralIE(InfoExtractor):
2281 """Information extractor for The Daily Show and Colbert Report """
2283 # urls can be abbreviations like :thedailyshow or :colbert
2284 # urls for episodes like:
2285 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2286 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2287 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2288 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2289 |(https?://)?(www\.)?
2290 (?P<showname>thedailyshow|colbertnation)\.com/
2291 (full-episodes/(?P<episode>.*)|
2293 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2294 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates, lowest to highest; the code below picks turls[-1] by default.
2297 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2299 _video_extensions = {
2307 _video_dimensions = {
# Overrides the base suitable() because _VALID_URL is a VERBOSE pattern.
2316 def suitable(self, url):
2317 """Receives a URL and returns True if suitable for this IE."""
2318 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
2320 def report_extraction(self, episode_id):
2321 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2323 def report_config_download(self, episode_id, media_id):
2324 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
2326 def report_index_download(self, episode_id):
2327 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2329 def _print_formats(self, formats):
2330 print('Available formats:')
2332 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2335 def _real_extract(self, url):
2336 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2338 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# ':tds' style shortcuts are rewritten to the full-episodes front page.
2341 if mobj.group('shortname'):
2342 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2343 url = u'http://www.thedailyshow.com/full-episodes/'
2345 url = u'http://www.colbertnation.com/full-episodes/'
2346 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2347 assert mobj is not None
2349 if mobj.group('clip'):
2350 if mobj.group('showname') == 'thedailyshow':
2351 epTitle = mobj.group('tdstitle')
2353 epTitle = mobj.group('cntitle')
2356 dlNewest = not mobj.group('episode')
2358 epTitle = mobj.group('showname')
2360 epTitle = mobj.group('episode')
2362 req = compat_urllib_request.Request(url)
2363 self.report_extraction(epTitle)
2365 htmlHandle = compat_urllib_request.urlopen(req)
2366 html = htmlHandle.read()
2367 webpage = html.decode('utf-8')
2368 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2369 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# geturl() follows the redirect from the front page to the newest episode.
2372 url = htmlHandle.geturl()
2373 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2375 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2377 if mobj.group('episode') == '':
2378 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2380 epTitle = mobj.group('episode')
2382 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2384 if len(mMovieParams) == 0:
2385 # The Colbert Report embeds the information in a without
2386 # a URL prefix; so extract the alternate reference
2387 # and then add the URL prefix manually.
2389 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2390 if len(altMovieParams) == 0:
2391 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2394 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2396 uri = mMovieParams[0][1]
2397 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2398 self.report_index_download(epTitle)
2400 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2401 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2402 self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
2407 idoc = xml.etree.ElementTree.fromstring(indexXml)
2408 itemEls = idoc.findall('.//item')
# One MRSS <item> per episode part; each part is downloaded separately.
2409 for partNum,itemEl in enumerate(itemEls):
2410 mediaId = itemEl.findall('./guid')[0].text
2411 shortMediaId = mediaId.split(':')[-1]
2412 showId = mediaId.split(':')[-2].replace('.com', '')
2413 officialTitle = itemEl.findall('./title')[0].text
2414 officialDate = itemEl.findall('./pubDate')[0].text
2416 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2417 compat_urllib_parse.urlencode({'uri': mediaId}))
2418 configReq = compat_urllib_request.Request(configUrl)
2419 self.report_config_download(epTitle, shortMediaId)
2421 configXml = compat_urllib_request.urlopen(configReq).read()
2422 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2423 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2426 cdoc = xml.etree.ElementTree.fromstring(configXml)
2428 for rendition in cdoc.findall('.//rendition'):
2429 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2433 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2436 if self._downloader.params.get('listformats', None):
2437 self._print_formats([i[0] for i in turls])
2440 # For now, just pick the highest bitrate
2441 format,rtmp_video_url = turls[-1]
2443 # Get the format arg from the arg stream
2444 req_format = self._downloader.params.get('format', None)
2446 # Select format if we can find one
2449 format, rtmp_video_url = f, v
# The RTMP path is rewritten onto an HTTP CDN base to avoid rtmpdump.
2452 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2454 raise ExtractorError(u'Cannot transform RTMP url')
2455 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2456 video_url = base + m.group('finalid')
2458 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2463 'upload_date': officialDate,
2468 'description': officialTitle,
2470 results.append(info)
# NOTE(review): mangled extraction — fused line numbers, stripped indentation,
# missing interior lines. Kept byte-for-byte; comments only.
# Reads the Escapist video page's og: meta tags to find the player config
# URL, fetches that JS-style config, and takes the media URL from its
# playlist.
2475 class EscapistIE(InfoExtractor):
2476 """Information extractor for The Escapist """
2478 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2479 IE_NAME = u'escapist'
2481 def report_extraction(self, showName):
2482 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2484 def report_config_download(self, showName):
2485 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2487 def _real_extract(self, url):
2488 mobj = re.match(self._VALID_URL, url)
2490 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2492 showName = mobj.group('showname')
2493 videoId = mobj.group('episode')
2495 self.report_extraction(showName)
2497 webPage = compat_urllib_request.urlopen(url)
2498 webPageBytes = webPage.read()
# Decode using the charset the server declared, falling back to utf-8.
2499 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2500 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2501 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2502 self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
2505 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2506 description = unescapeHTML(descMatch.group(1))
2507 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2508 imgUrl = unescapeHTML(imgMatch.group(1))
2509 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2510 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The config URL rides in the player URL's `config=` query parameter.
2511 configUrlMatch = re.search('config=(.*)$', playerUrl)
2512 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2514 self.report_config_download(showName)
2516 configJSON = compat_urllib_request.urlopen(configUrl)
2517 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2518 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2519 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2520 self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2523 # Technically, it's JavaScript, not JSON
2524 configJSON = configJSON.replace("'", '"')
2527 config = json.loads(configJSON)
2528 except (ValueError,) as err:
2529 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
2532 playlist = config['playlist']
# playlist[1] is presumably the main video entry ([0] an intro) — verify.
2533 videoUrl = playlist[1]['url']
2538 'uploader': showName,
2539 'upload_date': None,
2542 'thumbnail': imgUrl,
2543 'description': description,
2544 'player_url': playerUrl,
# NOTE(review): mangled extraction — fused line numbers, stripped indentation,
# missing interior lines. Kept byte-for-byte; comments only.
# Downloads the moogaloop metadata XML for a video id, then the Adobe HDS
# (f4m) manifest it points at, and assembles a Seg1-Frag1 fragment URL.
2549 class CollegeHumorIE(InfoExtractor):
2550 """Information extractor for collegehumor.com"""
2553 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2554 IE_NAME = u'collegehumor'
2556 def report_manifest(self, video_id):
2557 """Report information extraction."""
2558 self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2560 def report_extraction(self, video_id):
2561 """Report information extraction."""
2562 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2564 def _real_extract(self, url):
2565 mobj = re.match(self._VALID_URL, url)
2567 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2569 video_id = mobj.group('videoid')
2574 'upload_date': None,
2577 self.report_extraction(video_id)
2578 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2580 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2581 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2582 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2585 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# findall(...)[0] raises IndexError on malformed XML; caught as invalid
# metadata (the except line is among the missing fused numbers).
2587 videoNode = mdoc.findall('./video')[0]
2588 info['description'] = videoNode.findall('./description')[0].text
2589 info['title'] = videoNode.findall('./caption')[0].text
2590 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2591 manifest_url = videoNode.findall('./file')[0].text
2593 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# hdcore marker makes the server emit an HDS-compatible manifest.
2596 manifest_url += '?hdcore=2.10.3'
2597 self.report_manifest(video_id)
2599 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2600 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2601 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2604 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2606 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2607 node_id = media_node.attrib['url']
2608 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2609 except IndexError as err:
2610 self._downloader.trouble(u'\nERROR: Invalid manifest file')
2613 url_pr = compat_urllib_parse_urlparse(manifest_url)
2614 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# NOTE(review): mangled extraction — fused line numbers, stripped indentation,
# missing interior lines (if mobj is None:/return). Kept byte-for-byte.
# Scrapes the xvideos watch page for the flv_url flashvar, the <title>, and
# the thumbnail URL.
2621 class XVideosIE(InfoExtractor):
2622 """Information extractor for xvideos.com"""
2624 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2625 IE_NAME = u'xvideos'
2627 def report_extraction(self, video_id):
2628 """Report information extraction."""
2629 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2631 def _real_extract(self, url):
2632 mobj = re.match(self._VALID_URL, url)
2634 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2636 video_id = mobj.group(1)
2638 webpage = self._download_webpage(url, video_id)
2640 self.report_extraction(video_id)
# flv_url flashvar is percent-encoded in the page source; unquoted below.
2644 mobj = re.search(r'flv_url=(.+?)&', webpage)
2646 self._downloader.trouble(u'ERROR: unable to extract video url')
2648 video_url = compat_urllib_parse.unquote(mobj.group(1))
2652 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2654 self._downloader.trouble(u'ERROR: unable to extract video title')
2656 video_title = mobj.group(1)
2659 # Extract video thumbnail
2660 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2662 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
# group(0) — the whole matched URL — is the thumbnail, not the captured name.
2664 video_thumbnail = mobj.group(0)
2670 'upload_date': None,
2671 'title': video_title,
2673 'thumbnail': video_thumbnail,
2674 'description': None,
# NOTE(review): mangled extraction — fused line numbers, stripped indentation,
# missing interior lines (try:/return/dict braces). Kept byte-for-byte.
# Resolves a soundcloud.com/<uploader>/<slug> page via the resolve.json API,
# then asks the streams endpoint for the 128kbps MP3 URL.
2680 class SoundcloudIE(InfoExtractor):
2681 """Information extractor for soundcloud.com
2682 To access the media, the uid of the song and a stream token
2683 must be extracted from the page source and the script must make
2684 a request to media.soundcloud.com/crossdomain.xml. Then
2685 the media can be grabbed by requesting from an url composed
2686 of the stream token and uid
2689 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2690 IE_NAME = u'soundcloud'
2692 def __init__(self, downloader=None):
2693 InfoExtractor.__init__(self, downloader)
2695 def report_resolve(self, video_id):
2696 """Report information extraction."""
2697 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2699 def report_extraction(self, video_id):
2700 """Report information extraction."""
2701 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2703 def _real_extract(self, url):
2704 mobj = re.match(self._VALID_URL, url)
2706 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2709 # extract uploader (which is in the url)
2710 uploader = mobj.group(1)
2711 # extract simple title (uploader + slug of song title)
2712 slug_title = mobj.group(2)
2713 simple_title = uploader + u'-' + slug_title
2715 self.report_resolve('%s/%s' % (uploader, slug_title))
# resolve.json turns the human-readable page URL into track metadata.
2717 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2718 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2719 request = compat_urllib_request.Request(resolv_url)
2721 info_json_bytes = compat_urllib_request.urlopen(request).read()
2722 info_json = info_json_bytes.decode('utf-8')
2723 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2724 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2727 info = json.loads(info_json)
2728 video_id = info['id']
2729 self.report_extraction('%s/%s' % (uploader, slug_title))
2731 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2732 request = compat_urllib_request.Request(streams_url)
2734 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2735 stream_json = stream_json_bytes.decode('utf-8')
2736 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2737 self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2740 streams = json.loads(stream_json)
2741 mediaURL = streams['http_mp3_128_url']
2746 'uploader': info['user']['username'],
# NOTE(review): created_at is passed through raw; the class docstring says
# upload_date should be YYYYMMDD — mismatch to confirm against callers.
2747 'upload_date': info['created_at'],
2748 'title': info['title'],
2750 'description': info['description'],
# NOTE(review): mangled extraction — fused line numbers, stripped indentation,
# missing interior lines. Kept byte-for-byte; comments only.
# Decodes the base64 `jsclassref` value from an infoq.com page into the RTMP
# stream path, and scrapes title/description from the page.
2754 class InfoQIE(InfoExtractor):
2755 """Information extractor for infoq.com"""
2756 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2758 def report_extraction(self, video_id):
2759 """Report information extraction."""
2760 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2762 def _real_extract(self, url):
2763 mobj = re.match(self._VALID_URL, url)
2765 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2768 webpage = self._download_webpage(url, video_id=url)
2769 self.report_extraction(url)
2772 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2774 self._downloader.trouble(u'ERROR: unable to extract video url')
# jsclassref is base64 of a percent-encoded RTMP path segment.
2776 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2777 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2780 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2782 self._downloader.trouble(u'ERROR: unable to extract video title')
2784 video_title = mobj.group(1)
2786 # Extract description
2787 video_description = u'No description available.'
2788 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2789 if mobj is not None:
2790 video_description = mobj.group(1)
# video id is the basename of the stream path, minus its extension.
2792 video_filename = video_url.split('/')[-1]
2793 video_id, extension = video_filename.split('.')
2799 'upload_date': None,
2800 'title': video_title,
2801 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2803 'description': video_description,
# NOTE(review): mangled extraction — fused line numbers, stripped indentation,
# missing interior lines (try:/return/else branches). Kept byte-for-byte.
# Marked _WORKING = False. Pulls the cloudcast JSON, then probes the listed
# audio-format URLs until one responds, honouring --format/--list-formats.
2808 class MixcloudIE(InfoExtractor):
2809 """Information extractor for www.mixcloud.com"""
2811 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2812 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2813 IE_NAME = u'mixcloud'
2815 def __init__(self, downloader=None):
2816 InfoExtractor.__init__(self, downloader)
2818 def report_download_json(self, file_id):
2819 """Report JSON download."""
2820 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2822 def report_extraction(self, file_id):
2823 """Report information extraction."""
2824 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2826 def get_urls(self, jsonData, fmt, bitrate='best'):
2827 """Get urls from 'audio_formats' section in json"""
2830 bitrate_list = jsonData[fmt]
2831 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
# max() on the bitrate keys picks the highest available quality.
2832 bitrate = max(bitrate_list) # select highest
2834 url_list = jsonData[fmt][bitrate]
2835 except TypeError: # we have no bitrate info.
2836 url_list = jsonData[fmt]
2839 def check_urls(self, url_list):
2840 """Returns 1st active url from list"""
2841 for url in url_list:
2843 compat_urllib_request.urlopen(url)
2845 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2850 def _print_formats(self, formats):
2851 print('Available formats:')
2852 for fmt in formats.keys():
2853 for b in formats[fmt]:
2855 ext = formats[fmt][b][0]
2856 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2857 except TypeError: # we have no bitrate info
2858 ext = formats[fmt][0]
2859 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2862 def _real_extract(self, url):
2863 mobj = re.match(self._VALID_URL, url)
2865 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2867 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on re match groups is py2-only; under py3
# these are str and would raise AttributeError — consistent with _WORKING=False.
2868 uploader = mobj.group(1).decode('utf-8')
2869 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2871 # construct API request
2872 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2873 # retrieve .json file with links to files
2874 request = compat_urllib_request.Request(file_url)
2876 self.report_download_json(file_url)
2877 jsonData = compat_urllib_request.urlopen(request).read()
2878 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2879 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2883 json_data = json.loads(jsonData)
2884 player_url = json_data['player_swf_url']
2885 formats = dict(json_data['audio_formats'])
2887 req_format = self._downloader.params.get('format', None)
2890 if self._downloader.params.get('listformats', None):
2891 self._print_formats(formats)
2894 if req_format is None or req_format == 'best':
# Probe formats in dict order until one format yields a live URL.
2895 for format_param in formats.keys():
2896 url_list = self.get_urls(formats, format_param)
2898 file_url = self.check_urls(url_list)
2899 if file_url is not None:
2902 if req_format not in formats:
2903 self._downloader.trouble(u'ERROR: format is not available')
2906 url_list = self.get_urls(formats, req_format)
2907 file_url = self.check_urls(url_list)
2908 format_param = req_format
2911 'id': file_id.decode('utf-8'),
2912 'url': file_url.decode('utf-8'),
2913 'uploader': uploader.decode('utf-8'),
2914 'upload_date': None,
2915 'title': json_data['name'],
2916 'ext': file_url.split('.')[-1].decode('utf-8'),
2917 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2918 'thumbnail': json_data['thumbnail_url'],
2919 'description': json_data['description'],
2920 'player_url': player_url.decode('utf-8'),
# NOTE(review): mangled extraction — fused line numbers, stripped indentation,
# missing interior lines (try:/return/dict braces). Kept byte-for-byte.
# Three-way dispatch on the matched URL: a specific video (course+video), a
# course page (references to its VideoPage links), or the root home page
# (references to all CoursePage links). Reference entries are re-fed through
# self.extract recursively.
2923 class StanfordOpenClassroomIE(InfoExtractor):
2924 """Information extractor for Stanford's Open ClassRoom"""
2926 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2927 IE_NAME = u'stanfordoc'
2929 def report_download_webpage(self, objid):
2930 """Report information extraction."""
2931 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2933 def report_extraction(self, video_id):
2934 """Report information extraction."""
2935 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2937 def _real_extract(self, url):
2938 mobj = re.match(self._VALID_URL, url)
2940 raise ExtractorError(u'Invalid URL: %s' % url)
2942 if mobj.group('course') and mobj.group('video'): # A specific video
2943 course = mobj.group('course')
2944 video = mobj.group('video')
2946 'id': course + '_' + video,
2948 'upload_date': None,
2951 self.report_extraction(info['id'])
2952 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2953 xmlUrl = baseUrl + video + '.xml'
2955 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2956 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2957 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2959 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2961 info['title'] = mdoc.findall('./title')[0].text
# Media URL is relative to the course's videos/ directory.
2962 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2964 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2966 info['ext'] = info['url'].rpartition('.')[2]
2968 elif mobj.group('course'): # A course page
2969 course = mobj.group('course')
2974 'upload_date': None,
2977 coursepage = self._download_webpage(url, info['id'],
2978 note='Downloading course info page',
2979 errnote='Unable to download course info page')
2981 m = re.search('<h1>([^<]+)</h1>', coursepage)
2983 info['title'] = unescapeHTML(m.group(1))
2985 info['title'] = info['id']
2987 m = re.search('<description>([^<]+)</description>', coursepage)
2989 info['description'] = unescapeHTML(m.group(1))
# orderedSet dedupes while keeping first-seen order of the video links.
2991 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2994 'type': 'reference',
2995 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2999 for entry in info['list']:
3000 assert entry['type'] == 'reference'
3001 results += self.extract(entry['url'])
3005 'id': 'Stanford OpenClassroom',
3008 'upload_date': None,
3011 self.report_download_webpage(info['id'])
3012 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3014 rootpage = compat_urllib_request.urlopen(rootURL).read()
3015 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3016 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3019 info['title'] = info['id']
3021 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3024 'type': 'reference',
3025 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3030 for entry in info['list']:
3031 assert entry['type'] == 'reference'
3032 results += self.extract(entry['url'])
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url  # default to http when no scheme was given
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # song name, performer and playlist URI come from <meta> tags
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract song name')
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract performer')
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        # NOTE(review): message is missing the word "extract" ("unable to mtvn_uri")
        self._downloader.trouble(u'ERROR: unable to mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        self._downloader.trouble(u'ERROR: unable to extract content id')
        content_id = mobj.group(1)

        # mediaGen endpoint returns an XML document listing the renditions
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        _,_,ext = rendition.attrib['type'].partition('/')  # MIME subtype used as extension
        format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
        video_url = rendition.find('./src').text
        self._downloader.trouble('Invalid rendition field.')

        # fields of the returned info dict
        'uploader': performer,
        'upload_date': None,
        'title': video_title,
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com multi-segment videos."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

        # NOTE(review): body of the session-id generator (presumably _gen_sid,
        # its def line is not visible here): millisecond timestamp plus two
        # random components, concatenated as a decimal string.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Build the seeded character shuffle used to decode file ids."""
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        # linear-congruential step; each round picks and removes one char
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)

    def _get_file_id(self, fileId, seed):
        """Decode the obfuscated '*'-separated file id using the seed."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId.append(mixed[int(ch)])  # map each index back into the mix string
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # JSON playlist endpoint for this video id
        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        self.report_download_webpage(video_id)
        jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        self.report_extraction(video_id)
        jsonstr = jsondata.decode('utf-8')
        config = json.loads(jsonstr)

        video_title = config['data'][0]['title']
        seed = config['data'][0]['seed']

        # pick a format: requested, 'best' (prefers hd2), or 'worst'
        format = self._downloader.params.get('format', None)
        supported_format = list(config['data'][0]['streamfileids'].keys())
        if format is None or format == 'best':
            if 'hd2' in supported_format:
        elif format == 'worst':
        fileid = config['data'][0]['streamfileids'][format]
        keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
        self._downloader.trouble(u'ERROR: unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            # splice the segment index (hex) into the decoded file id
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            # one info dict per segment ("partNN")
            'id': '%s_part%02d' % (video_id, index),
            'url': download_url,
            'upload_date': None,
            'title': video_title,

            files_info.append(info)
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    # page-scraping patterns: flash video URL, page title, thumbnail
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Get webpage content
        webpage_bytes = compat_urllib_request.urlopen(url).read()
        webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)

        # flv URL is percent-encoded inside the page
        result = re.search(self.VIDEO_URL_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = result.group(1)

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = result.group(1)

        # fields of the returned info dict
        'upload_date': None,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': None,
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry."""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the extracted entry date."""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the extracted uploader name."""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the extracted title."""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))

        # Extract update date
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        upload_date = mobj.group(1)
        # Convert timestring to a format suitable for filename
        upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
        upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract the uploader (post author)
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Get the first line for title
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video page URL')

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
        self.report_extract_vid_page(video_page)

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
        video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        # fields of the returned info dict
        'uploader': uploader,
        'upload_date': upload_date,
        'title': video_title,
        'ext': video_extension,
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]  # strip directory-index suffix

        webpage = self._download_webpage(url, video_id)

        # direct CDN URL is derived from the page path, not scraped
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # scrape a single property from the already-downloaded page
            m = re.search(rexp, webpage)
            return unescapeHTML(m.group(1))

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        # fields of the returned info dict
        'id': shortened_video_id,
        # NOTE(review): 'uploader_date' looks like a typo for 'upload_date' — confirm
        'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
        'description': _findProp(r'<div class="description">(.*?)</h1>'),
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # group 1: channel name; optional group 2: broadcast id after /b/
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100  # API page size used for channel archives
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one API page and convert its clips to info dicts."""
        urlh = compat_urllib_request.urlopen(url)
        webpage_bytes = urlh.read()
        webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))

        response = json.loads(webpage)
        # NOTE(review): isinstance(response, list) would be the idiomatic check
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
        for clip in response:
            video_url = clip['video_file_url']
            video_extension = os.path.splitext(video_url)[1][1:]
            video_date = re.sub('-', '', clip['start_time'][:10])  # YYYY-MM-DD -> YYYYMMDD
            video_uploader_id = clip.get('user_id', clip.get('channel_id'))
            video_id = clip['id']
            video_title = clip.get('title', video_id)
            # fields of the per-clip info dict
            'title': video_title,
            'uploader': clip.get('channel_name', video_uploader_id),
            'uploader_id': video_uploader_id,
            'upload_date': video_date,
            'ext': video_extension,
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        # lastindex == 1 means only the channel matched (paged archive);
        # otherwise a single broadcast id was given
        if mobj.lastindex == 1:
            api += '/channel/archives/%s.json'
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        limit = self._JUSTIN_PAGE_LIMIT
        # page through the API until a short page signals the end
        self.report_download_page(video_id, offset)
        page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
        page_count, page_info = self._parse_page(page_url)
        info.extend(page_info)
        if not paged or page_count != limit:
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # second <source> tag inside the <video> element holds the file URL
        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        self._downloader.trouble(u'ERROR: unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        self._downloader.trouble(u'Cannot find video title')
        title = unescapeHTML(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(m.group('desc'))

        # field of the returned info dict
        'description': desc,
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # the tweet's status id names the .mov file on the CDN (see below)
        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        self._downloader.trouble(u'ERROR: Cannot find status ID')
        status_id = m.group(1)

        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        self._downloader.trouble(u'WARNING: Cannot find description')
        # strip embedded <a> tags, then unescape the tweet text
        desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        self._downloader.trouble(u'ERROR: Cannot find uploader')
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        self._downloader.trouble(u'ERROR: Cannot find upload date')
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        # fields of the returned info dict
        'description': desc,
        'uploader': uploader,
        'uploader_id': uploader_id,
        'internal_id': status_id,
        'upload_date': upload_date
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers."""

    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is required because _VALID_URL is a commented pattern
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        # matches the per-movie JS blobs on the game's /video/ page
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        # zip the three parallel scans: URL blob, title span, thumbnail
        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
            # fields of the per-video info dict
            'title': unescapeHTML(title),
            'thumbnail': video_thumb
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # direct CDN URL is derived from the recording id
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'data-title="(?P<title>.+)"',webpage)
        title = m.group('title')
        # uploader is the numeric channel id on the page
        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
        uploader = m.group('uploader')
        # field of the returned info dict
        'uploader': uploader
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)
        # show metadata is embedded as JSON assigned to gon.show
        m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        raise ExtractorError(u'Cannot find metadata')
        json_data = m.group(1)

        data = json.loads(json_data)
        except ValueError as e:
        raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'  # request the 256 kbit stream
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
        # fields of the returned info dict (optional ones use .get())
        'title': data['title'],
        'description': data.get('teaser_text'),
        'location': data.get('country_of_origin'),
        'uploader': data.get('host', {}).get('name'),
        'uploader_id': data.get('host', {}).get('slug'),
        'thumbnail': data.get('image', {}).get('large_url_2x'),
        'duration': data.get('duration'),
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the single format entry matching req_format."""
        if(x["format"]==req_format):

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # bypass the age gate with a pre-set cookie
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        self._downloader.to_stderr(u'WARNING: unable to extract video date')
        upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        self._downloader.to_stderr(u'WARNING: unable to extract uploader')
        video_uploader = None  # uploader is optional; warn and continue
        video_uploader = result.group('uploader').strip()
        video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        # A link looks like this:
        # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
        # A path looks like this:
        # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
        video_url = unescapeHTML( link )
        path = compat_urllib_parse_urlparse( video_url ).path
        extension = os.path.splitext( path )[1][1:]
        # path component 4 encodes "<size>_<bitrate>_..."; keep the first two
        format = path.split('/')[4].split('_')[:2]
        format = "-".join( format )
        title = u'%s-%s-%s' % (video_title, size, bitrate)

        # fields of the per-format info dict
        'uploader': video_uploader,
        'upload_date': upload_date,
        'description': None,

        # honor --list-formats before selecting anything
        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        # format selection: best (default), worst, all, or a specific format
        if req_format is None or req_format == 'best':
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
        format = self._specific( req_format, formats )
        self._downloader.trouble(u'ERROR: requested format not available')
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # id and title both come straight from the URL
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # the flv URL is percent-encoded inside a JS player config
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group('url'))

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        # NOTE(review): error message says "title" but this lookup is for the upload date
        self._downloader.trouble(u'ERROR: unable to extract video title')
        upload_date = result.group('date')

        info = {'id': video_id,
                'upload_date': upload_date,
                'title': video_title,
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
        raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        raise ExtractorError(u'ERROR: unable to extract embed page')

        # the embed page carries the real (numeric) video id and the file URL
        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = result.group('source')

        info = {'id': video_id,
                'title': video_title,
                'player_url': embed_page_url}
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (one entry per track)."""

    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # mix metadata is embedded as a JS assignment on the page
        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        raise ExtractorError(u'Cannot find trax information')
        json_like = m.group(1)
        data = json.loads(json_like)

        # random session token required by the play API
        session = str(random.randint(0, 1000000000))
        track_count = data['tracks_count']
        # NOTE(review): mix_id is used here and below but its assignment is not
        # visible in this excerpt — presumably taken from the parsed mix data
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url

        # walk the play API one track at a time until at_last_track
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            # fields of the per-track info dict
            'id': track_data['id'],
            'url': track_data['track_file_stream_url'],
            'title': track_data['performer'] + u' - ' + track_data['name'],
            'raw_title': track_data['name'],
            'uploader_id': data['user']['login'],
            if api_data['set']['at_last_track']:
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""

    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # video and thumbnail URLs are derived directly from the id
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        title = unescapeHTML(m.group('title'))
        m = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)
        uploader = unescapeHTML(m.group('uploader'))
        # fields of the returned info dict
        'thumbnail': thumbnail,
        'uploader': uploader
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""

    _VALID_URL=r'''http://www.ted.com/
        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
        ((?P<type_talk>talks)) # We have a simple talk
        /(?P<name>\w+) # Here goes the name and then ".html"

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is required because _VALID_URL is a commented pattern
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # single talk -> one info dict; playlist -> list of talks
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        playlist_id=m.group('playlist_id')
        name=m.group('name')
        self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
        return self._playlist_videos_info(url,name,playlist_id)

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
        ([.\s]*?)data-playlist_item_id="(\d+)"
        ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        # pair each <li> talk entry with its title/link paragraph
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            info.append(self._talk_info(talk_url,video_id))

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<h1><span id="altHeadline" >(?P<title>.*)</span></h1>'
        title=re.search(title_RE, webpage).group('title')
        # talkDetails JS blob carries the numeric id and the download slug
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
            "id":(?P<videoID>[\d]+).*?
            "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        # field of the returned info dict
        'thumbnail': thumb_match.group('thumbnail')
4069 class MySpassIE(InfoExtractor):
# Extractor for myspass.de. The site exposes an XML metadata endpoint
# keyed by the numeric video id, which this IE parses for the FLV URL,
# title, format, description and preview image.
4070     _VALID_URL = r'http://www.myspass.de/.*'
4072     def _real_extract(self, url):
4073         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
4075         # video id is the last path element of the URL
4076         # usually there is a trailing slash, so also try the second but last
4077         url_path = compat_urllib_parse_urlparse(url).path
4078         url_parent_path, video_id = os.path.split(url_path)
# If the URL ended with '/', the first split yields an empty tail, so
# fall back to the last element of the parent path.
# NOTE(review): the `if not video_id:` guard line (4079) is elided here.
4080             _, video_id = os.path.split(url_parent_path)
# Fetch and parse the XML metadata document for this id.
4083         metadata_url = META_DATA_URL_TEMPLATE % video_id
4084         metadata_text = self._download_webpage(metadata_url, video_id)
4085         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
4087         # extract values from metadata
4088         url_flv_el = metadata.find('url_flv')
4089         if url_flv_el is None:
4090             self._downloader.trouble(u'ERROR: unable to extract download url')
# NOTE(review): the `return` after trouble() (line 4091) is elided —
# presumably present, since execution must not continue on error.
4092         video_url = url_flv_el.text
# Derive the file extension from the media URL (strip the dot).
4093         extension = os.path.splitext(video_url)[1][1:]
4094         title_el = metadata.find('title')
4095         if title_el is None:
4096             self._downloader.trouble(u'ERROR: unable to extract title')
4098         title = title_el.text
4099         format_id_el = metadata.find('format_id')
4100         if format_id_el is None:
# NOTE(review): lines 4101-4102 elided — presumably a fallback
# assignment plus else-branch; confirm against the full file.
4103             format = format_id_el.text
# Description and thumbnail are optional in the metadata document.
4104         description_el = metadata.find('description')
4105         if description_el is not None:
4106             description = description_el.text
4109         imagePreview_el = metadata.find('imagePreview')
4110         if imagePreview_el is not None:
4111             thumbnail = imagePreview_el.text
# NOTE(review): lines 4112-4119 (opening of the returned info dict and
# the else-defaults for description/thumbnail) are elided.
4120             'thumbnail': thumbnail,
4121             'description': description
4125 def gen_extractors():
4126 """ Return a list of an instance of every supported extractor.
4127 The order does matter; the first extractor matched is the one handling the URL.
4130 YoutubePlaylistIE(),
4154 StanfordOpenClassroomIE(),