# commit: working - worldstarhiphop IE
# file: youtube-dl / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor takes a URL and produces the metadata of the
    video (or videos) behind it: the real media URL, the title, the
    uploader, and so on.  That metadata is returned as a list of
    dictionaries which is handed to the FileDownloader; the downloader may
    then fetch the media, among other outcomes.

    Mandatory fields in each dictionary:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional fields:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All of these fields should be Unicode strings.

    Subclasses must define a _VALID_URL regexp and override
    _real_initialize() and _real_extract(); _real_extract() must return a
    *list* of dictionaries shaped as described above.  Subclasses should
    normally also be registered in the list of extractors.

    Finally, broken extractors should set _WORKING to False so that users
    are warned and the tests skip them.
    """

    # Shared defaults; instances overwrite _ready/_downloader in __init__.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Create the extractor and attach an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Return True if this extractor can handle the given URL."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Run one-time initialization (authentication, etc.), once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extract URL information and return it as a list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the downloader this extractor reports progress to."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Public name is the class name minus its trailing "IE"
        # (e.g. YoutubeIE -> Youtube).
        class_name = type(self).__name__
        return class_name[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        message = note if note is not None else u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, message))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # Preserve the original traceback for debugging.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        handle = self._request_webpage(url_or_request, video_id, note, errnote)
        raw = handle.read()
        # Be lenient about encoding errors; pages are assumed to be UTF-8.
        return raw.decode('utf-8', 'replace')
131
132
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; anything not listed is assumed to be flv.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HEIGHTxWIDTH" dimensions string (for --get-format display).
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL; defer to the playlist IE.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default duration when the attribute is absent
            start = float(start)
            end = start + float(dur)
            # SRT timestamps: HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _extract_subtitles(self, video_id):
        """Download closed captions for video_id.

        Returns a (warning, srt) pair: on success warning is None and srt
        holds the subtitles in SRT format; on failure srt is None and
        warning carries a printable message.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Map language code -> track name.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language priority: user choice, then English, then whatever exists.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if srt_lang not in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        params = compat_urllib_parse.urlencode({
            'lang': srt_lang,
            'name': srt_lang_list[srt_lang].encode('utf-8'),
            'v': video_id,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: Did not fetch video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print one 'itag : ext [dimensions]' line per available format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the site language and, if credentials exist, log in and
        confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Hidden anti-forgery tokens embedded in the login form.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID embedded in url, or None (after reporting)."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # Group 2 of _VALID_URL is the bare video ID.
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Download the watch page and video info, pick formats, and return
        one info dictionary per selected format."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try each 'el' variant until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                    break  # parsed successfully; don't reinterpret the normalized date
                except ValueError:  # was a bare except, which also swallowed KeyboardInterrupt etc.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            # 'sig' is dereferenced below when building url_map, so require it
            # here too (entries without a signature used to raise KeyError).
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud and 'sig' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
591
592
593 class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Watch-page URL pattern: group(1) is the video id, group(2) the slug.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Family-filter disclaimer page, fetched once during initialization.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    # Endpoint that receives the age-confirmation form POST.
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'
600
601     def __init__(self, downloader=None):
602         InfoExtractor.__init__(self, downloader)
603
604     def report_disclaimer(self):
605         """Report disclaimer retrieval."""
606         self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
607
608     def report_age_confirmation(self):
609         """Report attempt to confirm age."""
610         self._downloader.to_screen(u'[metacafe] Confirming age')
611
612     def report_download_webpage(self, video_id):
613         """Report webpage download."""
614         self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
615
616     def report_extraction(self, video_id):
617         """Report information extraction."""
618         self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
619
620     def _real_initialize(self):
621         # Retrieve disclaimer
622         request = compat_urllib_request.Request(self._DISCLAIMER)
623         try:
624             self.report_disclaimer()
625             disclaimer = compat_urllib_request.urlopen(request).read()
626         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
627             self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
628             return
629
630         # Confirm age
631         disclaimer_form = {
632             'filters': '0',
633             'submit': "Continue - I'm over 18",
634             }
635         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
636         try:
637             self.report_age_confirmation()
638             disclaimer = compat_urllib_request.urlopen(request).read()
639         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
640             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
641             return
642
643     def _real_extract(self, url):
644         # Extract id and simplified title from URL
645         mobj = re.match(self._VALID_URL, url)
646         if mobj is None:
647             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
648             return
649
650         video_id = mobj.group(1)
651
652         # Check if video comes from YouTube
653         mobj2 = re.match(r'^yt-(.*)$', video_id)
654         if mobj2 is not None:
655             self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
656             return
657
658         # Retrieve video webpage to extract further information
659         request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
660         try:
661             self.report_download_webpage(video_id)
662             webpage = compat_urllib_request.urlopen(request).read()
663         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
664             self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
665             return
666
667         # Extract URL, uploader and title from webpage
668         self.report_extraction(video_id)
669         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
670         if mobj is not None:
671             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
672             video_extension = mediaURL[-3:]
673
674             # Extract gdaKey if available
675             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
676             if mobj is None:
677                 video_url = mediaURL
678             else:
679                 gdaKey = mobj.group(1)
680                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
681         else:
682             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
683             if mobj is None:
684                 self._downloader.trouble(u'ERROR: unable to extract media URL')
685                 return
686             vardict = compat_parse_qs(mobj.group(1))
687             if 'mediaData' not in vardict:
688                 self._downloader.trouble(u'ERROR: unable to extract media URL')
689                 return
690             mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
691             if mobj is None:
692                 self._downloader.trouble(u'ERROR: unable to extract media URL')
693                 return
694             mediaURL = mobj.group(1).replace('\\/', '/')
695             video_extension = mediaURL[-3:]
696             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
697
698         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
699         if mobj is None:
700             self._downloader.trouble(u'ERROR: unable to extract title')
701             return
702         video_title = mobj.group(1).decode('utf-8')
703
704         mobj = re.search(r'submitter=(.*?);', webpage)
705         if mobj is None:
706             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
707             return
708         video_uploader = mobj.group(1)
709
710         return [{
711             'id':       video_id.decode('utf-8'),
712             'url':      video_url.decode('utf-8'),
713             'uploader': video_uploader.decode('utf-8'),
714             'upload_date':  None,
715             'title':    video_title,
716             'ext':      video_extension.decode('utf-8'),
717         }]
718
719
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract video info from a Dailymotion video URL."""
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The id is the URL fragment up to the first '_' or '?'.
        video_id = m.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page with the family filter disabled so restricted
        # videos are served as well.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        m = re.search(r'\s*var flashvars = (.*)', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(m.group(1))

        # Pick the best available quality, in descending preference order.
        max_quality = None
        for quality_key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if quality_key in flashvars:
                max_quality = quality_key
                self._downloader.to_screen(u'[dailymotion] Using %s' % quality_key)
                break
        if max_quality is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        m = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        # Unquote and unescape the JSON-escaped slashes.
        video_url = compat_urllib_parse.unquote(m.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        m = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(m.group('title'))

        # Try the regular owner markup first, then fall back to the
        # "official user" markup; missing uploader is only a warning.
        video_uploader = None
        owner_match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_match is not None:
            video_uploader = owner_match.group(1)
        else:
            official_match = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_match is not None:
                video_uploader = official_match.group(1)
            else:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # Upload date appears as DD-MM-YYYY; convert to YYYYMMDD.
        video_upload_date = None
        m = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if m is not None:
            video_upload_date = m.group(3) + m.group(2) + m.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
807
808
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract video info from a Photobucket URL."""
        # The video id is the .flv name captured from the 'current=' parameter.
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = match.group(1)
        video_extension = 'flv'

        # Download the page the URL points at.
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # The media URL lives in the video_src <link> element.
        self.report_extraction(video_id)
        match = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title and uploader are both taken from the page <title>.
        match = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = match.group(1).decode('utf-8')
        video_uploader = match.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
872
873
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from a Yahoo! Video URL.

        Non-/watch/ URLs are first resolved to a canonical
        http://video.yahoo.com/watch/<vid>/<id> URL and re-extracted once
        (new_video=False marks that recursion step).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # BUGFIX: group(1) captures the literal 'people'/'profile' path
        # segment; the uploader name is in group(2).
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (required by the playlist request below)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1015
1016
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from a Vimeo URL.

        Reads the embedded player config JSON for metadata, then builds
        a play_redirect URL from the request signature/timestamp and the
        best available codec/quality combination.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            # Normalize play_redirect_hls links to the canonical video page.
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # BUGFIX: was a bare `except:`, which also swallowed
            # SystemExit/KeyboardInterrupt. split(...)[1] raises
            # IndexError, json.loads raises ValueError.
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first codec found at the best available quality tier.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1135
1136
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return the raw body, or None after reporting trouble."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # urllib raises ValueError for syntactically invalid URLs.
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, match regex against it, and collect capture groups.

        matchTuples is a list of (group_index, key, error_message); each
        group is stored under key in the returned dict. Returns None
        (after reporting the given error) on any failure.
        NOTE(review): if fetch_webpage failed, page is None and re.search
        raises TypeError — presumably not hit in practice; confirm.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream JS indirection to locate the rtmp URL."""
        # Language code is the 4th path component from the end of a live URL.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))
        # NOTE(review): video_url is computed but never returned or used;
        # live-stream extraction appears unfinished.

    def extractPlus7Stream(self, url):
        """Resolve an Arte+7 page down to its video XML and build the info dict."""
        # Language code is the 3rd path component from the end.
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Final hop: the per-video XML with id, title, date and HD URL.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch to the live-stream or Plus7 extraction path."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live URLs end in index-<n>.html; those yield no info dict (see
        # the NOTE in extractLiveStream).
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1271
1272
1273 class GenericIE(InfoExtractor):
1274     """Generic last-resort information extractor."""
1275
1276     _VALID_URL = r'.*'
1277     IE_NAME = u'generic'
1278
    def __init__(self, downloader=None):
        # No extractor-specific state; delegate straight to the base class.
        InfoExtractor.__init__(self, downloader)
1281
1282     def report_download_webpage(self, video_id):
1283         """Report webpage download."""
1284         self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1285         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1286
1287     def report_extraction(self, video_id):
1288         """Report information extraction."""
1289         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1290
1291     def report_following_redirect(self, new_url):
1292         """Report information extraction."""
1293         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1294
    def _test_redirect(self, url):
        """Check whether url redirects (e.g. a URL shortener); if so,
        restart the download chain on the target URL.

        Returns True when a redirect was followed (extraction should
        stop), False when the URL resolves to itself.
        """
        class HeadRequest(compat_urllib_request.Request):
            # A Request that issues HEAD instead of GET, so redirects can
            # be resolved without downloading response bodies.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: a HEAD request carries no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the 405 response before retrying with GET.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # No redirect happened; continue with normal extraction.
        if url == new_url:
            return False

        # A redirect was followed: restart the download on the target URL.
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True
1349
1350     def _real_extract(self, url):
1351         if self._test_redirect(url): return
1352
1353         video_id = url.split('/')[-1]
1354         request = compat_urllib_request.Request(url)
1355         try:
1356             self.report_download_webpage(video_id)
1357             webpage = compat_urllib_request.urlopen(request).read()
1358         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1359             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1360             return
1361         except ValueError as err:
1362             # since this is the last-resort InfoExtractor, if
1363             # this error is thrown, it'll be thrown here
1364             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1365             return
1366
1367         self.report_extraction(video_id)
1368         # Start with something easy: JW Player in SWFObject
1369         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1370         if mobj is None:
1371             # Broaden the search a little bit
1372             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1373         if mobj is None:
1374             # Broaden the search a little bit: JWPlayer JS loader
1375             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1376         if mobj is None:
1377             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1378             return
1379
1380         # It's possible that one of the regexes
1381         # matched, but returned an empty group:
1382         if mobj.group(1) is None:
1383             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1384             return
1385
1386         video_url = compat_urllib_parse.unquote(mobj.group(1))
1387         video_id = os.path.basename(video_url)
1388
1389         # here's a fun little line of code for you:
1390         video_extension = os.path.splitext(video_id)[1][1:]
1391         video_id = os.path.splitext(video_id)[0]
1392
1393         # it's tempting to parse this further, but you would
1394         # have to take into account all the variations like
1395         #   Video Title - Site Name
1396         #   Site Name | Video Title
1397         #   Video Title - Tagline | Site Name
1398         # and so on and so forth; it's just not practical
1399         mobj = re.search(r'<title>(.*)</title>', webpage)
1400         if mobj is None:
1401             self._downloader.trouble(u'ERROR: unable to extract title')
1402             return
1403         video_title = mobj.group(1)
1404
1405         # video uploader is domain name
1406         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1407         if mobj is None:
1408             self._downloader.trouble(u'ERROR: unable to extract title')
1409             return
1410         video_uploader = mobj.group(1)
1411
1412         return [{
1413             'id':       video_id,
1414             'url':      video_url,
1415             'uploader': video_uploader,
1416             'upload_date':  None,
1417             'title':    video_title,
1418             'ext':      video_extension,
1419         }]
1420
1421
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles "ytsearch[N|all]:TERMS" pseudo-URLs: queries the GData API
    and queues up to N matching video URLs on the downloader.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        # NOTE(review): query is utf-8-encoded bytes here but is decoded with
        # the locale's preferred encoding — confirm these agree on all setups.
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first ':' only, so search terms that themselves
        # contain colons are preserved intact (plain split would raise
        # "too many values to unpack").
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the literal "ytsearch"
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            # start-index is 1-based; each API page carries up to 50 results.
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                self._downloader.trouble(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never request more than the API reports to exist.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1500
1501
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles "gvsearch[N|all]:TERMS" pseudo-URLs: scrapes the Google Video
    result pages and queues up to N video URLs on the downloader.
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        # NOTE(review): query is utf-8-encoded bytes but is decoded with the
        # locale's preferred encoding — confirm these agree on all setups.
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first ':' only, so search terms containing colons
        # survive (plain split would raise "too many values to unpack").
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the literal "gvsearch"
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            # 'start' is an absolute result offset; 10 results per page.
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            # No "next" link means this was the last page: flush what we have.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
1582
1583
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Handles "yvsearch[N|all]:TERMS" pseudo-URLs: scrapes the Yahoo! Video
    result pages and queues up to N video URLs on the downloader.
    """

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        # NOTE(review): query is utf-8-encoded bytes but is decoded with the
        # locale's preferred encoding — confirm these agree on all setups.
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first ':' only, so search terms containing colons
        # survive (plain split would raise "too many values to unpack").
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the literal "yvsearch"
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            # No "next" link means this was the last page: flush what we have.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1668
1669
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Resolves playlist/course/artist/user-uploads URLs (and bare PL/EC/UU
    ids) through the GData JSON API and queues every video URL, in playlist
    order, on the downloader.
    """

    # Verbose regex: group 1 captures the id from full youtube.com URLs,
    # group 2 captures a bare PL/EC/UU playlist id given on its own.
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  user/.*?/user/
                        |  p/
                        |  user/.*?#[pg]/c/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50  # GData page size; a short page marks the last one
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL requires the re.VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Page through the playlist feed and download each video in order."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []  # list of (position-in-playlist, video-page URL) pairs

        while True:
            self.report_download_page(playlist_id, page_num)

            # start-index is 1-based, hence the "+ 1".
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.trouble(u'ERROR: Invalid JSON in API response: ' + compat_str(err))
                return

            if not 'feed' in response or not 'entry' in response['feed']:
                self._downloader.trouble(u'ERROR: Got a malformed response from YouTube API')
                return
            # Entries without 'content' (e.g. unavailable videos) are skipped.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            # A page shorter than _MAX_RESULTS means it was the last one.
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # Sort by playlist position so downloads follow the playlist order.
        videos = [v[1] for v in sorted(videos)]
        total = len(videos)

        # Apply --playlist-start / --playlist-end (1-based; -1 = to the end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            videos = videos[playliststart:]
        else:
            videos = videos[playliststart:playlistend]

        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))

        for video in videos:
            self._downloader.download([video])
        return
1762
1763
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    Walks the channel's paginated video-list pages, collects every watch-page
    video id, and queues the corresponding URLs on the downloader.
    """

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of the channel and download them."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = match.group(1)
        video_ids = []
        pagenum = 1

        # Keep fetching pages until the "Next »" marker disappears.
        while True:
            self.report_download_page(channel_id, pagenum)
            request = compat_urllib_request.Request(self._TEMPLATE_URL % (channel_id, pagenum))
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect the watch-page ids on this page, deduplicated but in
            # first-seen order.
            page_ids = []
            for id_match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                candidate = id_match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1814
1815
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Pages through a user's uploads feed via the GData API, collects every
    video id, applies --playlist-start/--playlist-end, and queues the URLs
    on the downloader.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50  # maximum results per GData query
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect all upload ids for the user, then download them."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            # GData start-index is 1-based.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Apply --playlist-start / --playlist-end (1-based; -1 = to the end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1897
1898
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves a user page to its numeric users_id, then pages through the
    mobile episode-list endpoint and queues every episode URL on the
    downloader.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12  # episodes per AJAX page; a short page marks the end
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # The numeric users_id is required for the episode-list endpoint.
        # Report a clear error instead of crashing with an uncaught
        # AttributeError when the attribute is missing from the page.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract users_id from %s' % url)
            return
        page_base = page_base % mobj.group(1)


        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # compat_str for consistency with the rest of the file.
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            # NOTE(review): membership is tested against the raw group but the
            # unescaped value is appended, so dedup can miss entity-containing
            # ids — preserved as-is to avoid a behavior change; verify.
            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Apply --playlist-start / --playlist-end (1-based; -1 = to the end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1988
1989
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com

    Submits the 'Free download' form and scrapes the direct fileshare URL
    and title from the resulting page.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Fetch the (English-locale) file page and extract the direct URL."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        # (POSTing gateway_result=1 simulates pressing the button).
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # NOTE(review): webpage is undecoded bytes here and the groups are
        # later .decode()d — this only works on Python 2 str semantics;
        # confirm whether this extractor must also run under Python 3.
        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
2048
2049
2050 class FacebookIE(InfoExtractor):
2051     """Information Extractor for Facebook"""
2052
2053     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2054     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2055     _NETRC_MACHINE = 'facebook'
2056     IE_NAME = u'facebook'
2057
    def report_login(self):
        """Report attempt to log in."""
        # IE_NAME supplies the log prefix, e.g. "[facebook]".
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
2061
    def _real_initialize(self):
        """Log in to Facebook if credentials are available.

        Credentials come from --username/--password or, with --usenetrc,
        from the 'facebook' machine entry in .netrc. Without credentials
        this is a no-op; login failures only emit warnings, they do not
        abort extraction.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # No credentials found anywhere: proceed anonymously.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means authentication failed.
            # NOTE(review): login_results is bytes and the pattern is str —
            # this match only works on Python 2; confirm target interpreter.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return
2105
2106     def _real_extract(self, url):
2107         mobj = re.match(self._VALID_URL, url)
2108         if mobj is None:
2109             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2110             return
2111         video_id = mobj.group('ID')
2112
2113         url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2114         webpage = self._download_webpage(url, video_id)
2115
2116         BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
2117         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2118         m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2119         if not m:
2120             raise ExtractorError(u'Cannot parse data')
2121         data = dict(json.loads(m.group(1)))
2122         params_raw = compat_urllib_parse.unquote(data['params'])
2123         params = json.loads(params_raw)
2124         video_url = params['hd_src']
2125         if not video_url:
2126             video_url = params['sd_src']
2127         if not video_url:
2128             raise ExtractorError(u'Cannot find video URL')
2129         video_duration = int(params['video_duration'])
2130
2131         m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2132         if not m:
2133             raise ExtractorError(u'Cannot find title in webpage')
2134         video_title = unescapeHTML(m.group(1))
2135
2136         info = {
2137             'id': video_id,
2138             'title': video_title,
2139             'url': video_url,
2140             'ext': 'mp4',
2141             'duration': video_duration,
2142             'thumbnail': params['thumbnail_src'],
2143         }
2144         return [info]
2145
2146
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Handles /play/ redirect URLs (resolved to the canonical item URL)
    and regular episode URLs; metadata is fetched by re-requesting the
    page with skin=json, which makes blip.tv return JSON instead of HTML.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the filename extension at the end of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL serves the media file directly."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Return a one-element list with the info dict for *url*.

        /play/ URLs are resolved by following the HTTP redirect, whose
        fragment carries a 'file' parameter ending in the item id, then
        extraction is re-run on http://blip.tv/a/a-<id>.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            # The redirect target's fragment holds 'file=<...>/<id>'.
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        # skin=json makes blip.tv serve the metadata as JSON.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): str.decode assumes a Python 2 byte string;
                # under Python 3 this would raise AttributeError -- confirm.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                # 'urlhandle' lets the downloader reuse this open response.
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Single posts arrive wrapped in a 'Post' object; otherwise
                # the fields sit at the top level of the response.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # blip.tv dates look like '05-31-12 08:24PM'.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        return [info]
2247
2248
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the FLV URL and title from a myvideo.de watch page.

        Returns a one-element list with the standard info dictionary, or
        None after reporting an error through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUGFIX: was 'self._download.trouble', which raised
            # AttributeError instead of reporting the invalid URL.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link encodes the media server path; the video
        # itself lives at <that path>/<id>.flv.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2297
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Bitrates that may appear in the mediaGen config, lowest first;
    # the two tables below map each bitrate to container and resolution.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs re.VERBOSE to compile.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report the download of one media item's configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report the download of the episode's MRSS index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the available bitrate/extension/resolution table."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract one info dict per video part of the episode or clip.

        Flow: resolve shortnames, follow the newest-episode redirect,
        find the mtvnservices URI in the page, download the MRSS index,
        then fetch a mediaGen config per item and rewrite its RTMP URL
        to a plain HTTP one.  Returns a list of info dictionaries.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Shortnames like :tds expand to the show's full-episodes page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # An empty 'episode' group means "download the newest episode":
            # the server redirects to the concrete episode URL.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Re-parse the redirected URL to learn the actual episode.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        # One <item> per video part; each has its own mediaGen config.
        for partNum,itemEl in enumerate(itemEls):
            # guid looks like '...:<show>.com:...:<shortMediaId>'.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # Collect (bitrate, rtmp-url) pairs from the config.
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp(e) URL into a plain HTTP URL on the CDN.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2492
2493
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report the download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the video URL from an Escapist episode page.

        Reads the og:video player URL from the page's meta tags, fetches
        the player config it points at, and takes the second playlist
        entry's URL.  Returns a one-element list with the info dict.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Decode using the charset advertised in Content-Type,
            # falling back to UTF-8 when none is given.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # NOTE(review): the four .group() calls below have no None checks;
        # a page-layout change would raise AttributeError here instead of
        # producing a trouble() report -- confirm whether that is intended.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config location in its query string.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        # Entry [1] holds the actual video; presumably [0] is a preroll or
        # thumbnail entry -- TODO confirm against a live config.
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2567
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Marked not working; presumably the base class uses this flag to
    # skip/deprioritize the extractor -- confirm against InfoExtractor.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report the download of the f4m manifest."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract video info via the moogaloop metadata XML and the
        Adobe HDS (f4m) manifest it references.

        Returns a one-element list with the info dict ('ext' is 'f4f',
        the HDS fragment format), or None after reporting an error.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            # findall(...)[0] raises IndexError when an element is missing,
            # which the except below turns into a single error report.
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        # hdcore is required by the HDS server to serve the manifest.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # The f4m manifest is namespaced; media/@url is the fragment
            # base name and <id> the stream id used to build the URL.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        # Build the URL of the first HDS segment/fragment directly.
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2638
2639
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the FLV URL, title and thumbnail from an xvideos page.

        Returns a one-element list with the standard info dictionary, or
        None after reporting an error through the downloader.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = url_match.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The media URL is stored URL-encoded in the flv_url parameter.
        media_match = re.search(r'flv_url=(.+?)&', webpage)
        if media_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(media_match.group(1))

        # Title comes from the <title> tag, minus the site suffix.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        # The full thumbnail URL (group 0) is taken as-is.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2697
2698
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com.

    The track page is first resolved through api.soundcloud.com's
    resolve.json endpoint to obtain the track metadata (including its
    numeric id); the stream definitions are then requested from
    api.sndcdn.com and the 128 kbit MP3 stream URL is returned.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report that the track id is being resolved."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report that the stream definitions are being retrieved."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Uploader name and song slug are both encoded in the URL path.
        uploader, slug_title = match.group(1), match.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2771
2772
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com presentation videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the RTMP video URL, title and description from an InfoQ page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the real id is base64-encoded and URL-quoted
        # inside the page's jsclassref attribute.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description (optional; keep a placeholder when absent)
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        # Split on the LAST dot only, so filenames containing extra dots
        # do not break the two-value unpacking (split('.') would raise).
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2826
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        When the requested bitrate is missing or 'best', the highest
        available one is chosen. Formats without bitrate sub-dicts map
        directly to a URL list.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none respond."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                pass  # dead link; try the next candidate

        return None

    def _print_formats(self, formats):
        """List every available format (and bitrate, if known) on stdout."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # NOTE: regex groups are already text; the previous .decode('utf-8')
        # calls raised AttributeError on Python 3 str objects.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # probe each format until one yields a live URL
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': u'NA' if format_param is None else format_param,
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2941
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on URL form: a single video, a course page (playlist of
        videos), or the root catalog (playlist of courses)."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall(r'<a href="(VideoPage\.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage_bytes = compat_urllib_request.urlopen(rootURL).read()
                # urlopen() yields bytes; decode so the str regex below
                # works on Python 3 (mixing str patterns with bytes raises).
                rootpage = rootpage_bytes.decode('utf-8', 'ignore')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            links = orderedSet(re.findall(r'<a href="(CoursePage\.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3053
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # The webpage is already decoded text; the former
        # .decode('iso-8859-1') calls crashed on Python 3 str objects.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # message previously read 'unable to mtvn_uri' (missing verb)
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        # Ask the mediaGen service for the list of renditions of this video.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3133
3134
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves a video as multiple flv/mp4 segments; one info dict is
    returned per segment, with ids of the form '<video_id>_partNN'.
    """
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Build a pseudo-random session id: ms timestamp + two random ints."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Return the id alphabet shuffled deterministically by *seed*.

        Each iteration advances a linear congruential generator and uses it
        to pick (and remove) one character from the source alphabet, so the
        resulting permutation depends only on the seed. Used by
        _get_file_id to decode Youku's obfuscated file ids.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode an obfuscated file id.

        *fileId* is a '*'-separated list of indices into the seed-shuffled
        alphabet produced by _get_file_ID_mix_string; empty entries are
        skipped.
        """
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Playlist API returns JSON with title, seed, formats and segment keys.
        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            # Map the user-requested format onto Youku's format ids:
            # best -> hd2 (when offered) else flv; worst -> mp4; anything else -> flv.
            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            # one access key per segment, consumed in order below
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            # splice the hex segment number into the decoded file id
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3244
3245
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Announce that the video webpage is being downloaded."""
        message = u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def report_extraction(self, video_id):
        """Announce that information extraction has started."""
        message = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Pull the flv URL, title and thumbnail out of an xnxx video page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the page content
        try:
            page_bytes = compat_urllib_request.urlopen(url).read()
            webpage = page_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        url_match = re.search(self.VIDEO_URL_RE, webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        title_match = re.search(self.VIDEO_TITLE_RE, webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        thumb_match = re.search(self.VIDEO_THUMB_RE, webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3308
3309
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry."""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date."""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader."""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report entry title."""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract the highest-resolution video linked from a Google+ post."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # Bug fix: previously fell through and crashed on mobj.group(1)
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # Bug fix: previously continued and crashed indexing an empty list
            return

        # Sort in resolution; the last entry is the highest resolution
        links = sorted(mobj)
        video_url = links[-1]
        # Only keep the url; the resolution part of the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3433
class NBAIE(InfoExtractor):
    """Information extractor for video pages on nba.com."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The actual media file lives on Turner's CDN under the same path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # Return the first unescaped regex group found in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Bug fix: key was misspelled 'uploader_date'; the documented
            # optional field consumed downstream is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3469
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one page of the justin.tv API response.

        Returns a tuple (number of clips in the raw response, list of
        info dicts for the clips that have a video file URL).

        On download or API errors this returns (0, []) so callers can
        always unpack the result; the previous implementation returned
        None here, which made _real_extract fail with a TypeError when
        unpacking.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            return (0, [])

        response = json.loads(webpage)
        if type(response) != list:
            # On errors the API returns a dict with an 'error' key.
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            # Clips without a media URL are silently skipped; the raw
            # count still includes them for pagination purposes.
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time is ISO-like 'YYYY-MM-DD...'; strip dashes -> YYYYMMDD
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Extract all archive videos of a channel, or a single broadcast."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL: the archive listing is paged.
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A page shorter than the limit means we reached the end.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3556
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Extract video URL, title and optional description from a video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # Raise instead of only reporting: the previous code fell through
            # to m.group() and crashed with AttributeError when m was None.
            raise ExtractorError(u'Unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            raise ExtractorError(u'Cannot find video title')
        title = unescapeHTML(m.group('title'))

        # Description is optional; absent og:description is not an error.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(m.group('desc')) if m else None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3593
class SteamIE(InfoExtractor):
    """Information extractor for Steam store trailer pages."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with whitespace/comments, so it needs re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Extract every trailer listed on a game's video page."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # URLs, titles and thumbnails appear in the same order on the page,
        # so zip pairs them up positionally.
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                # Skip the entry entirely: the previous code reported the error
                # but still appended an info dict with an empty URL.
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
                continue
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
            }
            videos.append(info)
        return videos
3634
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Build the info dict for one recorded Ustream video."""
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The media URL follows a fixed CDN scheme derived from the video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'data-title="(?P<title>.+)"', webpage)
        uploader_match = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"', webpage)
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title_match.group('title'),
            'uploader': uploader_match.group('uploader'),
        }]
3656
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Extract the media URL, title and thumbnail from a WSHH page."""
        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        webpage_src = compat_urllib_request.urlopen(url).read()
        webpage_src = webpage_src.decode('utf-8')

        # The media file URL is embedded verbatim in the page source.
        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""
        mobj = re.search(_src_url, webpage_src)
        if mobj is None:
            # Fail early: the previous code returned an entry with url=None,
            # which only blew up later in the downloader.
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = mobj.group()
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        _title = r"""<title>(.*)</title>"""
        mobj = re.search(_title, webpage_src)
        if mobj is not None:
            title = mobj.group(1)
        else:
            title = 'World Start Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            thumbnail = None
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)

        results = [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
        return results
3712
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com show pages."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Extract a single RBMA Radio show from its embedded JSON metadata."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        # Show metadata lives in an inline `gon.show=...` JSON assignment.
        gon_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not gon_match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(gon_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbps stream variant explicitly.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3747
3748
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first info dict whose 'format' equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        """Extract all available formats and return the one(s) requested."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The site gates content behind an age check cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (optional; only warn if missing)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader (optional; only warn if missing)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path component 4 encodes "<size>_<bitrate>_<id>"; keep size+bitrate.
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUG FIX: the previous code tested the stale regex match object
            # `result` here instead of `format`, so a missing format was never
            # reported and [None] was returned to the downloader.
            if format is None:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            return [format]
3865
3866
3867
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Extract video URL and upload date; the title comes from the URL itself."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUG FIX: the message previously (and wrongly) said
            # "unable to extract video title" on this failure path.
            self._downloader.trouble(u'ERROR: unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3909
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Follow the watch page to the embed page and extract the flv URL."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = match.group('videoid')

        # The watch page only links to the real player (embed) page.
        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        # The flv URL is set in the embed page's flash player setup code.
        webpage = self._download_webpage(embed_page_url, video_id)

        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = source_match.group('source')

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3955
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (one entry per track)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Walk the mix's play/next API until the last track is reached."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as a JSON assignment in the page.
        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # A random session id keeps the play/next API sequence consistent.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        entries = []
        track_index = 0
        while True:
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_index+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            track_index += 1
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
3999
class KeekIE(InfoExtractor):
    """Information extractor for keek.com clips."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Build the info dict for a single keek; media URLs follow a fixed CDN scheme."""
        video_id = re.match(self._VALID_URL, url).group('videoID')
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        uploader_match = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': unescapeHTML(title_match.group('title')),
            'thumbnail': thumbnail,
            'uploader': unescapeHTML(uploader_match.group('uploader')),
        }]
4023
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists.

    A URL either names a single talk (/talks/<name>.html) or a playlist
    (/playlists/<id>/<name>); the latter is expanded into one info dict
    per talk.
    """
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so the default suitable()
        # (which matches without re.VERBOSE) cannot be used.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to single-talk or playlist extraction based on the URL."""
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
            return self._playlist_videos_info(url,name,playlist_id)

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # NOTE(review): [.\s]*? only matches literal dots and whitespace;
        # presumably (?s:.*?) was intended to skip arbitrary attributes —
        # confirm against current page markup.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        info=[]
        # Talk entries and talk-title links appear in the same order on the
        # page, so zip pairs each video with its title/URL positionally.
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            info.append(self._talk_info(talk_url,video_id))
        return info

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<h1><span id="altHeadline" >(?P<title>.*)</span></h1>'
        title=re.search(title_RE, webpage).group('title')
        # talkDetails is an inline JS object; only the numeric id and the
        # mediaSlug (used to build the download URL) are pulled out of it.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
4096
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de, driven by its XML metadata service."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Resolve the video id from the URL path and extract via the XML API."""
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.trouble(u'ERROR: unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: this branch previously read the undefined name `ext`,
            # raising NameError whenever <format_id> was missing.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4152
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # NOTE: specific extractors must precede more general ones (e.g. the
    # Youtube playlist/channel/user IEs before YoutubeIE, and everything
    # before the catch-all GenericIE at the end). Do not reorder casually.
    return [
        YoutubePlaylistIE(),
        YoutubeChannelIE(),
        YoutubeUserIE(),
        YoutubeSearchIE(),
        YoutubeIE(),
        MetacafeIE(),
        DailymotionIE(),
        GoogleSearchIE(),
        PhotobucketIE(),
        YahooIE(),
        YahooSearchIE(),
        DepositFilesIE(),
        FacebookIE(),
        BlipTVUserIE(),
        BlipTVIE(),
        VimeoIE(),
        MyVideoIE(),
        ComedyCentralIE(),
        EscapistIE(),
        CollegeHumorIE(),
        XVideosIE(),
        SoundcloudIE(),
        InfoQIE(),
        MixcloudIE(),
        StanfordOpenClassroomIE(),
        MTVIE(),
        YoukuIE(),
        XNXXIE(),
        YouJizzIE(),
        PornotubeIE(),
        YouPornIE(),
        GooglePlusIE(),
        ArteTvIE(),
        NBAIE(),
        WorldStarHipHopIE(),
        JustinTVIE(),
        FunnyOrDieIE(),
        SteamIE(),
        UstreamIE(),
        RBMARadioIE(),
        EightTracksIE(),
        KeekIE(),
        TEDIE(),
        MySpassIE(),
        GenericIE()
    ]
4204
4205