2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Lazily-initialized state: _real_initialize() runs at most once.
    _ready = False
    _downloader = None
    # Subclasses set this to False for known-broken extractors.
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Run the (possibly expensive) real initialization only once.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derive the IE name from the class name, dropping the "IE" suffix
        # (e.g. YoutubeIE -> Youtube).  Subclasses may override with a constant.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # Re-raise as ExtractorError, preserving the original traceback.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        webpage_bytes = urlh.read()
        # 'replace' keeps extraction going even on pages with invalid UTF-8.
        return webpage_bytes.decode('utf-8', 'replace')
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): this excerpt omits many original lines (string delimiters,
    # `try:` / `if` / `return` statements, dict closings).  Code tokens below are
    # kept as-is; comments document intent only — confirm against full source.

    # Verbose-mode URL pattern; the r'''^ ... $''' delimiters are not visible here.
                     (?:https?://)?                                       # http(s):// (optional)
                     (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                        tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                     (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                     (?:                                                  # the various things that can precede the ID:
                         (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                         |(?:                                             # or the v= param in all its forms
                             (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                             (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                             (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                         )?                                               # optional -> youtube.com/xxxx is OK
                     )?                                                   # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
                     (?(1).+)?                                            # if we found the ID, everything can follow

    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # Maps itag -> container/extension; most entries are not visible in this excerpt.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # Maps itag -> "WxH" display string; entries are not visible in this excerpt.
    _video_dimensions = {

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match the video regex, so defer to the playlist IE.
        if YoutubePlaylistIE.suitable(url): return False
        # _VALID_URL is written in verbose mode, hence re.VERBOSE here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's closed-caption XML into SRT-formatted text."""
        # NOTE(review): the `srt` accumulator initialisation, the float()
        # conversion of `start`, and the final return are not visible here.
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption duration when none is given
            end = start + float(dur)
            # Render start/end as SRT timestamps: HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'

    def _extract_subtitles(self, video_id):
        """Fetch a caption track and return a (warning_message, srt_contents) pair."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Build a lang_code -> track-name mapping from the track list XML.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language selection: explicit --sub-lang wins, then English, then
        # whichever track comes first (intermediate branches not visible here).
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        params = compat_urllib_parse.urlencode({
            'name': srt_lang_list[srt_lang].encode('utf-8'),
        url = 'http://www.youtube.com/api/timedtext?' + params
            srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
            return (u'WARNING: Did not fetch video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print each available itag with its extension and dimensions."""
        print('Available formats:')
        # NOTE(review): the `for x in formats:` line is not visible in this excerpt.
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, optionally log in, and confirm age on youtube.com."""
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        # Set language (best-effort: failure only warns)
        request = compat_urllib_request.Request(self._LANG_URL)
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))

        # No authentication to be performed

        request = compat_urllib_request.Request(self._LOGIN_URL)
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err))

        # Pull the GALX and dsh hidden-form tokens Google's login form requires.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)

        # Hidden fields of Google's ServiceLogin form (several entries and the
        # surrounding dict literal are not visible in this excerpt).
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'PersistentCookie': u'yes',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'signIn': u'Sign in',
                u'service': u'youtube',

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # does not handle unicode values properly.
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present in the response, login failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

        # Confirm age (form dict literal partially missing from this excerpt)
                'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        """Return the YouTube video ID (group 2 of _VALID_URL) for a URL."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Un-escape the JS-escaped URL (\\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several &el= variants until one returns a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (best-effort: only a warning on failure)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
            video_uploader_id = mobj.group(1)
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalise separators then try several date formats
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # itag -> playable URL (signature appended from the 'sig' field)
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one result dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # NOTE(review): this excerpt omits several original lines (`try:` lines,
    # `if mobj is None:` guards, early returns, dict/list closings).  Code
    # tokens below are kept as-is; comments document intent only.

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page and confirm age."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age by POSTing the filter form (dict literal partially missing here)
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate yt-prefixed IDs to the YouTube extractor via the downloader.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        # First try the direct &mediaURL= form; otherwise fall back to flashvars.
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            # Un-escape JSON-style slashes in the media URL.
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Result dict (the surrounding `return [{ ... }]` is partially missing here)
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # NOTE(review): this excerpt omits several original lines (`if mobj is
    # None:` guards, try/else branches, the final return dict opening).
    # Code tokens below are kept as-is; comments document intent only.

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Strip the slug/query portion: keep only the bare video id.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted pages still resolve.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe qualities from best to worst; the first key present wins.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
                video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            # Page shows DD-MM-YYYY; reassemble as YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # Result dict (the surrounding `return [{ ... }]` is partially missing here)
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # NOTE(review): this excerpt omits several original lines (`try:` lines,
    # `if mobj is None:` guards, the return dict opening).  Code tokens below
    # are kept as-is; comments document intent only.

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        # Title and uploader come from the same <title> tag.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # Result dict (the surrounding `return [{ ... }]` is partially missing here)
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # NOTE(review): this excerpt omits several original lines (`try:` lines,
    # `if mobj is None:` guards, the return dict opening).  Code tokens below
    # are kept as-is; comments document intent only.

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            # Recurse once with the canonical /watch/ URL.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # Result dict (the surrounding `return [{ ... }]` is partially missing here)
            'id': video_id.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Downloads the video page, parses the embedded ``{config: ...}`` JSON for
    metadata (title, uploader, thumbnail, upload date), then builds the
    play_redirect URL from the request signature/timestamp plus the best
    available codec/quality combination.

    NOTE(review): this excerpt appears to have lines elided (``try:`` headers,
    ``if mobj is None:`` guards, ``return`` statements); comments below mark
    the suspected gaps — confirm against the full file.
    """

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard (and its `return`) appear elided
        # before this error path
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Normalize: force https, and map the player redirect form back to the
        # canonical /<id> page
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        # NOTE(review): `try:` header appears elided before the download below
        self.report_download_webpage(video_id)
        webpage_bytes = compat_urllib_request.urlopen(request).read()
        webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        # NOTE(review): a `try:` header appears elided around the config parse,
        # pairing with the `unable to extract info section` error below
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        config = json.loads(config)
        self._downloader.trouble(u'ERROR: unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            # Collapse YYYY-MM-DD into the YYYYMMDD form used for upload_date
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                # NOTE(review): an `else:` line appears elided before this append
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first non-empty bucket, preferring hd, then sd, then other
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
        # NOTE(review): a `break` and the loop's `else:` branch appear elided
        # before this no-codec error path
        self._downloader.trouble(u'ERROR: no known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
            %(video_id, sig, timestamp, video_quality, video_codec.upper())

        # NOTE(review): the `return [{` wrapper (and the 'id'/'url' entries)
        # appear elided around this info dict
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Handles both live-stream pages (URLs matching _LIVE_URL) and "+7"
    catch-up pages. Metadata is scraped via grep_webpage(), a helper that
    fetches a page, applies one regex, and maps the listed groups into a
    dict keyed by the names given in matchTuples.

    NOTE(review): this excerpt appears to have lines elided (``try:``
    headers, ``return`` statements, several call arguments and closing
    brackets); comments below mark the suspected gaps — confirm against
    the full file.
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live pages end in index-<n>.html
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        # Download *url* and return its raw body
        request = compat_urllib_request.Request(url)
        # NOTE(review): `try:` header appears elided before the download below
        self.report_download_webpage(url)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            # urlopen raises ValueError for syntactically invalid URLs
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        # NOTE(review): `return webpage` appears elided here

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        # Fetch *url*, apply *regex* with *regexFlags*, and collect the
        # groups named in *matchTuples* (group index, key, error message)
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        # NOTE(review): the `info = {}` initialisation and the
        # `if mobj is None:` guard appear elided before this error path
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
            # NOTE(review): an `else:` line appears elided before this assignment
                info[key] = mobj.group(i)
        # NOTE(review): `return info` appears elided here

    def extractLiveStream(self, url):
        # Language code is the 4th path component from the end on live URLs
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            # NOTE(review): the leading `url,` argument appears elided
            r'src="(.*?/videothek_js.*?\.js)',
            # NOTE(review): the regex-flags argument appears elided
            # NOTE(review): an opening `[` appears elided before this tuple
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            # NOTE(review): the closing `])` appears elided
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            # NOTE(review): the `next_url,` argument appears elided
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
            # NOTE(review): a third pattern fragment and the flags argument
            # appear elided, as does the opening `[`
                (1, 'path', u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url', u'ERROR: could not extract video url: %s' % url)
            # NOTE(review): the closing `])` appears elided
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))
        # NOTE(review): the downloader hand-off for the live stream appears elided

    def extractPlus7Stream(self, url):
        # Language code is the 3rd path component from the end on +7 URLs
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            # NOTE(review): the `url,` argument appears elided
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            # NOTE(review): flags argument and opening `[` appear elided
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            # NOTE(review): the closing `])` appears elided
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            # NOTE(review): the `next_url,` argument appears elided
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            # NOTE(review): flags argument and opening `[` appear elided
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            # NOTE(review): the closing `])` appears elided
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            # NOTE(review): the `next_url,` argument appears elided
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            # NOTE(review): flags argument and opening `[` appear elided
                (1, 'id', u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date', u'ERROR: could not extract video date: %s' % url),
                (4, 'url', u'ERROR: could not extract video url: %s' % url)
            # NOTE(review): the closing `])` appears elided

        # NOTE(review): a `return {` wrapper appears elided around this info dict
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': info.get('date'),
            'title': info.get('title').decode('utf-8'),

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live streams and +7 catch-up pages are scraped differently
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            # NOTE(review): a `return` and the `else:` branch marker appear elided
            info = self.extractPlus7Stream(url)
        # NOTE(review): `return [info]` appears elided here
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    First checks whether the URL is a redirect (URL shortener) by issuing
    a HEAD request and, if so, restarts the extraction chain on the target.
    Otherwise it scrapes the page for common embedded-player patterns
    (JW Player / SWFObject style ``file=...`` parameters).

    NOTE(review): this excerpt appears to have lines elided (``try:``
    headers, ``if mobj is None:`` guards, ``return`` statements, some call
    arguments); comments below mark the suspected gaps — confirm against
    the full file.
    """

    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # Request subclass that issues HEAD so only headers are fetched
            def get_method(self):
                # NOTE(review): `return "HEAD"` appears elided here

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers; a HEAD request carries no body
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       # NOTE(review): a `headers=newheaders,` argument appears elided
                                       origin_req_host=req.get_origin_req_host(),
                                       # NOTE(review): the closing `unverifiable=True)` and the
                                       # `else:` branch marker appear elided before the raise
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # NOTE(review): draining/closing of `fp` appears elided here
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                        # NOTE(review): a `headers=newheaders,` argument appears elided
                                        origin_req_host=req.get_origin_req_host(),
                                        # NOTE(review): the closing `unverifiable=True))` appears elided

        # Build a minimal opener chain that HEADs the URL and follows redirects
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # NOTE(review): an `if url == new_url: return False` guard appears
        # elided here
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        # NOTE(review): `return True` appears elided here

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        # NOTE(review): `try:` header appears elided before the download below
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # NOTE(review): `if mobj is None:` guards appear elided between each of
        # these progressively broader fallback searches
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit: JWPlayer JS loader
        mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        # NOTE(review): `if mobj is None:` guard appears elided before this error path
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        # NOTE(review): `if mobj is None:` guard appears elided before this error path
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_uploader = mobj.group(1)

        # NOTE(review): a `return [{` wrapper (and 'id'/'url'/'upload_date'
        # entries) appear elided around this info dict
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles ``ytsearch[N|all]:<query>`` pseudo-URLs by paging through the
    GData JSON API (50 results per page) and queueing each result's watch
    URL on the downloader.

    NOTE(review): this excerpt appears to have lines elided (prefix
    parsing, ``try:`` headers, loop initialisation, ``return`` statements);
    comments below mark the suspected gaps — confirm against the full file.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    # Hard cap on how many results ytsearchall will fetch
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        # NOTE(review): `if mobj is None:` guard appears elided before this error path
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        # NOTE(review): prefix normalisation and the `if prefix == '':` branch
        # appear elided around here
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        # NOTE(review): a `return` appears elided here
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
        # NOTE(review): the numeric-prefix branch (`else:` / `try:` /
        # `n = int(prefix)` / `if n <= 0:`) appears elided here
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_youtube_results:
            self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # NOTE(review): initialisation (`video_ids = []`, `pagenum = 0`,
        # `limit = n`) appears elided here
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            # start-index is 1-based in the GData API
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            # NOTE(review): `try:` header appears elided before the download below
            data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                self._downloader.trouble(u'[youtube] No video results')
                # NOTE(review): a `return` appears elided here

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never ask for more than the API reports as available
            limit = min(n, api_response['totalItems'])
            # NOTE(review): `pagenum += 1` appears elided here

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles ``gvsearch[N|all]:<query>`` pseudo-URLs by scraping result
    pages for videoplay links and queueing each one on the downloader.

    NOTE(review): this excerpt appears to have lines elided (prefix
    parsing, ``try:`` headers, loop initialisation, ``return`` statements);
    comments below mark the suspected gaps — confirm against the full file.
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    # Present in the HTML while a "next page" link exists
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    # Hard cap on how many results gvsearchall will fetch
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        # NOTE(review): `if mobj is None:` guard appears elided before this error path
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        # NOTE(review): prefix normalisation and the `if prefix == '':` branch
        # appear elided around here
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        # NOTE(review): a `return` appears elided here
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
        # NOTE(review): the numeric-prefix branch (`else:` / `try:` /
        # `n = int(prefix)` / `if n <= 0:`) appears elided here
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_google_results:
            self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # NOTE(review): initialisation (`video_ids = []`, `pagenum = 0`) and
        # the `while True:` loop header appear elided here
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
        request = compat_urllib_request.Request(result_url)
        # NOTE(review): `try:` header appears elided before the download below
        page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in video_ids:
                video_ids.append(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                    # NOTE(review): a `return` appears elided here

        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            # No further pages: flush everything collected so far
            for id in video_ids:
                self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
            # NOTE(review): a `return` appears elided here

        pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Handles ``yvsearch[N|all]:<query>`` pseudo-URLs by scraping result
    pages for watch links and queueing each one on the downloader. Unlike
    the Google variant, duplicates are tracked via an ``already_seen`` set.

    NOTE(review): this excerpt appears to have lines elided (prefix
    parsing, ``try:`` headers, loop initialisation, ``return`` statements);
    comments below mark the suspected gaps — confirm against the full file.
    """

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    # Present in the HTML while a "Next" link exists
    _MORE_PAGES_INDICATOR = r'\s*Next'
    # Hard cap on how many results yvsearchall will fetch
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        # NOTE(review): `if mobj is None:` guard appears elided before this error path
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        # NOTE(review): prefix normalisation and the `if prefix == '':` branch
        # appear elided around here
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        # NOTE(review): a `return` appears elided here
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
        # NOTE(review): the numeric-prefix branch (`else:` / `try:` /
        # `n = int(prefix)` / `if n <= 0:`) appears elided here
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_yahoo_results:
            self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # NOTE(review): `video_ids = []` and `pagenum` initialisation appear
        # elided around here
        already_seen = set()
        # NOTE(review): the `while True:` loop header appears elided here
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
        request = compat_urllib_request.Request(result_url)
        # NOTE(review): `try:` header appears elided before the download below
        page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                    # NOTE(review): a `return` appears elided here

        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            # No further pages: flush everything collected so far
            for id in video_ids:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
            # NOTE(review): a `return` appears elided here

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Matches playlist/course/artist/watch URLs (and bare PL/EC/UU ids),
    pages through the GData playlist feed, orders entries by playlist
    position, applies the playliststart/playlistend window, and queues
    each video URL on the downloader.

    NOTE(review): this excerpt appears to have lines elided; in particular
    the closing quotes of the verbose _VALID_URL raw string (and several of
    its alternation lines), guards, ``try:`` headers, loop headers and
    ``return`` statements. Comments below mark the suspected gaps — confirm
    against the full file.
    """

    _VALID_URL = r"""(?:
                     (?:course|view_play_list|my_playlists|artist|playlist|watch)
                     \? (?:.*?&)*? (?:p|a|list)=
                     ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                     ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    # NOTE(review): the `@classmethod` decorator appears elided here —
    # `suitable` takes `cls` and is called on the class elsewhere
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is required because _VALID_URL is a verbose pattern
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): `if mobj is None:` guard appears elided before this error path
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        # NOTE(review): `videos = []` / `page_num = 1` initialisation and the
        # paging loop header appear elided here; `self._MAX_RESULTS` is a class
        # attribute not visible in this excerpt
        self.report_download_page(playlist_id, page_num)

        url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
        # NOTE(review): `try:` header appears elided before the download below
        page = compat_urllib_request.urlopen(url).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # NOTE(review): `try:` header appears elided before the JSON parse
        response = json.loads(page)
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in API response: ' + compat_str(err))

        if not 'feed' in response or not 'entry' in response['feed']:
            self._downloader.trouble(u'ERROR: Got a malformed response from YouTube API')
            # NOTE(review): a `return` appears elided here
        # Keep (position, watch-url) pairs so entries can be sorted by
        # playlist position below
        videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                    for entry in response['feed']['entry']
                    if 'content' in entry ]

        if len(response['feed']['entry']) < self._MAX_RESULTS:
            # NOTE(review): the loop `break` (and `page_num += 1`) appear elided here

        videos = map(operator.itemgetter(1), sorted(videos))
        # NOTE(review): `total = len(videos)` appears elided here

        # playliststart is 1-based on the command line; convert to 0-based slice
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            videos = videos[playliststart:]
        # NOTE(review): an `else:` line appears elided before this slice
            videos = videos[playliststart:playlistend]

        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        # NOTE(review): an `else:` line appears elided here
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))

        for video in videos:
            self._downloader.download([video])
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    Pages through a channel's /videos listing (oldest first), collects all
    watch ids until the "Next" pagination marker disappears, and queues
    each watch URL on the downloader.

    NOTE(review): this excerpt appears to have lines elided (guards,
    ``try:`` headers, loop initialisation, ``break``); comments below mark
    the suspected gaps — confirm against the full file.
    """

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # Literal "Next »" marker present while more pages remain
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard appears elided before this error path
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Download channel pages
        channel_id = mobj.group(1)
        # NOTE(review): `video_ids = []` / `pagenum = 1` initialisation and the
        # `while True:` loop header appear elided here
        self.report_download_page(channel_id, pagenum)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        request = compat_urllib_request.Request(url)
        # NOTE(review): `try:` header appears elided before the download below
        page = compat_urllib_request.urlopen(request).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        # NOTE(review): `ids_in_page = []` appears elided here
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        if self._MORE_PAGES_INDICATOR not in page:
            # NOTE(review): a `break` appears elided here
        pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Pages through a user's uploads via the GData feed (50 ids per query),
    applies the playliststart/playlistend window, and queues each watch
    URL on the downloader.

    NOTE(review): this excerpt appears to have lines elided (guards,
    ``try:`` headers, loop initialisation, ``break``, ``else:`` branches);
    comments below mark the suspected gaps — confirm against the full file.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # GData caps each uploads query at 50 results
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard appears elided before this error path
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        # NOTE(review): `video_ids = []` / `pagenum = 0` initialisation and the
        # `while True:` loop header appear elided here
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1
        self.report_download_page(username, start_index)

        request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

        # NOTE(review): `try:` header appears elided before the download below
        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        # NOTE(review): `ids_in_page = []` appears elided here
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # again.
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:
            # NOTE(review): a `break` (and the loop's `pagenum += 1`) appear elided here

        all_ids_count = len(video_ids)
        # playliststart is 1-based on the command line; convert to 0-based slice
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        # NOTE(review): an `else:` line appears elided before this slice
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves the numeric users_id from the profile page, then pages
    through the mobile episode-list AJAX endpoint collecting episode
    paths, applies the playliststart/playlistend window, and queues each
    episode URL on the downloader.

    NOTE(review): this excerpt appears to have lines elided (guards,
    ``try:`` headers, loop initialisation, ``break``, ``else:`` branches,
    and the ``_PAGE_SIZE`` class attribute referenced below); comments
    below mark the suspected gaps — confirm against the full file.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard appears elided before this error path
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        username = mobj.group(1)

        # The AJAX endpoint still needs the numeric users_id filled in below
        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        # NOTE(review): `try:` header appears elided before the download below
        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        # Resolve the numeric users_id from the profile page markup
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        # NOTE(review): `video_ids = []` / `pagenum = 1` initialisation and the
        # `while True:` loop header appear elided here
        self.report_download_page(username, pagenum)
        url = page_base + "&page=" + str(pagenum)
        request = compat_urllib_request.Request( url )
        # NOTE(review): `try:` header appears elided before the download below
        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        # NOTE(review): `ids_in_page = []` appears elided here
        for mobj in re.finditer(r'href="/([^"]+)"', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(unescapeHTML(mobj.group(1)))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # again.
        if len(ids_in_page) < self._PAGE_SIZE:
            # NOTE(review): a `break` (and the loop's `pagenum += 1`) appear elided here

        all_ids_count = len(video_ids)
        # playliststart is 1-based on the command line; convert to 0-based slice
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        # NOTE(review): an `else:` line appears elided before this slice
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1991 class DepositFilesIE(InfoExtractor):
1992 """Information extractor for depositfiles.com"""
# Group 1 captures the file id; the inline (?#locale) is a regex comment, so
# '../' optionally skips a two-character locale path segment.
1994 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1996 def report_download_webpage(self, file_id):
1997 """Report webpage download."""
1998 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2000 def report_extraction(self, file_id):
2001 """Report information extraction."""
2002 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2004 def _real_extract(self, url):
2005 file_id = url.split('/')[-1]
2006 # Rebuild url in english locale
2007 url = 'http://depositfiles.com/en/files/' + file_id
2009 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates clicking the 'Free download' button.
2010 free_download_indication = { 'gateway_result' : '1' }
2011 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
2013 self.report_download_webpage(file_id)
2014 webpage = compat_urllib_request.urlopen(request).read()
2015 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2016 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
2019 # Search for the real file URL
2020 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2021 if (mobj is None) or (mobj.group(1) is None):
2022 # Try to figure out reason of the error.
# The site reports restrictions (e.g. download limits) in a <strong> block
# starting with 'Attention'; surface that text to the user when present.
2023 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2024 if (mobj is not None) and (mobj.group(1) is not None):
2025 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2026 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2028 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2031 file_url = mobj.group(1)
2032 file_extension = os.path.splitext(file_url)[1][1:]
2034 # Search for file title
2035 mobj = re.search(r'<b title="(.*?)">', webpage)
2037 self._downloader.trouble(u'ERROR: unable to extract title')
# NOTE(review): .decode('utf-8') on these values assumes py2 byte strings;
# under Python 3 str has no decode method — confirm against the target runtime.
2039 file_title = mobj.group(1).decode('utf-8')
2042 'id': file_id.decode('utf-8'),
2043 'url': file_url.decode('utf-8'),
2045 'upload_date': None,
2046 'title': file_title,
2047 'ext': file_extension.decode('utf-8'),
2051 class FacebookIE(InfoExtractor):
2052 """Information Extractor for Facebook"""
# The named group ID is the numeric video id from video.php/photo.php URLs.
2054 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2055 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2056 _NETRC_MACHINE = 'facebook'
2057 IE_NAME = u'facebook'
2059 def report_login(self):
2060 """Report attempt to log in."""
2061 self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
# Optional login: credentials come from --username/--password or ~/.netrc.
# Failures are warnings, not fatal - extraction proceeds unauthenticated.
2063 def _real_initialize(self):
2064 if self._downloader is None:
2069 downloader_params = self._downloader.params
2071 # Attempt to use provided username and password or .netrc data
2072 if downloader_params.get('username', None) is not None:
2073 useremail = downloader_params['username']
2074 password = downloader_params['password']
2075 elif downloader_params.get('usenetrc', False):
2077 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2078 if info is not None:
2082 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2083 except (IOError, netrc.NetrcParseError) as err:
2084 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2087 if useremail is None:
2096 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2099 login_results = compat_urllib_request.urlopen(request).read()
# A login form still present in the response means authentication failed.
2100 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2101 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2103 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2104 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2107 def _real_extract(self, url):
2108 mobj = re.match(self._VALID_URL, url)
2110 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2112 video_id = mobj.group('ID')
2114 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2115 webpage = self._download_webpage(url, video_id)
# The flashvars JSON is sandwiched between these two literal script fragments.
2117 BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
2118 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2119 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2121 raise ExtractorError(u'Cannot parse data')
2122 data = dict(json.loads(m.group(1)))
# 'params' is URL-quoted JSON nested inside the outer JSON blob.
2123 params_raw = compat_urllib_parse.unquote(data['params'])
2124 params = json.loads(params_raw)
# Prefer the HD stream, fall back to SD, else give up.
2125 video_url = params['hd_src']
2127 video_url = params['sd_src']
2129 raise ExtractorError(u'Cannot find video URL')
2130 video_duration = int(params['video_duration'])
2132 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2134 raise ExtractorError(u'Cannot find title in webpage')
2135 video_title = unescapeHTML(m.group(1))
2139 'title': video_title,
2142 'duration': video_duration,
2143 'thumbnail': params['thumbnail_src'],
2148 class BlipTVIE(InfoExtractor):
2149 """Information extractor for blip.tv"""
2151 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Used to pull the filename extension off the media URL.
2152 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2153 IE_NAME = u'blip.tv'
2155 def report_extraction(self, file_id):
2156 """Report information extraction."""
2157 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2159 def report_direct_download(self, title):
2160 """Report information extraction."""
2161 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2163 def _real_extract(self, url):
2164 mobj = re.match(self._VALID_URL, url)
2166 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Request JSON metadata instead of the HTML page.
2173 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2174 request = compat_urllib_request.Request(json_url)
# The same UA string is echoed into the info dict below so the downloader
# replays it when fetching the media.
2175 request.add_header('User-Agent', 'iTunes/10.6.1')
2176 self.report_extraction(mobj.group(1))
2179 urlh = compat_urllib_request.urlopen(request)
# Some responses are the media itself; handle that without JSON parsing.
2180 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2181 basename = url.split('/')[-1]
2182 title,ext = os.path.splitext(basename)
2183 title = title.decode('UTF-8')
2184 ext = ext.replace('.', '')
2185 self.report_direct_download(title)
2190 'upload_date': None,
2195 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2196 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2197 if info is None: # Regular URL
2199 json_code_bytes = urlh.read()
2200 json_code = json_code_bytes.decode('utf-8')
2201 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2202 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2206 json_data = json.loads(json_code)
2207 if 'Post' in json_data:
2208 data = json_data['Post']
# Normalize blip.tv's '%m-%d-%y %H:%M%p' datestamp to YYYYMMDD.
2212 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2213 video_url = data['media']['url']
2214 umobj = re.match(self._URL_EXT, video_url)
2216 raise ValueError('Can not determine filename extension')
2217 ext = umobj.group(1)
2220 'id': data['item_id'],
2222 'uploader': data['display_name'],
2223 'upload_date': upload_date,
2224 'title': data['title'],
2226 'format': data['media']['mimeType'],
2227 'thumbnail': data['thumbnailUrl'],
2228 'description': data['description'],
2229 'player_url': data['embedUrl'],
2230 'user_agent': 'iTunes/10.6.1',
# Missing/malformed JSON fields are reported, not raised further.
2232 except (ValueError,KeyError) as err:
2233 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2239 class MyVideoIE(InfoExtractor):
2240 """Information Extractor for myvideo.de."""
2242 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2243 IE_NAME = u'myvideo'
2245 def __init__(self, downloader=None):
2246 InfoExtractor.__init__(self, downloader)
2248 def report_extraction(self, video_id):
2249 """Report information extraction."""
2250 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2252 def _real_extract(self,url):
2253 mobj = re.match(self._VALID_URL, url)
2255 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2258 video_id = mobj.group(1)
2261 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2262 webpage = self._download_webpage(webpage_url, video_id)
2264 self.report_extraction(video_id)
2265 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
2268 self._downloader.trouble(u'ERROR: unable to extract media URL')
2270 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2272 mobj = re.search('<title>([^<]+)</title>', webpage)
2274 self._downloader.trouble(u'ERROR: unable to extract title')
2277 video_title = mobj.group(1)
2283 'upload_date': None,
2284 'title': video_title,
2288 class ComedyCentralIE(InfoExtractor):
2289 """Information extractor for The Daily Show and Colbert Report """
2291 # urls can be abbreviations like :thedailyshow or :colbert
2292 # urls for episodes like:
2293 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2294 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2295 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2296 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2297 |(https?://)?(www\.)?
2298 (?P<showname>thedailyshow|colbertnation)\.com/
2299 (full-episodes/(?P<episode>.*)|
2301 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2302 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates in ascending order; the last entry is the best quality.
2305 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2307 _video_extensions = {
2315 _video_dimensions = {
# Overridden because _VALID_URL is a verbose (re.VERBOSE) pattern.
2325 def suitable(cls, url):
2326 """Receives a URL and returns True if suitable for this IE."""
2327 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2329 def report_extraction(self, episode_id):
2330 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2332 def report_config_download(self, episode_id, media_id):
2333 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
2335 def report_index_download(self, episode_id):
2336 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2338 def _print_formats(self, formats):
2339 print('Available formats:')
2341 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2344 def _real_extract(self, url):
2345 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2347 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Shorthand forms (:tds, :colbert, ...) expand to the newest full episode.
2350 if mobj.group('shortname'):
2351 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2352 url = u'http://www.thedailyshow.com/full-episodes/'
2354 url = u'http://www.colbertnation.com/full-episodes/'
2355 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2356 assert mobj is not None
2358 if mobj.group('clip'):
2359 if mobj.group('showname') == 'thedailyshow':
2360 epTitle = mobj.group('tdstitle')
2362 epTitle = mobj.group('cntitle')
2365 dlNewest = not mobj.group('episode')
2367 epTitle = mobj.group('showname')
2369 epTitle = mobj.group('episode')
2371 req = compat_urllib_request.Request(url)
2372 self.report_extraction(epTitle)
2374 htmlHandle = compat_urllib_request.urlopen(req)
2375 html = htmlHandle.read()
2376 webpage = html.decode('utf-8')
2377 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2378 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# Follow the redirect to the concrete episode page and re-parse the URL.
2381 url = htmlHandle.geturl()
2382 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2384 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2386 if mobj.group('episode') == '':
2387 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2389 epTitle = mobj.group('episode')
# The mtvnservices player URI carries the mgid needed for the feeds below.
2391 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2393 if len(mMovieParams) == 0:
2394 # The Colbert Report embeds the information in a without
2395 # a URL prefix; so extract the alternate reference
2396 # and then add the URL prefix manually.
2398 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2399 if len(altMovieParams) == 0:
2400 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2403 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2405 uri = mMovieParams[0][1]
2406 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2407 self.report_index_download(epTitle)
2409 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2410 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2411 self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
# One MRSS <item> per act/part of the episode.
2416 idoc = xml.etree.ElementTree.fromstring(indexXml)
2417 itemEls = idoc.findall('.//item')
2418 for partNum,itemEl in enumerate(itemEls):
2419 mediaId = itemEl.findall('./guid')[0].text
2420 shortMediaId = mediaId.split(':')[-1]
2421 showId = mediaId.split(':')[-2].replace('.com', '')
2422 officialTitle = itemEl.findall('./title')[0].text
2423 officialDate = itemEl.findall('./pubDate')[0].text
2425 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2426 compat_urllib_parse.urlencode({'uri': mediaId}))
2427 configReq = compat_urllib_request.Request(configUrl)
2428 self.report_config_download(epTitle, shortMediaId)
2430 configXml = compat_urllib_request.urlopen(configReq).read()
2431 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2432 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# Collect (bitrate, rtmp-url) pairs for every available rendition.
2435 cdoc = xml.etree.ElementTree.fromstring(configXml)
2437 for rendition in cdoc.findall('.//rendition'):
2438 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2442 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2445 if self._downloader.params.get('listformats', None):
2446 self._print_formats([i[0] for i in turls])
2449 # For now, just pick the highest bitrate
2450 format,rtmp_video_url = turls[-1]
2452 # Get the format arg from the arg stream
2453 req_format = self._downloader.params.get('format', None)
2455 # Select format if we can find one
2458 format, rtmp_video_url = f, v
# Rewrite the rtmp URL to the equivalent plain-HTTP mp4 mirror.
2461 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2463 raise ExtractorError(u'Cannot transform RTMP url')
2464 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2465 video_url = base + m.group('finalid')
2467 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2472 'upload_date': officialDate,
2477 'description': officialTitle,
2479 results.append(info)
2484 class EscapistIE(InfoExtractor):
2485 """Information extractor for The Escapist """
2487 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2488 IE_NAME = u'escapist'
2490 def report_extraction(self, showName):
2491 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2493 def report_config_download(self, showName):
2494 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2496 def _real_extract(self, url):
2497 mobj = re.match(self._VALID_URL, url)
2499 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2501 showName = mobj.group('showname')
2502 videoId = mobj.group('episode')
2504 self.report_extraction(showName)
2506 webPage = compat_urllib_request.urlopen(url)
2507 webPageBytes = webPage.read()
# Honor the charset from the Content-Type header; default to UTF-8.
2508 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2509 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2510 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2511 self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
# Pull description/thumbnail/player from the page's meta tags.
2514 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2515 description = unescapeHTML(descMatch.group(1))
2516 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2517 imgUrl = unescapeHTML(imgMatch.group(1))
2518 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2519 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The player URL embeds a quoted config URL in its query string.
2520 configUrlMatch = re.search('config=(.*)$', playerUrl)
2521 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2523 self.report_config_download(showName)
2525 configJSON = compat_urllib_request.urlopen(configUrl)
2526 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2527 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2528 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2529 self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2532 # Technically, it's JavaScript, not JSON
# Single->double quote swap makes the JS object literal json-parseable.
2533 configJSON = configJSON.replace("'", '"')
2536 config = json.loads(configJSON)
2537 except (ValueError,) as err:
2538 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
2541 playlist = config['playlist']
# Index 1 of the playlist holds the actual episode media URL.
2542 videoUrl = playlist[1]['url']
2547 'uploader': showName,
2548 'upload_date': None,
2551 'thumbnail': imgUrl,
2552 'description': description,
2553 'player_url': playerUrl,
2558 class CollegeHumorIE(InfoExtractor):
2559 """Information extractor for collegehumor.com"""
2562 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2563 IE_NAME = u'collegehumor'
2565 def report_manifest(self, video_id):
2566 """Report information extraction."""
2567 self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2569 def report_extraction(self, video_id):
2570 """Report information extraction."""
2571 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2573 def _real_extract(self, url):
2574 mobj = re.match(self._VALID_URL, url)
2576 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2578 video_id = mobj.group('videoid')
2583 'upload_date': None,
# Step 1: metadata XML gives title/description/thumbnail and the f4m manifest URL.
2586 self.report_extraction(video_id)
2587 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2589 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2590 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2591 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2594 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2596 videoNode = mdoc.findall('./video')[0]
2597 info['description'] = videoNode.findall('./description')[0].text
2598 info['title'] = videoNode.findall('./caption')[0].text
2599 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2600 manifest_url = videoNode.findall('./file')[0].text
2602 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# Step 2: the Adobe HDS (f4m) manifest names the media segment to fetch.
2605 manifest_url += '?hdcore=2.10.3'
2606 self.report_manifest(video_id)
2608 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2609 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2610 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2613 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2615 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2616 node_id = media_node.attrib['url']
2617 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2618 except IndexError as err:
2619 self._downloader.trouble(u'\nERROR: Invalid manifest file')
# Compose the direct segment URL from the manifest host and media node.
2622 url_pr = compat_urllib_parse_urlparse(manifest_url)
2623 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2630 class XVideosIE(InfoExtractor):
2631 """Information extractor for xvideos.com"""
2633 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2634 IE_NAME = u'xvideos'
2636 def report_extraction(self, video_id):
2637 """Report information extraction."""
2638 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2640 def _real_extract(self, url):
2641 mobj = re.match(self._VALID_URL, url)
2643 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2645 video_id = mobj.group(1)
2647 webpage = self._download_webpage(url, video_id)
2649 self.report_extraction(video_id)
# The flv URL is URL-quoted inside the page's flashvars.
2653 mobj = re.search(r'flv_url=(.+?)&', webpage)
2655 self._downloader.trouble(u'ERROR: unable to extract video url')
2657 video_url = compat_urllib_parse.unquote(mobj.group(1))
# Title comes from the <title> tag, minus the trailing '- XVID...' suffix.
2661 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2663 self._downloader.trouble(u'ERROR: unable to extract video title')
2665 video_title = mobj.group(1)
2668 # Extract video thumbnail
# group(0): the whole matched URL is the thumbnail address.
2669 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2671 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2673 video_thumbnail = mobj.group(0)
2679 'upload_date': None,
2680 'title': video_title,
2682 'thumbnail': video_thumbnail,
2683 'description': None,
2689 class SoundcloudIE(InfoExtractor):
2690 """Information extractor for soundcloud.com
2691 To access the media, the uid of the song and a stream token
2692 must be extracted from the page source and the script must make
2693 a request to media.soundcloud.com/crossdomain.xml. Then
2694 the media can be grabbed by requesting from an url composed
2695 of the stream token and uid
# group(1) = uploader slug, group(2) = track slug.
2698 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2699 IE_NAME = u'soundcloud'
2701 def __init__(self, downloader=None):
2702 InfoExtractor.__init__(self, downloader)
2704 def report_resolve(self, video_id):
2705 """Report information extraction."""
2706 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2708 def report_extraction(self, video_id):
2709 """Report information extraction."""
2710 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2712 def _real_extract(self, url):
2713 mobj = re.match(self._VALID_URL, url)
2715 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2718 # extract uploader (which is in the url)
2719 uploader = mobj.group(1)
2720 # extract simple title (uploader + slug of song title)
2721 slug_title = mobj.group(2)
2722 simple_title = uploader + u'-' + slug_title
2724 self.report_resolve('%s/%s' % (uploader, slug_title))
# Step 1: resolve the page URL to track metadata via the public API.
# The client_id is the app key baked into this extractor.
2726 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2727 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2728 request = compat_urllib_request.Request(resolv_url)
2730 info_json_bytes = compat_urllib_request.urlopen(request).read()
2731 info_json = info_json_bytes.decode('utf-8')
2732 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2733 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2736 info = json.loads(info_json)
2737 video_id = info['id']
2738 self.report_extraction('%s/%s' % (uploader, slug_title))
# Step 2: fetch the per-track stream definitions and pick the mp3 stream.
2740 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2741 request = compat_urllib_request.Request(streams_url)
2743 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2744 stream_json = stream_json_bytes.decode('utf-8')
2745 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2746 self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2749 streams = json.loads(stream_json)
2750 mediaURL = streams['http_mp3_128_url']
2755 'uploader': info['user']['username'],
2756 'upload_date': info['created_at'],
2757 'title': info['title'],
2759 'description': info['description'],
2763 class InfoQIE(InfoExtractor):
2764 """Information extractor for infoq.com"""
2765 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2767 def report_extraction(self, video_id):
2768 """Report information extraction."""
2769 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2771 def _real_extract(self, url):
2772 mobj = re.match(self._VALID_URL, url)
2774 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# No separate id in the URL pattern, so the full URL doubles as the id here.
2777 webpage = self._download_webpage(url, video_id=url)
2778 self.report_extraction(url)
# The real media id is base64-encoded in the page's jsclassref attribute.
2781 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2783 self._downloader.trouble(u'ERROR: unable to extract video url')
2785 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2786 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2789 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2791 self._downloader.trouble(u'ERROR: unable to extract video title')
2793 video_title = mobj.group(1)
2795 # Extract description
2796 video_description = u'No description available.'
2797 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2798 if mobj is not None:
2799 video_description = mobj.group(1)
# Derive the final id and extension from the decoded media filename.
2801 video_filename = video_url.split('/')[-1]
2802 video_id, extension = video_filename.split('.')
2808 'upload_date': None,
2809 'title': video_title,
2810 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2812 'description': video_description,
2817 class MixcloudIE(InfoExtractor):
2818 """Information extractor for www.mixcloud.com"""
2820 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2821 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2822 IE_NAME = u'mixcloud'
2824 def __init__(self, downloader=None):
2825 InfoExtractor.__init__(self, downloader)
2827 def report_download_json(self, file_id):
2828 """Report JSON download."""
2829 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2831 def report_extraction(self, file_id):
2832 """Report information extraction."""
2833 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2835 def get_urls(self, jsonData, fmt, bitrate='best'):
2836 """Get urls from 'audio_formats' section in json"""
2839 bitrate_list = jsonData[fmt]
2840 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2841 bitrate = max(bitrate_list) # select highest
2843 url_list = jsonData[fmt][bitrate]
# Some formats map directly to a url list instead of a bitrate dict.
2844 except TypeError: # we have no bitrate info.
2845 url_list = jsonData[fmt]
2848 def check_urls(self, url_list):
2849 """Returns 1st active url from list"""
2850 for url in url_list:
2852 compat_urllib_request.urlopen(url)
# Dead mirror: try the next candidate URL.
2854 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2859 def _print_formats(self, formats):
2860 print('Available formats:')
2861 for fmt in formats.keys():
2862 for b in formats[fmt]:
2864 ext = formats[fmt][b][0]
2865 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2866 except TypeError: # we have no bitrate info
2867 ext = formats[fmt][0]
2868 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2871 def _real_extract(self, url):
2872 mobj = re.match(self._VALID_URL, url)
2874 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2876 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on regex-group str objects fails under
# Python 3 (str has no decode) - consistent with _WORKING = False above;
# confirm intended runtime before relying on this extractor.
2877 uploader = mobj.group(1).decode('utf-8')
2878 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2880 # construct API request
2881 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2882 # retrieve .json file with links to files
2883 request = compat_urllib_request.Request(file_url)
2885 self.report_download_json(file_url)
2886 jsonData = compat_urllib_request.urlopen(request).read()
2887 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2888 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2892 json_data = json.loads(jsonData)
2893 player_url = json_data['player_swf_url']
2894 formats = dict(json_data['audio_formats'])
2896 req_format = self._downloader.params.get('format', None)
2899 if self._downloader.params.get('listformats', None):
2900 self._print_formats(formats)
# No explicit format requested: take the first format with a live URL.
2903 if req_format is None or req_format == 'best':
2904 for format_param in formats.keys():
2905 url_list = self.get_urls(formats, format_param)
2907 file_url = self.check_urls(url_list)
2908 if file_url is not None:
2911 if req_format not in formats:
2912 self._downloader.trouble(u'ERROR: format is not available')
2915 url_list = self.get_urls(formats, req_format)
2916 file_url = self.check_urls(url_list)
2917 format_param = req_format
2920 'id': file_id.decode('utf-8'),
2921 'url': file_url.decode('utf-8'),
2922 'uploader': uploader.decode('utf-8'),
2923 'upload_date': None,
2924 'title': json_data['name'],
2925 'ext': file_url.split('.')[-1].decode('utf-8'),
2926 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2927 'thumbnail': json_data['thumbnail_url'],
2928 'description': json_data['description'],
2929 'player_url': player_url.decode('utf-8'),
2932 class StanfordOpenClassroomIE(InfoExtractor):
2933 """Information extractor for Stanford's Open ClassRoom"""
# Named groups: 'course' and 'video' select a single lecture, 'course' alone
# a course page; with neither, the URL is the site root.
2935 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2936 IE_NAME = u'stanfordoc'
2938 def report_download_webpage(self, objid):
2939 """Report information extraction."""
2940 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2942 def report_extraction(self, video_id):
2943 """Report information extraction."""
2944 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2946 def _real_extract(self, url):
2947 mobj = re.match(self._VALID_URL, url)
2949 raise ExtractorError(u'Invalid URL: %s' % url)
2951 if mobj.group('course') and mobj.group('video'): # A specific video
2952 course = mobj.group('course')
2953 video = mobj.group('video')
2955 'id': course + '_' + video,
2957 'upload_date': None,
2960 self.report_extraction(info['id'])
2961 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2962 xmlUrl = baseUrl + video + '.xml'
2964 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2965 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2966 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2968 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2970 info['title'] = mdoc.findall('./title')[0].text
2971 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2973 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2975 info['ext'] = info['url'].rpartition('.')[2]
2977 elif mobj.group('course'): # A course page
2978 course = mobj.group('course')
2983 'upload_date': None,
2986 coursepage = self._download_webpage(url, info['id'],
2987 note='Downloading course info page',
2988 errnote='Unable to download course info page')
2990 m = re.search('<h1>([^<]+)</h1>', coursepage)
2992 info['title'] = unescapeHTML(m.group(1))
2994 info['title'] = info['id']
2996 m = re.search('<description>([^<]+)</description>', coursepage)
2998 info['description'] = unescapeHTML(m.group(1))
3000 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3003 'type': 'reference',
3004 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3008 for entry in info['list']:
3009 assert entry['type'] == 'reference'
3010 results += self.extract(entry['url'])
3014 'id': 'Stanford OpenClassroom',
3017 'upload_date': None,
3020 self.report_download_webpage(info['id'])
3021 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3023 rootpage = compat_urllib_request.urlopen(rootURL).read()
3024 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3025 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3028 info['title'] = info['id']
3030 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3033 'type': 'reference',
3034 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3039 for entry in info['list']:
3040 assert entry['type'] == 'reference'
3041 results += self.extract(entry['url'])
# Extractor for mtv.com video pages: scrapes <meta> tags for song/performer/uri,
# then fetches a mediaGen XML playlist and picks the last (highest-quality) rendition.
# NOTE(review): numbered listing with missing lines (the `if mobj is None:` /
# `return` lines are absent); restore the full source before editing code.
3044 class MTVIE(InfoExtractor):
3045 """Information extractor for MTV.com"""
3047 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3050 def report_extraction(self, video_id):
3051 """Report information extraction."""
3052 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3054 def _real_extract(self, url):
3055 mobj = re.match(self._VALID_URL, url)
3057 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Scheme is optional in _VALID_URL, so normalize to http:// before downloading.
3059 if not mobj.group('proto'):
3060 url = 'http://' + url
3061 video_id = mobj.group('videoid')
3063 webpage = self._download_webpage(url, video_id)
3065 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3067 self._downloader.trouble(u'ERROR: unable to extract song name')
# NOTE(review): .decode('iso-8859-1') on a match from a str webpage is
# Python-2-only behavior -- would raise AttributeError on Python 3; confirm target.
3069 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3070 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3072 self._downloader.trouble(u'ERROR: unable to extract performer')
3074 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3075 video_title = performer + ' - ' + song_name
3077 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3079 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3081 mtvn_uri = mobj.group(1)
3083 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3085 self._downloader.trouble(u'ERROR: unable to extract content id')
3087 content_id = mobj.group(1)
# mediaGen endpoint returns an XML document listing available renditions.
3089 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3090 self.report_extraction(video_id)
3091 request = compat_urllib_request.Request(videogen_url)
3093 metadataXml = compat_urllib_request.urlopen(request).read()
3094 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3095 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3098 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3099 renditions = mdoc.findall('.//rendition')
3101 # For now, always pick the highest quality.
3102 rendition = renditions[-1]
# Format string is built as "<ext>-<width>x<height>_<bitrate>" from rendition attrs.
3105 _,_,ext = rendition.attrib['type'].partition('/')
3106 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3107 video_url = rendition.find('./src').text
3109 self._downloader.trouble('Invalid rendition field.')
3115 'uploader': performer,
3116 'upload_date': None,
3117 'title': video_title,
# Extractor for v.youku.com. Downloads the getPlayList JSON, derives the real
# file id from a seeded pseudo-random character mix, then yields one info dict
# per video segment (multi-part downloads).
# NOTE(review): numbered listing with gaps (e.g. the `def _gen_sid` header and
# several `return`s are missing); recover the full source before editing code.
3125 class YoukuIE(InfoExtractor):
3126 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3128 def report_download_webpage(self, file_id):
3129 """Report webpage download."""
3130 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3132 def report_extraction(self, file_id):
3133 """Report information extraction."""
3134 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Session id: millisecond timestamp plus two bounded random integers.
3137 nowTime = int(time.time() * 1000)
3138 random1 = random.randint(1000,1998)
3139 random2 = random.randint(1000,9999)
3141 return "%d%d%d" %(nowTime,random1,random2)
# Deterministically shuffles an alphabet using Youku's seeded LCG
# (seed = (seed*211 + 30031) % 65536); the mixed string maps digit indices
# in the obfuscated fileId back to real characters.
3143 def _get_file_ID_mix_string(self, seed):
3145 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3147 for i in range(len(source)):
3148 seed = (seed * 211 + 30031 ) % 65536
3149 index = math.floor(seed / 65536 * len(source) )
3150 mixed.append(source[int(index)])
3151 source.remove(source[int(index)])
3152 #return ''.join(mixed)
# Decodes "12*34*..." style fileId: each '*'-separated number indexes the
# mixed alphabet produced above.
3155 def _get_file_id(self, fileId, seed):
3156 mixed = self._get_file_ID_mix_string(seed)
3157 ids = fileId.split('*')
3161 realId.append(mixed[int(ch)])
3162 return ''.join(realId)
3164 def _real_extract(self, url):
3165 mobj = re.match(self._VALID_URL, url)
3167 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3169 video_id = mobj.group('ID')
3171 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3173 request = compat_urllib_request.Request(info_url, None, std_headers)
3175 self.report_download_webpage(video_id)
3176 jsondata = compat_urllib_request.urlopen(request).read()
3177 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3178 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3181 self.report_extraction(video_id)
3183 jsonstr = jsondata.decode('utf-8')
3184 config = json.loads(jsonstr)
3186 video_title = config['data'][0]['title']
3187 seed = config['data'][0]['seed']
# Map the user-requested format onto Youku's stream names ('hd2' etc.).
3189 format = self._downloader.params.get('format', None)
3190 supported_format = list(config['data'][0]['streamfileids'].keys())
3192 if format is None or format == 'best':
3193 if 'hd2' in supported_format:
3198 elif format == 'worst':
3206 fileid = config['data'][0]['streamfileids'][format]
3207 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3208 except (UnicodeDecodeError, ValueError, KeyError):
3209 self._downloader.trouble(u'ERROR: unable to extract info section')
3213 sid = self._gen_sid()
3214 fileid = self._get_file_id(fileid, seed)
3216 #column 8,9 of fileid represent the segment number
3217 #fileid[7:9] should be changed
# One download URL (and info dict) per segment; key 'k' authenticates each part.
3218 for index, key in enumerate(keys):
3220 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3221 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3224 'id': '%s_part%02d' % (video_id, index),
3225 'url': download_url,
3227 'upload_date': None,
3228 'title': video_title,
3231 files_info.append(info)
# Extractor for video.xnxx.com: pulls flv_url, title and thumbnail out of the
# watch page with three precompiled-pattern constants.
# NOTE(review): numbered listing with gaps (the `if result is None:` guards and
# the info-dict opener are missing); restore full source before editing code.
3236 class XNXXIE(InfoExtractor):
3237 """Information extractor for xnxx.com"""
3239 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Regex constants: flash video URL, page title (minus the " - XNXX.COM" suffix),
# and the large thumbnail URL embedded in the player parameters.
3241 VIDEO_URL_RE = r'flv_url=(.*?)&'
3242 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3243 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3245 def report_webpage(self, video_id):
3246 """Report information extraction"""
3247 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3249 def report_extraction(self, video_id):
3250 """Report information extraction"""
3251 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3253 def _real_extract(self, url):
3254 mobj = re.match(self._VALID_URL, url)
3256 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3258 video_id = mobj.group(1)
3260 self.report_webpage(video_id)
3262 # Get webpage content
3264 webpage_bytes = compat_urllib_request.urlopen(url).read()
3265 webpage = webpage_bytes.decode('utf-8')
3266 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3267 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3270 result = re.search(self.VIDEO_URL_RE, webpage)
3272 self._downloader.trouble(u'ERROR: unable to extract video url')
# The flv URL is percent-encoded inside the player parameters.
3274 video_url = compat_urllib_parse.unquote(result.group(1))
3276 result = re.search(self.VIDEO_TITLE_RE, webpage)
3278 self._downloader.trouble(u'ERROR: unable to extract video title')
3280 video_title = result.group(1)
3282 result = re.search(self.VIDEO_THUMB_RE, webpage)
3284 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3286 video_thumbnail = result.group(1)
3292 'upload_date': None,
3293 'title': video_title,
3295 'thumbnail': video_thumbnail,
3296 'description': None,
# Extractor for Google+ video posts: scrapes the post page for date/uploader/
# title, follows the photo page it references, then collects all resolution
# variants and keeps the highest one.
# NOTE(review): numbered listing with gaps (guard `if mobj:` lines, `try:`
# headers and the final info-dict opener are missing); recover from VCS first.
3300 class GooglePlusIE(InfoExtractor):
3301 """Information extractor for plus.google.com."""
3303 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3304 IE_NAME = u'plus.google'
3306 def __init__(self, downloader=None):
3307 InfoExtractor.__init__(self, downloader)
# Status reporting helpers (one per extraction stage).
3309 def report_extract_entry(self, url):
3310 """Report downloading extry"""
3311 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3313 def report_date(self, upload_date):
3314 """Report downloading extry"""
3315 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3317 def report_uploader(self, uploader):
3318 """Report downloading extry"""
3319 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3321 def report_title(self, video_title):
3322 """Report downloading extry"""
3323 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3325 def report_extract_vid_page(self, video_page):
3326 """Report information extraction."""
3327 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3329 def _real_extract(self, url):
3330 # Extract id from URL
3331 mobj = re.match(self._VALID_URL, url)
3333 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3336 post_url = mobj.group(0)
3337 video_id = mobj.group(1)
3339 video_extension = 'flv'
3341 # Step 1, Retrieve post webpage to extract further information
3342 self.report_extract_entry(post_url)
3343 request = compat_urllib_request.Request(post_url)
3345 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3346 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3347 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3350 # Extract update date
3352 pattern = 'title="Timestamp">(.*?)</a>'
3353 mobj = re.search(pattern, webpage)
3355 upload_date = mobj.group(1)
3356 # Convert timestring to a format suitable for filename
# Page shows ISO-style dates; normalized to YYYYMMDD for the output template.
3357 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3358 upload_date = upload_date.strftime('%Y%m%d')
3359 self.report_date(upload_date)
3363 pattern = r'rel\="author".*?>(.*?)</a>'
3364 mobj = re.search(pattern, webpage)
3366 uploader = mobj.group(1)
3367 self.report_uploader(uploader)
3370 # Get the first line for title
3372 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3373 mobj = re.search(pattern, webpage)
3375 video_title = mobj.group(1)
3376 self.report_title(video_title)
3378 # Step 2, Stimulate clicking the image box to launch video
3379 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3380 mobj = re.search(pattern, webpage)
3382 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3384 video_page = mobj.group(1)
3385 request = compat_urllib_request.Request(video_page)
3387 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3388 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3389 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3391 self.report_extract_vid_page(video_page)
3394 # Extract video links on video page
3395 """Extract video links of all sizes"""
3396 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3397 mobj = re.findall(pattern, webpage)
3399 self._downloader.trouble(u'ERROR: unable to extract video links')
3401 # Sort in resolution
# findall yields (resolution, url) tuples; sorting then taking [-1] keeps the
# highest resolution, and [-1] on the tuple keeps only the URL.
3402 links = sorted(mobj)
3404 # Choose the lowest of the sort, i.e. highest resolution
3405 video_url = links[-1]
3406 # Only get the url. The resolution part in the tuple has no use anymore
3407 video_url = video_url[-1]
3408 # Treat escaped \u0026 style hex
# Py2 str has .decode; Py3 str does not, hence the AttributeError fallback.
3410 video_url = video_url.decode("unicode_escape")
3411 except AttributeError: # Python 3
3412 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3418 'uploader': uploader,
3419 'upload_date': upload_date,
3420 'title': video_title,
3421 'ext': video_extension,
# Extractor for nba.com video pages: the media URL is built directly from the
# path id; title/date/description are scraped via the _findProp helper.
# NOTE(review): numbered listing with gaps (the `if m:` branch of _findProp and
# most of the final info dict are missing); recover from VCS before editing.
3424 class NBAIE(InfoExtractor):
3425 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3428 def _real_extract(self, url):
3429 mobj = re.match(self._VALID_URL, url)
3431 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3434 video_id = mobj.group(1)
3435 if video_id.endswith('/index.html'):
3436 video_id = video_id[:-len('/index.html')]
3438 webpage = self._download_webpage(url, video_id)
# CDN URL is derived from the page path; no player round-trip needed.
3440 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Closure helper: first regex group from the page, unescaped, else `default`.
3441 def _findProp(rexp, default=None):
3442 m = re.search(rexp, webpage)
3444 return unescapeHTML(m.group(1))
3448 shortened_video_id = video_id.rpartition('/')[2]
3449 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3451 'id': shortened_video_id,
# NOTE(review): key is 'uploader_date' here -- looks like a typo for
# 'upload_date' (the field name used elsewhere in this file); confirm upstream.
3455 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3456 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# Extractor for justin.tv / twitch.tv. A channel URL enumerates the archive API
# in pages of _JUSTIN_PAGE_LIMIT clips; a /b/ URL fetches a single broadcast.
# NOTE(review): numbered listing with gaps (several guards, the `paged` setup and
# the final `return` are missing); recover the full source before editing code.
3460 class JustinTVIE(InfoExtractor):
3461 """Information extractor for justin.tv and twitch.tv"""
3462 # TODO: One broadcast may be split into multiple videos. The key
3463 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3464 # starts at 1 and increases. Can we treat all parts as one video?
3466 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3467 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3468 _JUSTIN_PAGE_LIMIT = 100
3469 IE_NAME = u'justin.tv'
3471 def report_extraction(self, file_id):
3472 """Report information extraction."""
3473 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3475 def report_download_page(self, channel, offset):
3476 """Report attempt to download a single page of videos."""
3477 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3478 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3480 # Return count of items, list of *valid* items
3481 def _parse_page(self, url):
3483 urlh = compat_urllib_request.urlopen(url)
3484 webpage_bytes = urlh.read()
3485 webpage = webpage_bytes.decode('utf-8', 'ignore')
3486 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3487 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
3490 response = json.loads(webpage)
# API errors come back as a dict instead of the expected list of clips.
3491 if type(response) != list:
3492 error_text = response.get('error', 'unknown error')
3493 self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
3496 for clip in response:
3497 video_url = clip['video_file_url']
3499 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-dated; strip the dashes to get YYYYMMDD.
3500 video_date = re.sub('-', '', clip['start_time'][:10])
3501 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3502 video_id = clip['id']
3503 video_title = clip.get('title', video_id)
3507 'title': video_title,
3508 'uploader': clip.get('channel_name', video_uploader_id),
3509 'uploader_id': video_uploader_id,
3510 'upload_date': video_date,
3511 'ext': video_extension,
3513 return (len(response), info)
3515 def _real_extract(self, url):
3516 mobj = re.match(self._VALID_URL, url)
3518 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3521 api = 'http://api.justin.tv'
# lastindex tells channel (group 1 only) apart from single broadcast (group 2).
3522 video_id = mobj.group(mobj.lastindex)
3524 if mobj.lastindex == 1:
3526 api += '/channel/archives/%s.json'
3528 api += '/broadcast/by_archive/%s.json'
3529 api = api % (video_id,)
3531 self.report_extraction(video_id)
3535 limit = self._JUSTIN_PAGE_LIMIT
# Paginate until a short page (fewer than `limit` items) signals the end.
3538 self.report_download_page(video_id, offset)
3539 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3540 page_count, page_info = self._parse_page(page_url)
3541 info.extend(page_info)
3542 if not paged or page_count != limit:
# Extractor for funnyordie.com: video URL from the second <source> tag, title
# from the player heading, optional description from the og:description meta.
# NOTE(review): numbered listing with gaps (guards and the final info dict are
# missing); recover the full source from VCS before editing code.
3547 class FunnyOrDieIE(InfoExtractor):
3548 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3550 def _real_extract(self, url):
3551 mobj = re.match(self._VALID_URL, url)
3553 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3556 video_id = mobj.group('id')
3557 webpage = self._download_webpage(url, video_id)
# DOTALL lets the pattern span the multi-line <video> element.
3559 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3561 self._downloader.trouble(u'ERROR: unable to find video information')
3562 video_url = unescapeHTML(m.group('url'))
3564 m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3566 self._downloader.trouble(u'Cannot find video title')
3567 title = unescapeHTML(m.group('title'))
3569 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3571 desc = unescapeHTML(m.group('desc'))
3580 'description': desc,
# Extractor for tweetreel.com: reads status id, tweet text (as description),
# uploader and unix timestamp out of the page, then builds the .mov URL from
# the status id.
# NOTE(review): numbered listing with gaps (guards and the info-dict opener are
# missing); recover the full source from VCS before editing code.
3584 class TweetReelIE(InfoExtractor):
3585 _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
3587 def _real_extract(self, url):
3588 mobj = re.match(self._VALID_URL, url)
3590 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3593 video_id = mobj.group('id')
3594 webpage = self._download_webpage(url, video_id)
3596 m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
3598 self._downloader.trouble(u'ERROR: Cannot find status ID')
3599 status_id = m.group(1)
3601 m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
3603 self._downloader.trouble(u'WARNING: Cannot find description')
# Strip embedded <a> links before unescaping the tweet text.
3604 desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
3606 m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
3608 self._downloader.trouble(u'ERROR: Cannot find uploader')
3609 uploader = unescapeHTML(m.group('uploader'))
3610 uploader_id = unescapeHTML(m.group('uploader_id'))
3612 m = re.search(r'<span unixtime="([0-9]+)"', webpage)
3614 self._downloader.trouble(u'ERROR: Cannot find upload date')
# Unix timestamp -> local YYYYMMDD (fromtimestamp uses local time, not UTC).
3615 upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')
3618 video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
3625 'description': desc,
3626 'uploader': uploader,
3627 'uploader_id': uploader_id,
3628 'internal_id': status_id,
3629 'upload_date': upload_date
# Extractor for store.steampowered.com trailer pages: zips together the movie
# JS blobs, the <span class="title"> names and the thumbnail <img> tags, and
# emits one entry per trailer on the game's /video/ page.
# NOTE(review): numbered listing with gaps (the gameID part of _VALID_URL and
# the per-video info dict are missing); recover from VCS before editing code.
3634 class SteamIE(InfoExtractor):
3635 _VALID_URL = r"""http://store.steampowered.com/
3636 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3637 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# _VALID_URL is a verbose regex, so the base suitable() (plain re.match) is
# overridden to pass re.VERBOSE.
3641 def suitable(cls, url):
3642 """Receives a URL and returns True if suitable for this IE."""
3643 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3645 def _real_extract(self, url):
3646 m = re.match(self._VALID_URL, url, re.VERBOSE)
3647 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3648 gameID = m.group('gameID')
3649 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3650 webpage = self._download_webpage(videourl, gameID)
3651 mweb = re.finditer(urlRE, webpage)
3652 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3653 titles = re.finditer(namesRE, webpage)
3654 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3655 thumbs = re.finditer(thumbsRE, webpage)
# zip() pairs the three scans positionally -- assumes they appear in the same
# order on the page.
3657 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3658 video_id = vid.group('videoID')
3659 title = vtitle.group('videoName')
3660 video_url = vid.group('videoURL')
3661 video_thumb = thumb.group('thumbnail')
3663 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3668 'title': unescapeHTML(title),
3669 'thumbnail': video_thumb
# Extractor for ustream.tv recorded videos: the media URL is derived directly
# from the numeric recording id; title/uploader are scraped from data-attrs.
# NOTE(review): numbered listing with gaps (the info-dict opener is missing);
# recover the full source from VCS before editing code.
3674 class UstreamIE(InfoExtractor):
3675 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3676 IE_NAME = u'ustream'
3678 def _real_extract(self, url):
3679 m = re.match(self._VALID_URL, url)
3680 video_id = m.group('videoID')
# CDN URL is built straight from the id -- no player API round-trip.
3681 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3682 webpage = self._download_webpage(url, video_id)
3683 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3684 title = m.group('title')
# NOTE(review): no `m is None` guard visible for either search -- a miss would
# raise AttributeError; may be handled by lines absent from this listing.
3685 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3686 uploader = m.group('uploader')
3692 'uploader': uploader
# Extractor for rbmaradio.com shows: parses the gon.show JSON blob embedded in
# a <script> tag and appends a bitrate parameter to the Akamai stream URL.
# NOTE(review): numbered listing with gaps (the `try:` header and info-dict
# opener are missing); recover the full source from VCS before editing code.
3696 class RBMARadioIE(InfoExtractor):
3697 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3699 def _real_extract(self, url):
3700 m = re.match(self._VALID_URL, url)
3701 video_id = m.group('videoID')
3703 webpage = self._download_webpage(url, video_id)
3704 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3706 raise ExtractorError(u'Cannot find metadata')
3707 json_data = m.group(1)
3710 data = json.loads(json_data)
3711 except ValueError as e:
3712 raise ExtractorError(u'Invalid JSON: ' + str(e))
# Request the 256 kbit/s variant of the Akamai stream.
3714 video_url = data['akamai_url'] + '&cbr=256'
3715 url_parts = compat_urllib_parse_urlparse(video_url)
3716 video_ext = url_parts.path.rpartition('.')[2]
# Optional metadata uses .get() chains so missing keys degrade to None.
3721 'title': data['title'],
3722 'description': data.get('teaser_text'),
3723 'location': data.get('country_of_origin'),
3724 'uploader': data.get('host', {}).get('name'),
3725 'uploader_id': data.get('host', {}).get('slug'),
3726 'thumbnail': data.get('image', {}).get('large_url_2x'),
3727 'duration': data.get('duration'),
# Extractor for youporn.com: scrapes title/date/uploader, then parses the
# downloadList <ul> to build one format entry per link, honoring --list-formats
# and the requested format ('best', 'worst', 'all', or a specific one).
# NOTE(review): numbered listing with gaps (loop headers, several `return`s and
# the per-format info-dict opener are missing); recover from VCS before editing.
3733 class YouPornIE(InfoExtractor):
3734 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3736 def _print_formats(self, formats):
3737 """Print all available formats"""
3738 print(u'Available formats:')
3739 print(u'ext\t\tformat')
3740 print(u'---------------------------------')
3741 for format in formats:
3742 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Select the single entry whose 'format' field matches req_format.
3744 def _specific(self, req_format, formats):
3746 if(x["format"]==req_format):
3750 def _real_extract(self, url):
3751 mobj = re.match(self._VALID_URL, url)
3753 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3756 video_id = mobj.group('videoid')
# Age gate is bypassed with a cookie rather than a form post.
3758 req = compat_urllib_request.Request(url)
3759 req.add_header('Cookie', 'age_verified=1')
3760 webpage = self._download_webpage(req, video_id)
3762 # Get the video title
3763 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3765 raise ExtractorError(u'Unable to extract video title')
3766 video_title = result.group('title').strip()
3768 # Get the video date
3769 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3771 self._downloader.to_stderr(u'WARNING: unable to extract video date')
3774 upload_date = result.group('date').strip()
3776 # Get the video uploader
3777 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3779 self._downloader.to_stderr(u'WARNING: unable to extract uploader')
3780 video_uploader = None
3782 video_uploader = result.group('uploader').strip()
3783 video_uploader = clean_html( video_uploader )
3785 # Get all of the formats available
3786 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3787 result = re.search(DOWNLOAD_LIST_RE, webpage)
3789 raise ExtractorError(u'Unable to extract download list')
3790 download_list_html = result.group('download_list').strip()
3792 # Get all of the links from the page
3793 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3794 links = re.findall(LINK_RE, download_list_html)
3795 if(len(links) == 0):
3796 raise ExtractorError(u'ERROR: no known formats available for video')
3798 self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3803 # A link looks like this:
3804 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3805 # A path looks like this:
3806 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3807 video_url = unescapeHTML( link )
3808 path = compat_urllib_parse_urlparse( video_url ).path
3809 extension = os.path.splitext( path )[1][1:]
# Path segment 4 looks like "480p_370k_8004515": first two pieces are
# resolution and bitrate (presumably unpacked to size/bitrate on missing lines).
3810 format = path.split('/')[4].split('_')[:2]
3813 format = "-".join( format )
3814 title = u'%s-%s-%s' % (video_title, size, bitrate)
3819 'uploader': video_uploader,
3820 'upload_date': upload_date,
3825 'description': None,
3829 if self._downloader.params.get('listformats', None):
3830 self._print_formats(formats)
3833 req_format = self._downloader.params.get('format', None)
3834 self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
# Format selection: formats are evidently ordered best-first ([-1] is 'worst').
3836 if req_format is None or req_format == 'best':
3838 elif req_format == 'worst':
3839 return [formats[-1]]
3840 elif req_format in ('-1', 'all'):
3843 format = self._specific( req_format, formats )
3845 self._downloader.trouble(u'ERROR: requested format not available')
# Extractor for pornotube.com: flv URL and upload date scraped from the watch
# page; the title comes from the URL itself (named group in _VALID_URL).
# NOTE(review): numbered listing with gaps (guards and part of the info dict
# are missing); recover the full source from VCS before editing code.
3852 class PornotubeIE(InfoExtractor):
3853 """Information extractor for pornotube.com."""
3854 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3855 def _real_extract(self, url):
3856 mobj = re.match(self._VALID_URL, url)
3858 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3861 video_id = mobj.group('videoid')
3862 video_title = mobj.group('title')
3864 # Get webpage content
3865 webpage = self._download_webpage(url, video_id)
3868 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3869 result = re.search(VIDEO_URL_RE, webpage)
3871 self._downloader.trouble(u'ERROR: unable to extract video url')
3873 video_url = compat_urllib_parse.unquote(result.group('url'))
3875 #Get the uploaded date
3876 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3877 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): message says "title" but this guard is for the upload date --
# likely a copy/paste slip; confirm against the full source.
3879 self._downloader.trouble(u'ERROR: unable to extract video title')
3881 upload_date = result.group('date')
3883 info = {'id': video_id,
3886 'upload_date': upload_date,
3887 'title': video_title,
# Extractor for youjizz.com: reads the title from the watch page, follows the
# embed page, and pulls the real media URL from the flash player setup call.
# NOTE(review): numbered listing with gaps (guards and part of the info dict
# are missing); recover the full source from VCS before editing code.
3893 class YouJizzIE(InfoExtractor):
3894 """Information extractor for youjizz.com."""
3895 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3897 def _real_extract(self, url):
3898 mobj = re.match(self._VALID_URL, url)
3900 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3903 video_id = mobj.group('videoid')
3905 # Get webpage content
3906 webpage = self._download_webpage(url, video_id)
3908 # Get the video title
3909 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3911 raise ExtractorError(u'ERROR: unable to extract video title')
3912 video_title = result.group('title').strip()
3914 # Get the embed page
3915 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3917 raise ExtractorError(u'ERROR: unable to extract embed page')
# The embed page id replaces the slug id from the watch URL.
3919 embed_page_url = result.group(0).strip()
3920 video_id = result.group('videoid')
3922 webpage = self._download_webpage(embed_page_url, video_id)
# Media URL is the encodeURIComponent argument in the player's addVariable call.
3925 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3927 raise ExtractorError(u'ERROR: unable to extract video url')
3928 video_url = result.group('source')
3930 info = {'id': video_id,
3932 'title': video_title,
3935 'player_url': embed_page_url}
# Extractor for 8tracks.com mixes: reads the PAGE.mix JSON from the playlist
# page, then walks the play/next API with a random session id, collecting one
# track per iteration until at_last_track is set.
# NOTE(review): numbered listing with gaps (mix_id assignment, the track-info
# opener and the final return are missing); recover from VCS before editing.
3939 class EightTracksIE(InfoExtractor):
3941 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3943 def _real_extract(self, url):
3944 mobj = re.match(self._VALID_URL, url)
3946 raise ExtractorError(u'Invalid URL: %s' % url)
3947 playlist_id = mobj.group('id')
3949 webpage = self._download_webpage(url, playlist_id)
3951 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3953 raise ExtractorError(u'Cannot find trax information')
3954 json_like = m.group(1)
3955 data = json.loads(json_like)
# Client-generated session id for the play API (random, not security-sensitive).
3957 session = str(random.randint(0, 1000000000))
3959 track_count = data['tracks_count']
3960 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3961 next_url = first_url
# Unbounded count(); the loop exits when the API reports the last track.
3963 for i in itertools.count():
3964 api_json = self._download_webpage(next_url, playlist_id,
3965 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3966 errnote=u'Failed to download song information')
3967 api_data = json.loads(api_json)
3968 track_data = api_data[u'set']['track']
3970 'id': track_data['id'],
3971 'url': track_data['track_file_stream_url'],
3972 'title': track_data['performer'] + u' - ' + track_data['name'],
3973 'raw_title': track_data['name'],
3974 'uploader_id': data['user']['login'],
3978 if api_data['set']['at_last_track']:
# Each /next call carries the previous track id to advance the server cursor.
3980 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Extractor for keek.com: media and thumbnail URLs are built directly from the
# video id; title/uploader are scraped from og:title and the bio block.
# NOTE(review): numbered listing with gaps (IE_NAME and the info-dict opener
# are missing); recover the full source from VCS before editing code.
3983 class KeekIE(InfoExtractor):
3984 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3987 def _real_extract(self, url):
3988 m = re.match(self._VALID_URL, url)
3989 video_id = m.group('videoID')
# CDN URLs derive from the id; no API round-trip needed.
3990 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3991 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3992 webpage = self._download_webpage(url, video_id)
3993 m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
3994 title = unescapeHTML(m.group('title'))
# NOTE(review): no `m is None` guards visible here -- a regex miss would raise
# AttributeError; may be handled by lines absent from this listing.
3995 m = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)
3996 uploader = unescapeHTML(m.group('uploader'))
4002 'thumbnail': thumbnail,
4003 'uploader': uploader
# Extractor for ted.com talks and playlists. _VALID_URL is a verbose regex with
# two alternatives (type_playlist vs type_talk); suitable() is overridden to
# pass re.VERBOSE, same as SteamIE.
# NOTE(review): numbered listing with gaps (the `else:` of _real_extract and the
# talk info dict are missing); recover the full source from VCS before editing.
4008 class TEDIE(InfoExtractor):
4009 _VALID_URL=r'''http://www.ted.com/
4010 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
4012 ((?P<type_talk>talks)) # We have a simple talk
4014 /(?P<name>\w+) # Here goes the name and then ".html"
4018 def suitable(cls, url):
4019 """Receives a URL and returns True if suitable for this IE."""
4020 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
4022 def _real_extract(self, url):
4023 m=re.match(self._VALID_URL, url, re.VERBOSE)
# Single talk -> one-element list; playlist -> delegate to the playlist walker.
4024 if m.group('type_talk'):
4025 return [self._talk_info(url)]
4027 playlist_id=m.group('playlist_id')
4028 name=m.group('name')
4029 self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
4030 return self._playlist_videos_info(url,name,playlist_id)
4032 def _talk_video_link(self,mediaSlug):
4033 '''Returns the video link for that mediaSlug'''
4034 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
4036 def _playlist_videos_info(self,url,name,playlist_id=0):
4037 '''Returns the videos of the playlist'''
4039 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
4040 ([.\s]*?)data-playlist_item_id="(\d+)"
4041 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
4043 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
4044 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
4045 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
4046 m_names=re.finditer(video_name_RE,webpage)
# Pair the two scans positionally and recurse into each talk page.
4048 for m_video, m_name in zip(m_videos,m_names):
4049 video_id=m_video.group('video_id')
4050 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
4051 info.append(self._talk_info(talk_url,video_id))
4054 def _talk_info(self, url, video_id=0):
4055 """Return the video for the talk in the url"""
4056 m=re.match(self._VALID_URL, url,re.VERBOSE)
4057 videoName=m.group('name')
4058 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
4059 # If the url includes the language we get the title translated
4060 title_RE=r'<h1><span id="altHeadline" >(?P<title>.*)</span></h1>'
4061 title=re.search(title_RE, webpage).group('title')
# talkDetails JS blob carries the numeric id and the download mediaSlug.
4062 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
4063 "id":(?P<videoID>[\d]+).*?
4064 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
4065 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
4066 thumb_match=re.search(thumb_RE,webpage)
4067 info_match=re.search(info_RE,webpage,re.VERBOSE)
4068 video_id=info_match.group('videoID')
4069 mediaSlug=info_match.group('mediaSlug')
4070 video_url=self._talk_video_link(mediaSlug)
4076 'thumbnail': thumb_match.group('thumbnail')
# Information extractor for myspass.de; metadata comes from the site's own
# XML endpoint rather than from the HTML page.
# NOTE(review): this excerpt is elided (gaps in the original-line numbering) —
# the `return` statements after the trouble() calls, the default values for
# format/description/thumbnail, and the final info dict are not fully visible.
4080 class MySpassIE(InfoExtractor):
4081 _VALID_URL = r'http://www.myspass.de/.*'
4083 def _real_extract(self, url):
4084 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
4086 # video id is the last path element of the URL
4087 # usually there is a trailing slash, so also try the second but last
4088 url_path = compat_urllib_parse_urlparse(url).path
4089 url_parent_path, video_id = os.path.split(url_path)
4091 _, video_id = os.path.split(url_parent_path)
# Fetch the XML metadata document for this id and parse it.
# NOTE(review): fromstring() raises xml.etree.ElementTree.ParseError on
# malformed XML — that case is not handled here.
4094 metadata_url = META_DATA_URL_TEMPLATE % video_id
4095 metadata_text = self._download_webpage(metadata_url, video_id)
4096 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
4098 # extract values from metadata
# <url_flv> and <title> are mandatory; missing either is reported as an error.
4099 url_flv_el = metadata.find('url_flv')
4100 if url_flv_el is None:
4101 self._downloader.trouble(u'ERROR: unable to extract download url')
4103 video_url = url_flv_el.text
# File extension is taken from the download URL's path suffix (without dot).
4104 extension = os.path.splitext(video_url)[1][1:]
4105 title_el = metadata.find('title')
4106 if title_el is None:
4107 self._downloader.trouble(u'ERROR: unable to extract title')
4109 title = title_el.text
4110 format_id_el = metadata.find('format_id')
4111 if format_id_el is None:
4114 format = format_id_el.text
# description / imagePreview are optional: only set when the element exists.
4115 description_el = metadata.find('description')
4116 if description_el is not None:
4117 description = description_el.text
4120 imagePreview_el = metadata.find('imagePreview')
4121 if imagePreview_el is not None:
4122 thumbnail = imagePreview_el.text
# Partial view of the returned info dict (surrounding lines elided).
4131 'thumbnail': thumbnail,
4132 'description': description
4136 def gen_extractors():
4137 """ Return a list of an instance of every supported extractor.
4138 The order does matter; the first extractor matched is the one handling the URL.
4141 YoutubePlaylistIE(),
4165 StanfordOpenClassroomIE(),