Don't be too clever (Fixes Python 3)
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor turns a supported URL into a list of
    dictionaries describing the video(s) behind it; the FileDownloader
    then acts on that information, usually by downloading the video.

    Required dictionary fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional fields:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses should define a _VALID_URL regexp and re-define the
    _real_initialize() and _real_extract() methods; _real_extract()
    must return a *list* of the dictionaries described above.  They
    should usually also be added to the list of extractors.  Broken
    extractors should set _WORKING = False so users are warned and
    the tests are skipped.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Create the extractor and attach the (optional) downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Return True if this extractor can handle the given URL."""
        return bool(re.match(cls._VALID_URL, url))

    @classmethod
    def working(cls):
        """Return whether this extractor is known to be functional."""
        return cls._WORKING

    def initialize(self):
        """Run one-time setup (authentication, etc.), at most once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extract URL information and return it as a list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the downloader that will consume the extracted info."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derive the display name by stripping the trailing "IE"
        # from the concrete class name.
        return self.__class__.__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Open the URL (or Request) and return the response handle."""
        message = u'Downloading video webpage' if note is None else note
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, message))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            failure = u'Unable to download webpage' if errnote is None else errnote
            raise ExtractorError(u'%s: %s' % (failure, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Download the page and return its body decoded as UTF-8 text."""
        handle = self._request_webpage(url_or_request, video_id, note, errnote)
        raw = handle.read()
        return raw.decode('utf-8', 'replace')
131
132
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com.

    Matches single-video URLs (watch pages, youtu.be short links,
    embeds and naked video IDs).  During initialization it sets the
    site language to English, optionally logs in (explicit credentials
    or .netrc) and confirms age.
    """

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # Maps format (itag) code -> file extension used for the output name.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # Maps format (itag) code -> dimensions shown by --list-formats / --get-format.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL; defer them to YoutubePlaylistIE.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timed-text XML into SubRip (.srt) text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            # Captions without an explicit duration are shown for 4 seconds.
            if not dur: dur = '4'
            start = float(start)
            end = start + float(dur)
            # SRT timestamps: HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _extract_subtitles(self, video_id):
        """Fetch closed captions for video_id.

        Returns a (warning_message, srt_contents) pair; exactly one of
        the two elements is None.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Map language code -> track name from the listing page.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language preference: --sub-lang option, then English, then whatever is first.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        params = compat_urllib_parse.urlencode({
            'lang': srt_lang,
            'name': srt_lang_list[srt_lang].encode('utf-8'),
            'v': video_id,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: Did not fetch video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print one 'code : extension [dimensions]' line per available format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set site language, then (if credentials exist) log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the hidden GALX / dsh tokens the login form expects.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is served again, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID from url, or report an error and return None."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # Group 1 is the optional URL prefix; group 2 is the video ID itself.
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Download the watch page and get_video_info data for the URL and
        return a list of info dictionaries, one per selected format."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Unescape the JS-escaped URL (\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' variants until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                # NOTE(review): bare except silently skips format strings that
                # do not match; it also hides unrelated errors.
                except:
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): only 'itag' and 'url' are checked above, but 'sig'
            # is indexed unconditionally here — a stream entry without a 'sig'
            # field would raise KeyError.  Confirm all entries carry one.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        # Build one result dictionary per (format, URL) pair selected above.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
591
592
593 class MetacafeIE(InfoExtractor):
594     """Information Extractor for metacafe.com."""
595
596     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
597     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
598     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
599     IE_NAME = u'metacafe'
600
601     def __init__(self, downloader=None):
602         InfoExtractor.__init__(self, downloader)
603
604     def report_disclaimer(self):
605         """Report disclaimer retrieval."""
606         self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
607
608     def report_age_confirmation(self):
609         """Report attempt to confirm age."""
610         self._downloader.to_screen(u'[metacafe] Confirming age')
611
612     def report_download_webpage(self, video_id):
613         """Report webpage download."""
614         self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
615
616     def report_extraction(self, video_id):
617         """Report information extraction."""
618         self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
619
    def _real_initialize(self):
        """Bypass metacafe's family filter: fetch the disclaimer page, then
        POST the age-confirmation form so later requests are unfiltered."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            # The body is read only for its cookie/session side effect;
            # the content itself is never inspected.
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return
642
643     def _real_extract(self, url):
644         # Extract id and simplified title from URL
645         mobj = re.match(self._VALID_URL, url)
646         if mobj is None:
647             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
648             return
649
650         video_id = mobj.group(1)
651
652         # Check if video comes from YouTube
653         mobj2 = re.match(r'^yt-(.*)$', video_id)
654         if mobj2 is not None:
655             self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
656             return
657
658         # Retrieve video webpage to extract further information
659         request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
660         try:
661             self.report_download_webpage(video_id)
662             webpage = compat_urllib_request.urlopen(request).read()
663         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
664             self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
665             return
666
667         # Extract URL, uploader and title from webpage
668         self.report_extraction(video_id)
669         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
670         if mobj is not None:
671             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
672             video_extension = mediaURL[-3:]
673
674             # Extract gdaKey if available
675             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
676             if mobj is None:
677                 video_url = mediaURL
678             else:
679                 gdaKey = mobj.group(1)
680                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
681         else:
682             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
683             if mobj is None:
684                 self._downloader.trouble(u'ERROR: unable to extract media URL')
685                 return
686             vardict = compat_parse_qs(mobj.group(1))
687             if 'mediaData' not in vardict:
688                 self._downloader.trouble(u'ERROR: unable to extract media URL')
689                 return
690             mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
691             if mobj is None:
692                 self._downloader.trouble(u'ERROR: unable to extract media URL')
693                 return
694             mediaURL = mobj.group(1).replace('\\/', '/')
695             video_extension = mediaURL[-3:]
696             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
697
698         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
699         if mobj is None:
700             self._downloader.trouble(u'ERROR: unable to extract title')
701             return
702         video_title = mobj.group(1).decode('utf-8')
703
704         mobj = re.search(r'submitter=(.*?);', webpage)
705         if mobj is None:
706             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
707             return
708         video_uploader = mobj.group(1)
709
710         return [{
711             'id':       video_id.decode('utf-8'),
712             'url':      video_url.decode('utf-8'),
713             'uploader': video_uploader.decode('utf-8'),
714             'upload_date':  None,
715             'title':    video_title,
716             'ext':      video_extension.decode('utf-8'),
717         }]
718
719
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL and metadata from a Dailymotion video page."""
        # The video id is the first path component, with any trailing
        # "_slug" part or query string stripped off
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = m.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Fetch the page with the family filter disabled so that
        # age-restricted videos are also served
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # The media URLs live inside the page's "flashvars" variable
        self.report_extraction(video_id)
        m = re.search(r'\s*var flashvars = (.*)', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(m.group(1))

        # Pick the best available quality, highest first
        max_quality = None
        for quality_key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if quality_key in flashvars:
                max_quality = quality_key
                self._downloader.to_screen(u'[dailymotion] Using %s' % quality_key)
                break
        if max_quality is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        m = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        # The URL is percent-encoded and carries JSON-escaped slashes
        video_url = compat_urllib_parse.unquote(m.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        m = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(m.group('title'))

        # Try the regular owner markup first, then the markup used for
        # official users; a missing uploader is only a warning
        video_uploader = None
        m = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if m is not None:
            video_uploader = m.group(1)
        else:
            m_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if m_official is not None:
                video_uploader = m_official.group(1)
            else:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # Upload date is rendered as DD-MM-YYYY; store it as YYYYMMDD
        video_upload_date = None
        m = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if m is not None:
            video_upload_date = m.group(3) + m.group(2) + m.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
807
808
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the .flv URL, title and uploader from a photobucket page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # Decode once here: urlopen().read() returns bytes on Python 3,
            # and the str regexes below would otherwise fail on it
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # No .decode() on the groups: they are already text, and str has
        # no decode() method on Python 3
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
872
873
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information.

        Non-'/watch/' URLs are first rewritten to their canonical /watch/
        form, after which extraction restarts with new_video=False.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                # Decode once here: urlopen().read() returns bytes on
                # Python 3, and the str regexes below would fail on it
                webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # Group 1 only captures the literal 'people'/'profile' path
        # component; the uploader name is in group 2
        video_uploader = mobj.group(2)

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1)

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1)
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (needed for the playlist request)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        # Already text after unquoting; str.decode would fail on Python 3
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2))
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1015
1016
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from the config JSON embedded in a Vimeo page."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            # Normalize HLS redirect links to the canonical watch page
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: the ' = {config:' marker is missing from the page;
            # ValueError: the extracted snippet is not valid JSON.
            # A bare "except:" here used to hide unrelated bugs (and even
            # KeyboardInterrupt), so we only catch the expected failures.
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best quality bucket that has at least one entry
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1135
1136
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return its contents as text, or None on error."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            # Decode once here: urlopen().read() returns bytes on Python 3,
            # and the str regexes used by grep_webpage would fail on it
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url and collect regex groups into a dict.

        matchTuples is a list of (group_index, key, error_message) tuples;
        returns None (after reporting trouble) when anything fails.
        """
        page = self.fetch_webpage(url)
        if page is None:
            # fetch_webpage already reported the failure; bail out instead
            # of crashing with a TypeError in re.search(regex, None)
            return
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve a live-stream page down to its rtmp URL and play path."""
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r"(s_artestras_scst_geoFRDE_" + video_lang + r".*?)'.*?" +
                r"(http://.*?\.swf).*?" +
                r"(rtmp://.*?)'",
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))
        # NOTE(review): video_url is computed but never returned or passed
        # on, and _real_extract discards this method's result, so live
        # streams are effectively not downloaded -- confirm intended
        # behavior before relying on live-stream support.

    def extractPlus7Stream(self, url):
        """Follow the videoref XML chain of an Arte+7 page to an info dict."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                r'<name>(.*?)</name>.*?' +
                r'<dateVideo>(.*?)</dateVideo>.*?' +
                r'<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            # No .decode() here: the value is already text, and str has no
            # decode() method on Python 3
            'title':        info.get('title'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live pages ("index-NNN.html") take a different extraction path
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1271
1272
1273 class GenericIE(InfoExtractor):
1274     """Generic last-resort information extractor."""
1275
1276     _VALID_URL = r'.*'
1277     IE_NAME = u'generic'
1278
1279     def __init__(self, downloader=None):
1280         InfoExtractor.__init__(self, downloader)
1281
1282     def report_download_webpage(self, video_id):
1283         """Report webpage download."""
1284         self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1285         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1286
1287     def report_extraction(self, video_id):
1288         """Report information extraction."""
1289         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1290
1291     def report_following_redirect(self, new_url):
1292         """Report information extraction."""
1293         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1294
    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        # Request subclass that issues HEAD instead of GET, so the final
        # URL can be discovered without downloading the response body
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Some servers emit unencoded spaces in Location headers
                    newurl = newurl.replace(' ', '%20')
                    # Drop entity headers: a HEAD request carries no body
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    # Any other code is treated as an error, not a redirect
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # Re-issue the same URL as a plain (GET) request
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        # NOTE: a bare OpenerDirector starts with no handlers, so only the
        # ones added below participate (handler order sets priority)
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        # geturl() reflects the URL after all redirects were followed
        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL back means there was no redirect: not a shortener
        if url == new_url:
            return False

        # Restart the extraction chain with the resolved URL
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True
1349
    def _real_extract(self, url):
        """Last-resort extraction: scrape the page for anything that looks
        like a direct video URL (JW Player / SWFObject style embeds) and
        return a single info dictionary for it.
        """
        # _test_redirect downloads the redirect target itself; nothing
        # more to do here in that case.
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # Derive the extension from the file name and strip it from the id,
        # e.g. "clip.mp4" -> id "clip", ext "mp4".
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1420
1421
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries ("ytsearchN:terms")."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and download the requested results.

        Prefix semantics: '' -> 1 result, 'all' -> the API maximum,
        otherwise the literal number (capped at _max_youtube_results).
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first ':' only, so the search terms themselves may
        # contain colons (e.g. "ytsearch3:re: your mail"); a plain split
        # would raise ValueError on the tuple unpacking.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                self._downloader.trouble(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API reports the true total; never request more pages than
            # it can actually deliver.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        # 'video_id' rather than 'id' to avoid shadowing the builtin.
        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1500
1501
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries ("gvsearchN:terms")."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and download the requested results.

        Prefix semantics: '' -> 1 result, 'all' -> the maximum,
        otherwise the literal number (capped at _max_google_results).
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first ':' only, so search terms containing colons
        # don't blow up the tuple unpacking with ValueError.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, stopping as soon as n are collected.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for vid in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                        return

            # No "next page" link means the result listing is exhausted.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                return

            pagenum = pagenum + 1
1582
1583
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries ("yvsearchN:terms")."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and download the requested results.

        Prefix semantics: '' -> 1 result, 'all' -> the maximum,
        otherwise the literal number (capped at _max_yahoo_results).
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first ':' only, so search terms containing colons
        # don't blow up the tuple unpacking with ValueError.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers; the set gives O(1) de-duplication.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for vid in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                        return

            # No "Next" link means the result listing is exhausted.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                return

            pagenum = pagenum + 1
1668
1669
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Verbose-mode regex: group 1 captures a playlist id embedded in a
    # playlist/user page URL; group 2 captures a bare id with a known
    # prefix (PL/EC/UU). Exactly one of the two groups matches.
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  user/.*?/user/
                        |  p/
                        |  user/.*?#[pg]/c/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is required because _VALID_URL is written in verbose form.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Fetch all playlist entries via the GData JSON API and download
        them in playlist order, honouring playliststart/playlistend."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        # videos collects (position, watch-page URL) pairs so we can sort
        # by playlist position afterwards.
        videos = []

        while True:
            self.report_download_page(playlist_id, page_num)

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.trouble(u'ERROR: Invalid JSON in API response: ' + compat_str(err))
                return

            if not 'feed' in response or not 'entry' in response['feed']:
                self._downloader.trouble(u'ERROR: Got a malformed response from YouTube API')
                return
            # Entries without a 'content' key are skipped (no playable source).
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            # A short page (fewer than _MAX_RESULTS entries) is the last one.
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # Sort by playlist position, then keep only the URLs.
        videos = [v[1] for v in sorted(videos)]
        total = len(videos)

        # Apply --playlist-start / --playlist-end (1-based; -1 = "to the end").
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            videos = videos[playliststart:]
        else:
            videos = videos[playliststart:playlistend]

        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))

        for video in videos:
            self._downloader.download([video])
        return
1762
1763
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Walk the paginated channel listing and queue every video found."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        # Keep fetching pages until the "next page" marker disappears.
        while True:
            self.report_download_page(channel_id, pagenum)
            request = compat_urllib_request.Request(self._TEMPLATE_URL % (channel_id, pagenum))
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # De-duplicate within the page while keeping first-seen order.
            seen_here = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                candidate = match.group(1)
                if candidate not in seen_here:
                    seen_here.append(candidate)
            video_ids.extend(seen_here)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1814
1815
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect all of a user's uploads via the GData API and download them."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData feed is paginated (50 ids per request at the moment),
        # so keep asking for the next slice until a short page shows up.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect the ids on this page, dropping duplicates but
            # preserving the order in which they first appear.
            ids_in_page = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in ids_in_page:
                    ids_in_page.append(candidate)

            video_ids.extend(ids_in_page)

            # A page holding fewer than _GDATA_PAGE_SIZE ids must be the
            # last one - no need for another round trip.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Apply --playlist-start / --playlist-end (-1 stands for "no end").
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1897
1898
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Enumerate a blip.tv user's episodes via Ajax calls and download them."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # The numeric users_id is embedded in the profile page; without it
        # the episode-list endpoint cannot be queried. Check explicitly
        # instead of letting a None match raise an uncaught AttributeError.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract users_id for %s' % username)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # compat_str(err) for consistency with the rest of the file.
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers in first-seen order. Compare the
            # unescaped value so the duplicate check matches what is stored.
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Apply --playlist-start / --playlist-end (-1 stands for "no end").
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1988
1989
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Extract the real download URL and title for a depositfiles link."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # NOTE(review): the .decode('utf-8') calls below assume byte strings
        # (true on Python 2, where urlopen().read() and url.split() yield
        # str/bytes); on Python 3 file_id would be str and have no .decode -
        # confirm against the py3 compat layer.
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
2048
2049
2050 class FacebookIE(InfoExtractor):
2051     """Information Extractor for Facebook"""
2052
2053     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2054     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2055     _NETRC_MACHINE = 'facebook'
2056     IE_NAME = u'facebook'
2057
2058     def report_login(self):
2059         """Report attempt to log in."""
2060         self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
2061
    def _real_initialize(self):
        """Log in to Facebook using --username/--password or .netrc credentials.

        Missing downloader or missing credentials silently skips login;
        login failures are reported as warnings, not hard errors.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    # Raised (and caught just below) so both failure modes
                    # produce the same warning path.
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials available; proceed without logging in.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means the login was rejected.
            # NOTE(review): login_results is bytes on Python 3, where this
            # str pattern would raise TypeError - confirm py3 handling.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return
2105
2106     def _real_extract(self, url):
2107         mobj = re.match(self._VALID_URL, url)
2108         if mobj is None:
2109             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2110             return
2111         video_id = mobj.group('ID')
2112
2113         url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2114         webpage = self._download_webpage(url, video_id)
2115
2116         BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
2117         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2118         m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2119         if not m:
2120             raise ExtractorError(u'Cannot parse data')
2121         data = dict(json.loads(m.group(1)))
2122         params_raw = compat_urllib_parse.unquote(data['params'])
2123         params = json.loads(params_raw)
2124         video_url = params['hd_src']
2125         if not video_url:
2126             video_url = params['sd_src']
2127         if not video_url:
2128             raise ExtractorError(u'Cannot find video URL')
2129         video_duration = int(params['video_duration'])
2130
2131         m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2132         if not m:
2133             raise ExtractorError(u'Cannot find title in webpage')
2134         video_title = unescapeHTML(m.group(1))
2135
2136         info = {
2137             'id': video_id,
2138             'title': video_title,
2139             'url': video_url,
2140             'ext': 'mp4',
2141             'duration': video_duration,
2142             'thumbnail': params['thumbnail_src'],
2143         }
2144         return [info]
2145
2146
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video information via blip.tv's JSON API.

        If the JSON URL answers with a video/* Content-Type, the URL is a
        direct media link and is returned as-is (keeping the urlhandle so
        the downloader can reuse the open connection).
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Ask the same URL for its JSON description
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different data depending on the User-Agent
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # No .decode() here: url is already a text string, and
                # str has no decode() method on Python 3
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The payload is either wrapped in a 'Post' object or flat
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                # Derive the extension from the media URL
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        return [info]
2236
2237
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the FLV URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fixed typo: this called self._download.trouble, which raised
            # AttributeError instead of reporting the invalid URL
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link carries the media base URL; the video itself
        # lives at <base>/<id>.flv
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2286
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Bitrates the site is known to offer (highest first); only used as
    # reference — format selection below works off the rendition list
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension for each bitrate (for --list-formats display)
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Frame dimensions for each bitrate (for --list-formats display)
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the default
        # InfoExtractor.suitable() match would not work here
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report download of the per-media player configuration."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report download of the episode's MRSS show index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the available format codes with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract one info dict per part of the requested episode or clip.

        Flow: resolve shortname/latest-episode redirects, locate the
        mtvnservices media URI in the page, fetch the MRSS index of parts,
        then for each part fetch its configuration, pick a bitrate and
        rewrite the RTMP URL into a downloadable HTTP one.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # ':tds' / ':colbert' style shortcuts map to the show's
        # full-episodes page, which redirects to the newest episode
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            # Clip URLs carry the title in a show-specific group
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # An empty 'episode' group means "download the newest episode"
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The full-episodes page redirected us; re-parse the final URL
            # to learn which concrete episode we landed on
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # Look for the mtvnservices player URL embedded in the page
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the media id in a data-mgid
            # attribute without a URL prefix; extract that alternate
            # reference and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        # Fetch the MRSS index, which lists every part of the episode
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # The <guid> looks like '...:<show>.com:...:<shortid>'
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            # Per-part configuration lists the available renditions
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, rtmp-url) pairs from the renditions
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Map the RTMP path onto what appears to be an HTTP mirror of
            # the same content — NOTE(review): mirror layout assumed stable
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            # Parts are numbered from 1 in the title
            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2481
2482
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the video URL from the player configuration referenced
        by the page's og:video meta tag."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Decode using the charset declared in the Content-Type
            # response header, defaulting to UTF-8
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # NOTE(review): the meta-tag searches below assume a match; a page
        # missing one would raise AttributeError on .group(1) — TODO confirm
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config URL in its query string
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            # Same charset-sniffing as for the page itself
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON: single quotes are used.
        # NOTE(review): this naive replacement would corrupt values that
        # themselves contain quote characters
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        # The second playlist entry holds the actual video URL
        # (index 0 is presumably a lead-in — TODO confirm)
        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2556
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Extractor currently marked as broken/disabled
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the media URL via the moogaloop metadata XML and the
        referenced Adobe f4m manifest."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Filled in incrementally below
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # Pull title, description, thumbnail and the manifest location
        # out of the metadata XML
        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        # hdcore marks the request as coming from an HDS-capable player
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # Elements in the manifest live in the Adobe f4m XML namespace
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Build the URL of the first fragment of the f4f stream
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2627
2628
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape media URL, title and thumbnail from the video page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The media URL sits URL-encoded in the player's flashvars
        m_url = re.search(r'flv_url=(.+?)&', webpage)
        if m_url is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(m_url.group(1))

        # The title comes from the page <title> tag
        m_title = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if m_title is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = m_title.group(1)

        # The thumbnail URL appears verbatim in the page source
        m_thumb = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if m_thumb is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': m_thumb.group(0),
            'description': None,
        }]
2686
2687
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com.

    Resolves the track via the public resolve.json API, then asks the
    streams endpoint for the 128 kbps MP3 stream URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The URL encodes uploader and the slug of the song title
        uploader, slug_title = match.group(1), match.group(2)

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the canonical track URL into its JSON description
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            resolve_request = compat_urllib_request.Request(resolv_url)
            info_json = compat_urllib_request.urlopen(resolve_request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Ask the streams endpoint for the concrete media URLs
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            streams_request = compat_urllib_request.Request(streams_url)
            stream_json = compat_urllib_request.urlopen(streams_request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2760
2761
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Build the RTMP URL from the base64-encoded media id in the page."""
        if re.match(self._VALID_URL, url) is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real media id is base64-encoded in a JS attribute
        m = re.search(r"jsclassref='([^']*)'", webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(m.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        m = re.search(r'contentTitle = "(.*?)";', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = m.group(1)

        # Description is optional; fall back to a placeholder
        m = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        video_description = m.group(1) if m is not None else u'No description available.'

        # Derive id and extension from the last path component of the URL
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2815
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # try the next candidate
                pass

        return None

    def _print_formats(self, formats):
        """Print a human-readable table of format / bitrate / extension."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url; match groups are already
        # text strings, so no .decode() calls (they crash on Python 3)
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON; urlopen returns bytes, so decode before json.loads
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # pick the first format with a live URL
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2930
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on URL type: a single video, a course page, or the root page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            # fromstring accepts bytes, so no decode is necessary here
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                # Decode to text before the regex matching below: on Python 3
                # urlopen().read() returns bytes and matching a str pattern
                # against bytes raises TypeError.
                rootpage = compat_urllib_request.urlopen(rootURL).read().decode('utf-8', 'ignore')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3042
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the mediaGen rendition URL and metadata from an MTV video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # The webpage is already a text string, so no .decode() on the
        # match groups (calling str.decode crashes on Python 3).
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3122
3123
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Build a pseudo-random session id from the current time."""
        millis = int(time.time() * 1000)
        rand_a = random.randint(1000, 1998)
        rand_b = random.randint(1000, 9999)
        return "%d%d%d" % (millis, rand_a, rand_b)

    def _get_file_ID_mix_string(self, seed):
        """Return the alphabet, deterministically shuffled by *seed*."""
        pool = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        shuffled = []
        seed = float(seed)
        for _ in range(len(pool)):
            # Linear-congruential step, then pick a position within the
            # (shrinking) remaining pool. All pool characters are unique,
            # so popping by index matches removing by value.
            seed = (seed * 211 + 30031) % 65536
            pos = int(math.floor(seed / 65536 * len(pool)))
            shuffled.append(pool.pop(pos))
        return shuffled

    def _get_file_id(self, fileId, seed):
        """Translate the '*'-separated index string into the real file id."""
        table = self._get_file_ID_mix_string(seed)
        return ''.join(table[int(idx)] for idx in fileId.split('*') if idx)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            raw_json = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(raw_json.decode('utf-8'))

            entry = config['data'][0]
            video_title = entry['title']
            seed = entry['seed']

            requested = self._downloader.params.get('format', None)
            available = list(entry['streamfileids'].keys())

            if requested is None or requested == 'best':
                # Prefer HD when the server offers it.
                chosen = 'hd2' if 'hd2' in available else 'flv'
                ext = u'flv'
            elif requested == 'worst':
                chosen = 'mp4'
                ext = u'mp4'
            else:
                chosen = 'flv'
                ext = u'flv'

            fileid = entry['streamfileids'][chosen]
            keys = [seg['k'] for seg in entry['segs'][chosen]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters 8-9 of the file id carry the (hex) segment number,
        # so they are replaced per segment below.
        files_info = []
        for index, key in enumerate(keys):
            seg_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, seg_fileid, key)

            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
3233
3234
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Pull the flv URL, title and thumbnail out of an xnxx video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the page content
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        match = re.search(self.VIDEO_URL_RE, webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        match = re.search(self.VIDEO_TITLE_RE, webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        match = re.search(self.VIDEO_THUMB_RE, webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3297
3298
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report entry title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # Bug fix: previously fell through and crashed on mobj.group(1)
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # Bug fix: previously continued and crashed on the empty list
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3422
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN URL is derived from the page path rather than scraped.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # Return the first group of rexp in the page (HTML-unescaped),
            # or default when the pattern does not match.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Key was misspelled 'uploader_date'; the documented field name
            # consumed downstream is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3458
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one API page and return (item_count, info_dicts).

        On failure, reports trouble and returns (0, []) so the caller's
        tuple unpacking does not crash (previously returned None, which
        raised TypeError in _real_extract).
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            return (0, [])

        response = json.loads(webpage)
        if not isinstance(response, list):
            # API errors come back as a dict with an 'error' field
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins 'YYYY-MM-DD'; strip dashes for upload_date
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL: archives are paginated
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short (or failed) page means we have reached the end
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3545
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # Raise instead of falling through: the old code went on to call
            # m.group() on None and crashed with an AttributeError.
            raise ExtractorError(u'Unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            raise ExtractorError(u'Cannot find video title')
        title = unescapeHTML(m.group('title'))

        # The description is optional; keep None when it is absent.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3582
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # Every field below is required to build the result; the old code
        # only reported the failure and then crashed with an AttributeError
        # on m.group(), so raise explicitly instead.
        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        if not m:
            raise ExtractorError(u'Cannot find status ID')
        status_id = m.group(1)

        # The description doubles as the title, so it is required too.
        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find description')
        desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find uploader')
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        if not m:
            raise ExtractorError(u'Cannot find upload date')
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        title = desc
        # The video file can be derived directly from the status id.
        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mov',
            'title': title,
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
        }
        return [info]
3631
class SteamIE(InfoExtractor):
    """Information extractor for videos on store.steampowered.com."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)

        # The page embeds one movie definition, one title and one thumbnail
        # per video; walk the three streams of matches in lockstep.
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        movie_matches = re.finditer(urlRE, webpage)
        title_matches = re.finditer(namesRE, webpage)
        thumb_matches = re.finditer(thumbsRE, webpage)

        videos = []
        for movie, name, thumb in zip(movie_matches, title_matches, thumb_matches):
            vid_id = movie.group('videoID')
            vid_url = movie.group('videoURL')
            if not vid_url:
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % vid_id)
            videos.append({
                'id': vid_id,
                'url': vid_url,
                'ext': 'flv',
                'title': unescapeHTML(name.group('videoName')),
                'thumbnail': thumb.group('thumbnail')
            })
        return videos
3672
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The FLV file lives on the CDN and is addressed by the video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id

        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }]
3694
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        # The show metadata is embedded as a JSON blob in an inline script.
        metadata_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not metadata_match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(metadata_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbit/s stream and infer the container from the path.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3729
3730
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry in formats whose 'format' equals req_format, or None."""
        for x in formats:
            if x['format'] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The site shows an age gate; pretend it has already been passed.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (optional)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'WARNING: unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader (optional)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'WARNING: unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The fifth path component encodes "<size>_<bitrate>_<id>".
            format_parts = path.split('/')[4].split('_')[:2]
            size, bitrate = format_parts
            format = "-".join(format_parts)
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            # BUG FIX: the old code tested the stale `result` variable here
            # (always non-None at this point), so an unavailable format was
            # silently returned as [None] instead of being reported.
            if format is None:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            return [format]
3847
3848
3849
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Both the id and the title come straight from the URL.
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL from the player configuration.
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUG FIX: the old message wrongly said "unable to extract video
            # title" although the date extraction is what failed.
            self._downloader.trouble(u'ERROR: unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3891
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The landing page holds the title and a link to the embed page,
        # which in turn holds the actual video URL.
        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flash player receives the file URL via an addVariable call.
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = source_match.group('source')

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3937
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix metadata is embedded as a JSON assignment in the page.
        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # A random session id lets us page through the mix via the play API.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        res = []
        track_index = 0
        while True:
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_index+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            res.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
            track_index += 1
        return res
3981
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Both the video file and its thumbnail are served from the CDN
        # and can be derived directly from the id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        uploader_match = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': unescapeHTML(title_match.group('title')),
            'thumbnail': thumbnail,
            'uploader': unescapeHTML(uploader_match.group('uploader'))
        }]
4005
class TEDIE(InfoExtractor):
    """Information extractor for www.ted.com talks and playlists."""
    # Verbose regex: distinguishes /playlists/<id>/<name> from /talks/<name>.
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the default suitable()
        # (which matches without that flag) cannot be used.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to single-talk or playlist extraction based on the URL."""
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
            return self._playlist_videos_info(url,name,playlist_id)

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Each talk appears as an <li> with data attributes; the matching
        # title/link lives in a separate <p class="talk-title"> element, so
        # the two match streams are walked in lockstep below.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        info=[]
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            # Each talk page is fetched individually for its full metadata.
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            info.append(self._talk_info(talk_url,video_id))
        return info

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<h1><span id="altHeadline" >(?P<title>.*)</span></h1>'
        title=re.search(title_RE, webpage).group('title')
        # The talkDetails script block carries the numeric id and the
        # mediaSlug needed to build the download URL.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        # Prefer the id found in the page over the one passed in (which may
        # be the default 0 for direct talk URLs).
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
4078
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Extract video information via the myspass XML metadata service."""
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.trouble(u'ERROR: unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: this branch referenced the undefined name `ext` and
            # raised a NameError; fall back to the file extension instead.
            format = extension
        else:
            format = format_id_el.text
        # description and thumbnail are optional.
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4134
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Most specific extractors first; GenericIE matches almost anything and
    # must therefore stay last.
    ie_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        JustinTVIE,
        FunnyOrDieIE,
        TweetReelIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        GenericIE,
    )
    return [klass() for klass in ie_classes]
4186
4187