_ Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import
   5
   6 import base64
   7 import datetime
   8 import itertools
   9 import netrc
  10 import os
  11 import re
  12 import socket
  13 import time
  14 import email.utils
  15 import xml.etree.ElementTree
  16 import random
  17 import math
  18
  19 from .utils import *
  20
  21
  22 class InfoExtractor(object):
  23     """Information Extractor class.
  24
  25     Information extractors are the classes that, given a URL, extract
  26     information about the video (or videos) the URL refers to. This
  27     information includes the real video URL, the video title, author and
  28     others. The information is stored in a dictionary which is then
  29     passed to the FileDownloader. The FileDownloader processes this
  30     information possibly downloading the video to the file system, among
  31     other possible outcomes.
  32
  33     The dictionaries must include the following fields:
  34
  35     id:             Video identifier.
  36     url:            Final video URL.
  37     title:          Video title, unescaped.
  38     ext:            Video filename extension.
  39
  40     The following fields are optional:
  41
  42     format:         The video format, defaults to ext (used for --get-format)
  43     thumbnail:      Full URL to a video thumbnail image.
  44     description:    One-line video description.
  45     uploader:       Full name of the video uploader.
  46     upload_date:    Video upload date (YYYYMMDD).
  47     uploader_id:    Nickname or id of the video uploader.
  48     location:       Physical location of the video.
  49     player_url:     SWF Player URL (used for rtmpdump).
  50     subtitles:      The .srt file contents.
  51     urlhandle:      [internal] The urlHandle to be used to download the file,
  52                     like returned by urllib.request.urlopen
  53
  54     The fields should all be Unicode strings.
  55
  56     Subclasses of this one should re-define the _real_initialize() and
  57     _real_extract() methods and define a _VALID_URL regexp.
  58     Probably, they should also be added to the list of extractors.
  59
  60     _real_extract() must return a *list* of information dictionaries as
  61     described above.
  62
  63     Finally, the _WORKING attribute should be set to False for broken IEs
  64     in order to warn the users and skip the tests.
  65     """
  66
  67     _ready = False
  68     _downloader = None
  69     _WORKING = True
  70
  71     def __init__(self, downloader=None):
  72         """Constructor. Receives an optional downloader."""
  73         self._ready = False
  74         self.set_downloader(downloader)
  75
  76     def suitable(self, url):
  77         """Receives a URL and returns True if suitable for this IE."""
  78         return re.match(self._VALID_URL, url) is not None
  79
  80     def working(self):
  81         """Getter method for _WORKING."""
  82         return self._WORKING
  83
  84     def initialize(self):
  85         """Initializes an instance (authentication, etc)."""
  86         if not self._ready:
  87             self._real_initialize()
  88             self._ready = True
  89
  90     def extract(self, url):
  91         """Extracts URL information and returns it in list of dicts."""
  92         self.initialize()
  93         return self._real_extract(url)
  94
  95     def set_downloader(self, downloader):
  96         """Sets the downloader for this IE."""
  97         self._downloader = downloader
  98
  99     def _real_initialize(self):
 100         """Real initialization process. Redefine in subclasses."""
 101         pass
 102
 103     def _real_extract(self, url):
 104         """Real extraction process. Redefine in subclasses."""
 105         pass
 106
 107     @property
 108     def IE_NAME(self):
 109         return type(self).__name__[:-2]
 110
 111     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
 112         """ Returns the response handle """
 113         if note is None:
 114             note = u'Downloading video webpage'
 115         self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
 116         try:
 117             return compat_urllib_request.urlopen(url_or_request)
 118         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 119             if errnote is None:
 120                 errnote = u'Unable to download webpage'
 121             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
 122
 123     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
 124         """ Returns the data of the page as a string """
 125         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
 126         webpage_bytes = urlh.read()
 127         return webpage_bytes.decode('utf-8', 'replace')
 128
 129
 130 class YoutubeIE(InfoExtractor):
 131     """Information extractor for youtube.com."""
 132
 133     _VALID_URL = r"""^
 134                      (
 135                          (?:https?://)?                                       # http(s):// (optional)
 136                          (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 137                             tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
 138                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 139                          (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
 140                          (?:                                                  # the various things that can precede the ID:
 141                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 142                              |(?:                                             # or the v= param in all its forms
 143                                  (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 144                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 145                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
 146                                  v=
 147                              )
 148                          )?                                                   # optional -> youtube.com/xxxx is OK
 149                      )?                                                       # all until now is optional -> you can pass the naked ID
 150                      ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
 151                      (?(1).+)?                                                # if we found the ID, everything can follow
 152                      $"""
 153     _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 154     _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
 155     _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 156     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 157     _NETRC_MACHINE = 'youtube'
 158     # Listed in order of quality
 159     _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 160     _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 161     _video_extensions = {
 162         '13': '3gp',
 163         '17': 'mp4',
 164         '18': 'mp4',
 165         '22': 'mp4',
 166         '37': 'mp4',
 167         '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 168         '43': 'webm',
 169         '44': 'webm',
 170         '45': 'webm',
 171         '46': 'webm',
 172     }
 173     _video_dimensions = {
 174         '5': '240x400',
 175         '6': '???',
 176         '13': '???',
 177         '17': '144x176',
 178         '18': '360x640',
 179         '22': '720x1280',
 180         '34': '360x640',
 181         '35': '480x854',
 182         '37': '1080x1920',
 183         '38': '3072x4096',
 184         '43': '360x640',
 185         '44': '480x854',
 186         '45': '720x1280',
 187         '46': '1080x1920',
 188     }
 189     IE_NAME = u'youtube'
 190
 191     def suitable(self, url):
 192         """Receives a URL and returns True if suitable for this IE."""
 193         return re.match(self._VALID_URL, url, re.VERBOSE) is not None
 194
 195     def report_lang(self):
 196         """Report attempt to set language."""
 197         self._downloader.to_screen(u'[youtube] Setting language')
 198
 199     def report_login(self):
 200         """Report attempt to log in."""
 201         self._downloader.to_screen(u'[youtube] Logging in')
 202
 203     def report_age_confirmation(self):
 204         """Report attempt to confirm age."""
 205         self._downloader.to_screen(u'[youtube] Confirming age')
 206
 207     def report_video_webpage_download(self, video_id):
 208         """Report attempt to download video webpage."""
 209         self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
 210
 211     def report_video_info_webpage_download(self, video_id):
 212         """Report attempt to download video info webpage."""
 213         self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
 214
 215     def report_video_subtitles_download(self, video_id):
 216         """Report attempt to download video info webpage."""
 217         self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
 218
 219     def report_information_extraction(self, video_id):
 220         """Report attempt to extract video information."""
 221         self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
 222
 223     def report_unavailable_format(self, video_id, format):
 224         """Report extracted video URL."""
 225         self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
 226
 227     def report_rtmp_download(self):
 228         """Indicate the download will use the RTMP protocol."""
 229         self._downloader.to_screen(u'[youtube] RTMP download detected')
 230
 231     def _closed_captions_xml_to_srt(self, xml_string):
 232         srt = ''
 233         texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
 234         # TODO parse xml instead of regex
 235         for n, (start, dur_tag, dur, caption) in enumerate(texts):
 236             if not dur: dur = '4'
 237             start = float(start)
 238             end = start + float(dur)
 239             start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
 240             end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
 241             caption = unescapeHTML(caption)
 242             caption = unescapeHTML(caption) # double cycle, intentional
 243             srt += str(n+1) + '\n'
 244             srt += start + ' --> ' + end + '\n'
 245             srt += caption + '\n\n'
 246         return srt
 247
 248     def _extract_subtitles(self, video_id):
 249         self.report_video_subtitles_download(video_id)
 250         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 251         try:
 252             srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
 253         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 254             return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
 255         srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
 256         srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
 257         if not srt_lang_list:
 258             return (u'WARNING: video has no closed captions', None)
 259         if self._downloader.params.get('subtitleslang', False):
 260             srt_lang = self._downloader.params.get('subtitleslang')
 261         elif 'en' in srt_lang_list:
 262             srt_lang = 'en'
 263         else:
 264             srt_lang = list(srt_lang_list.keys())[0]
 265         if not srt_lang in srt_lang_list:
 266             return (u'WARNING: no closed captions found in the specified language', None)
 267         params = compat_urllib_parse.urlencode({
 268             'lang': srt_lang,
 269             'name': srt_lang_list[srt_lang].encode('utf-8'),
 270             'v': video_id,
 271         })
 272         url = 'http://www.youtube.com/api/timedtext?' + params
 273         try:
 274             srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
 275         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 276             return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
 277         if not srt_xml:
 278             return (u'WARNING: Did not fetch video subtitles', None)
 279         return (None, self._closed_captions_xml_to_srt(srt_xml))
 280
 281     def _print_formats(self, formats):
 282         print('Available formats:')
 283         for x in formats:
 284             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
 285
 286     def _real_initialize(self):
 287         if self._downloader is None:
 288             return
 289
 290         username = None
 291         password = None
 292         downloader_params = self._downloader.params
 293
 294         # Attempt to use provided username and password or .netrc data
 295         if downloader_params.get('username', None) is not None:
 296             username = downloader_params['username']
 297             password = downloader_params['password']
 298         elif downloader_params.get('usenetrc', False):
 299             try:
 300                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 301                 if info is not None:
 302                     username = info[0]
 303                     password = info[2]
 304                 else:
 305                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 306             except (IOError, netrc.NetrcParseError) as err:
 307                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
 308                 return
 309
 310         # Set language
 311         request = compat_urllib_request.Request(self._LANG_URL)
 312         try:
 313             self.report_lang()
 314             compat_urllib_request.urlopen(request).read()
 315         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 316             self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
 317             return
 318
 319         # No authentication to be performed
 320         if username is None:
 321             return
 322
 323         # Log in
 324         login_form = {
 325                 'current_form': 'loginForm',
 326                 'next':     '/',
 327                 'action_login': 'Log In',
 328                 'username': username,
 329                 'password': password,
 330                 }
 331         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
 332         try:
 333             self.report_login()
 334             login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 335             if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 336                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 337                 return
 338         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 339             self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
 340             return
 341
 342         # Confirm age
 343         age_form = {
 344                 'next_url':     '/',
 345                 'action_confirm':   'Confirm',
 346                 }
 347         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
 348         try:
 349             self.report_age_confirmation()
 350             age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 351         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 352             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
 353             return
 354
 355     def _extract_id(self, url):
 356         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
 357         if mobj is None:
 358             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 359             return
 360         video_id = mobj.group(2)
 361         return video_id
 362
 363     def _real_extract(self, url):
 364         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 365         mobj = re.search(self._NEXT_URL_RE, url)
 366         if mobj:
 367             url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 368         video_id = self._extract_id(url)
 369
 370         # Get video webpage
 371         self.report_video_webpage_download(video_id)
 372         url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
 373         request = compat_urllib_request.Request(url)
 374         try:
 375             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
 376         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 377             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
 378             return
 379
 380         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
 381
 382         # Attempt to extract SWF player URL
 383         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 384         if mobj is not None:
 385             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 386         else:
 387             player_url = None
 388
 389         # Get video info
 390         self.report_video_info_webpage_download(video_id)
 391         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 392             video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 393                     % (video_id, el_type))
 394             request = compat_urllib_request.Request(video_info_url)
 395             try:
 396                 video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
 397                 video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
 398                 video_info = compat_parse_qs(video_info_webpage)
 399                 if 'token' in video_info:
 400                     break
 401             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 402                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
 403                 return
 404         if 'token' not in video_info:
 405             if 'reason' in video_info:
 406                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
 407             else:
 408                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
 409             return
 410
 411         # Check for "rental" videos
 412         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 413             self._downloader.trouble(u'ERROR: "rental" videos not supported')
 414             return
 415
 416         # Start extracting information
 417         self.report_information_extraction(video_id)
 418
 419         # uploader
 420         if 'author' not in video_info:
 421             self._downloader.trouble(u'ERROR: unable to extract uploader name')
 422             return
 423         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 424
 425         # uploader_id
 426         video_uploader_id = None
 427         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
 428         if mobj is not None:
 429             video_uploader_id = mobj.group(1)
 430         else:
 431             self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
 432
 433         # title
 434         if 'title' not in video_info:
 435             self._downloader.trouble(u'ERROR: unable to extract video title')
 436             return
 437         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
 438
 439         # thumbnail image
 440         if 'thumbnail_url' not in video_info:
 441             self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
 442             video_thumbnail = ''
 443         else:   # don't panic if we can't find it
 444             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 445
 446         # upload date
 447         upload_date = None
 448         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 449         if mobj is not None:
 450             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 451             format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
 452             for expression in format_expressions:
 453                 try:
 454                     upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
 455                 except:
 456                     pass
 457
 458         # description
 459         video_description = get_element_by_id("eow-description", video_webpage)
 460         if video_description:
 461             video_description = clean_html(video_description)
 462         else:
 463             video_description = ''
 464
 465         # closed captions
 466         video_subtitles = None
 467         if self._downloader.params.get('writesubtitles', False):
 468             (srt_error, video_subtitles) = self._extract_subtitles(video_id)
 469             if srt_error:
 470                 self._downloader.trouble(srt_error)
 471
 472         if 'length_seconds' not in video_info:
 473             self._downloader.trouble(u'WARNING: unable to extract video duration')
 474             video_duration = ''
 475         else:
 476             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
 477
 478         # token
 479         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
 480
 481         # Decide which formats to download
 482         req_format = self._downloader.params.get('format', None)
 483
 484         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 485             self.report_rtmp_download()
 486             video_url_list = [(None, video_info['conn'][0])]
 487         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 488             url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
 489             url_data = [compat_parse_qs(uds) for uds in url_data_strs]
 490             url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
 491             url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
 492
 493             format_limit = self._downloader.params.get('format_limit', None)
 494             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 495             if format_limit is not None and format_limit in available_formats:
 496                 format_list = available_formats[available_formats.index(format_limit):]
 497             else:
 498                 format_list = available_formats
 499             existing_formats = [x for x in format_list if x in url_map]
 500             if len(existing_formats) == 0:
 501                 self._downloader.trouble(u'ERROR: no known formats available for video')
 502                 return
 503             if self._downloader.params.get('listformats', None):
 504                 self._print_formats(existing_formats)
 505                 return
 506             if req_format is None or req_format == 'best':
 507                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 508             elif req_format == 'worst':
 509                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 510             elif req_format in ('-1', 'all'):
 511                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 512             else:
 513                 # Specific formats. We pick the first in a slash-delimeted sequence.
 514                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 515                 req_formats = req_format.split('/')
 516                 video_url_list = None
 517                 for rf in req_formats:
 518                     if rf in url_map:
 519                         video_url_list = [(rf, url_map[rf])]
 520                         break
 521                 if video_url_list is None:
 522                     self._downloader.trouble(u'ERROR: requested format not available')
 523                     return
 524         else:
 525             self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
 526             return
 527
 528         results = []
 529         for format_param, video_real_url in video_url_list:
 530             # Extension
 531             video_extension = self._video_extensions.get(format_param, 'flv')
 532
 533             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
 534                                               self._video_dimensions.get(format_param, '???'))
 535
 536             results.append({
 537                 'id':       video_id,
 538                 'url':      video_real_url,
 539                 'uploader': video_uploader,
 540                 'uploader_id': video_uploader_id,
 541                 'upload_date':  upload_date,
 542                 'title':    video_title,
 543                 'ext':      video_extension,
 544                 'format':   video_format,
 545                 'thumbnail':    video_thumbnail,
 546                 'description':  video_description,
 547                 'player_url':   player_url,
 548                 'subtitles':    video_subtitles,
 549                 'duration':     video_duration
 550             })
 551         return results
 552
 553
 554 class MetacafeIE(InfoExtractor):
 555     """Information Extractor for metacafe.com."""
 556
 557     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 558     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 559     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 560     IE_NAME = u'metacafe'
 561
 562     def __init__(self, downloader=None):
 563         InfoExtractor.__init__(self, downloader)
 564
 565     def report_disclaimer(self):
 566         """Report disclaimer retrieval."""
 567         self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
 568
 569     def report_age_confirmation(self):
 570         """Report attempt to confirm age."""
 571         self._downloader.to_screen(u'[metacafe] Confirming age')
 572
 573     def report_download_webpage(self, video_id):
 574         """Report webpage download."""
 575         self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
 576
 577     def report_extraction(self, video_id):
 578         """Report information extraction."""
 579         self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
 580
 581     def _real_initialize(self):
 582         # Retrieve disclaimer
 583         request = compat_urllib_request.Request(self._DISCLAIMER)
 584         try:
 585             self.report_disclaimer()
 586             disclaimer = compat_urllib_request.urlopen(request).read()
 587         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 588             self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
 589             return
 590
 591         # Confirm age
 592         disclaimer_form = {
 593             'filters': '0',
 594             'submit': "Continue - I'm over 18",
 595             }
 596         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
 597         try:
 598             self.report_age_confirmation()
 599             disclaimer = compat_urllib_request.urlopen(request).read()
 600         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 601             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
 602             return
 603
 604     def _real_extract(self, url):
 605         # Extract id and simplified title from URL
 606         mobj = re.match(self._VALID_URL, url)
 607         if mobj is None:
 608             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 609             return
 610
 611         video_id = mobj.group(1)
 612
 613         # Check if video comes from YouTube
 614         mobj2 = re.match(r'^yt-(.*)$', video_id)
 615         if mobj2 is not None:
 616             self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
 617             return
 618
 619         # Retrieve video webpage to extract further information
 620         request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
 621         try:
 622             self.report_download_webpage(video_id)
 623             webpage = compat_urllib_request.urlopen(request).read()
 624         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 625             self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
 626             return
 627
 628         # Extract URL, uploader and title from webpage
 629         self.report_extraction(video_id)
 630         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 631         if mobj is not None:
 632             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 633             video_extension = mediaURL[-3:]
 634
 635             # Extract gdaKey if available
 636             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 637             if mobj is None:
 638                 video_url = mediaURL
 639             else:
 640                 gdaKey = mobj.group(1)
 641                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 642         else:
 643             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 644             if mobj is None:
 645                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 646                 return
 647             vardict = compat_parse_qs(mobj.group(1))
 648             if 'mediaData' not in vardict:
 649                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 650                 return
 651             mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
 652             if mobj is None:
 653                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 654                 return
 655             mediaURL = mobj.group(1).replace('\\/', '/')
 656             video_extension = mediaURL[-3:]
 657             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
 658
 659         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 660         if mobj is None:
 661             self._downloader.trouble(u'ERROR: unable to extract title')
 662             return
 663         video_title = mobj.group(1).decode('utf-8')
 664
 665         mobj = re.search(r'submitter=(.*?);', webpage)
 666         if mobj is None:
 667             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 668             return
 669         video_uploader = mobj.group(1)
 670
 671         return [{
 672             'id':       video_id.decode('utf-8'),
 673             'url':      video_url.decode('utf-8'),
 674             'uploader': video_uploader.decode('utf-8'),
 675             'upload_date':  None,
 676             'title':    video_title,
 677             'ext':      video_extension.decode('utf-8'),
 678         }]
 679
 680
 681 class DailymotionIE(InfoExtractor):
 682     """Information Extractor for Dailymotion"""
 683
 684     _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 685     IE_NAME = u'dailymotion'
 686
 687     def __init__(self, downloader=None):
 688         InfoExtractor.__init__(self, downloader)
 689
 690     def report_extraction(self, video_id):
 691         """Report information extraction."""
 692         self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
 693
 694     def _real_extract(self, url):
 695         # Extract id and simplified title from URL
 696         mobj = re.match(self._VALID_URL, url)
 697         if mobj is None:
 698             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 699             return
 700
 701         video_id = mobj.group(1).split('_')[0].split('?')[0]
 702
 703         video_extension = 'mp4'
 704
 705         # Retrieve video webpage to extract further information
 706         request = compat_urllib_request.Request(url)
 707         request.add_header('Cookie', 'family_filter=off')
 708         webpage = self._download_webpage(request, video_id)
 709
 710         # Extract URL, uploader and title from webpage
 711         self.report_extraction(video_id)
 712         mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 713         if mobj is None:
 714             self._downloader.trouble(u'ERROR: unable to extract media URL')
 715             return
 716         flashvars = compat_urllib_parse.unquote(mobj.group(1))
 717
 718         for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
 719             if key in flashvars:
 720                 max_quality = key
 721                 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
 722                 break
 723         else:
 724             self._downloader.trouble(u'ERROR: unable to extract video URL')
 725             return
 726
 727         mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 728         if mobj is None:
 729             self._downloader.trouble(u'ERROR: unable to extract video URL')
 730             return
 731
 732         video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
 733
 734         # TODO: support choosing qualities
 735
 736         mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 737         if mobj is None:
 738             self._downloader.trouble(u'ERROR: unable to extract title')
 739             return
 740         video_title = unescapeHTML(mobj.group('title'))
 741
 742         video_uploader = None
 743         mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
 744         if mobj is None:
 745             # lookin for official user
 746             mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
 747             if mobj_official is None:
 748                 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
 749             else:
 750                 video_uploader = mobj_official.group(1)
 751         else:
 752             video_uploader = mobj.group(1)
 753
 754         video_upload_date = None
 755         mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 756         if mobj is not None:
 757             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 758
 759         return [{
 760             'id':       video_id,
 761             'url':      video_url,
 762             'uploader': video_uploader,
 763             'upload_date':  video_upload_date,
 764             'title':    video_title,
 765             'ext':      video_extension,
 766         }]
 767
 768
 769 class PhotobucketIE(InfoExtractor):
 770     """Information extractor for photobucket.com."""
 771
 772     _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 773     IE_NAME = u'photobucket'
 774
 775     def __init__(self, downloader=None):
 776         InfoExtractor.__init__(self, downloader)
 777
 778     def report_download_webpage(self, video_id):
 779         """Report webpage download."""
 780         self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
 781
 782     def report_extraction(self, video_id):
 783         """Report information extraction."""
 784         self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
 785
 786     def _real_extract(self, url):
 787         # Extract id from URL
 788         mobj = re.match(self._VALID_URL, url)
 789         if mobj is None:
 790             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 791             return
 792
 793         video_id = mobj.group(1)
 794
 795         video_extension = 'flv'
 796
 797         # Retrieve video webpage to extract further information
 798         request = compat_urllib_request.Request(url)
 799         try:
 800             self.report_download_webpage(video_id)
 801             webpage = compat_urllib_request.urlopen(request).read()
 802         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 803             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 804             return
 805
 806         # Extract URL, uploader, and title from webpage
 807         self.report_extraction(video_id)
 808         mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
 809         if mobj is None:
 810             self._downloader.trouble(u'ERROR: unable to extract media URL')
 811             return
 812         mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 813
 814         video_url = mediaURL
 815
 816         mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
 817         if mobj is None:
 818             self._downloader.trouble(u'ERROR: unable to extract title')
 819             return
 820         video_title = mobj.group(1).decode('utf-8')
 821
 822         video_uploader = mobj.group(2).decode('utf-8')
 823
 824         return [{
 825             'id':       video_id.decode('utf-8'),
 826             'url':      video_url.decode('utf-8'),
 827             'uploader': video_uploader,
 828             'upload_date':  None,
 829             'title':    video_title,
 830             'ext':      video_extension.decode('utf-8'),
 831         }]
 832
 833
 834 class YahooIE(InfoExtractor):
 835     """Information extractor for video.yahoo.com."""
 836
 837     _WORKING = False
 838     # _VALID_URL matches all Yahoo! Video URLs
 839     # _VPAGE_URL matches only the extractable '/watch/' URLs
 840     _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
 841     _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
 842     IE_NAME = u'video.yahoo'
 843
 844     def __init__(self, downloader=None):
 845         InfoExtractor.__init__(self, downloader)
 846
 847     def report_download_webpage(self, video_id):
 848         """Report webpage download."""
 849         self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
 850
 851     def report_extraction(self, video_id):
 852         """Report information extraction."""
 853         self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
 854
 855     def _real_extract(self, url, new_video=True):
 856         # Extract ID from URL
 857         mobj = re.match(self._VALID_URL, url)
 858         if mobj is None:
 859             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 860             return
 861
 862         video_id = mobj.group(2)
 863         video_extension = 'flv'
 864
 865         # Rewrite valid but non-extractable URLs as
 866         # extractable English language /watch/ URLs
 867         if re.match(self._VPAGE_URL, url) is None:
 868             request = compat_urllib_request.Request(url)
 869             try:
 870                 webpage = compat_urllib_request.urlopen(request).read()
 871             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 872                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 873                 return
 874
 875             mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
 876             if mobj is None:
 877                 self._downloader.trouble(u'ERROR: Unable to extract id field')
 878                 return
 879             yahoo_id = mobj.group(1)
 880
 881             mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
 882             if mobj is None:
 883                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
 884                 return
 885             yahoo_vid = mobj.group(1)
 886
 887             url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
 888             return self._real_extract(url, new_video=False)
 889
 890         # Retrieve video webpage to extract further information
 891         request = compat_urllib_request.Request(url)
 892         try:
 893             self.report_download_webpage(video_id)
 894             webpage = compat_urllib_request.urlopen(request).read()
 895         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 896             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 897             return
 898
 899         # Extract uploader and title from webpage
 900         self.report_extraction(video_id)
 901         mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
 902         if mobj is None:
 903             self._downloader.trouble(u'ERROR: unable to extract video title')
 904             return
 905         video_title = mobj.group(1).decode('utf-8')
 906
 907         mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
 908         if mobj is None:
 909             self._downloader.trouble(u'ERROR: unable to extract video uploader')
 910             return
 911         video_uploader = mobj.group(1).decode('utf-8')
 912
 913         # Extract video thumbnail
 914         mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
 915         if mobj is None:
 916             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 917             return
 918         video_thumbnail = mobj.group(1).decode('utf-8')
 919
 920         # Extract video description
 921         mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
 922         if mobj is None:
 923             self._downloader.trouble(u'ERROR: unable to extract video description')
 924             return
 925         video_description = mobj.group(1).decode('utf-8')
 926         if not video_description:
 927             video_description = 'No description available.'
 928
 929         # Extract video height and width
 930         mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
 931         if mobj is None:
 932             self._downloader.trouble(u'ERROR: unable to extract video height')
 933             return
 934         yv_video_height = mobj.group(1)
 935
 936         mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
 937         if mobj is None:
 938             self._downloader.trouble(u'ERROR: unable to extract video width')
 939             return
 940         yv_video_width = mobj.group(1)
 941
 942         # Retrieve video playlist to extract media URL
 943         # I'm not completely sure what all these options are, but we
 944         # seem to need most of them, otherwise the server sends a 401.
 945         yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
 946         yv_bitrate = '700'  # according to Wikipedia this is hard-coded
 947         request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
 948                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
 949                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
 950         try:
 951             self.report_download_webpage(video_id)
 952             webpage = compat_urllib_request.urlopen(request).read()
 953         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 954             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 955             return
 956
 957         # Extract media URL from playlist XML
 958         mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
 959         if mobj is None:
 960             self._downloader.trouble(u'ERROR: Unable to extract media URL')
 961             return
 962         video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
 963         video_url = unescapeHTML(video_url)
 964
 965         return [{
 966             'id':       video_id.decode('utf-8'),
 967             'url':      video_url,
 968             'uploader': video_uploader,
 969             'upload_date':  None,
 970             'title':    video_title,
 971             'ext':      video_extension.decode('utf-8'),
 972             'thumbnail':    video_thumbnail.decode('utf-8'),
 973             'description':  video_description,
 974         }]
 975
 976
 977 class VimeoIE(InfoExtractor):
 978     """Information extractor for vimeo.com."""
 979
 980     # _VALID_URL matches Vimeo URLs
 981     _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
 982     IE_NAME = u'vimeo'
 983
 984     def __init__(self, downloader=None):
 985         InfoExtractor.__init__(self, downloader)
 986
 987     def report_download_webpage(self, video_id):
 988         """Report webpage download."""
 989         self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
 990
 991     def report_extraction(self, video_id):
 992         """Report information extraction."""
 993         self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
 994
 995     def _real_extract(self, url, new_video=True):
 996         # Extract ID from URL
 997         mobj = re.match(self._VALID_URL, url)
 998         if mobj is None:
 999             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1000             return
1001
1002         video_id = mobj.group('id')
1003         if not mobj.group('proto'):
1004             url = 'https://' + url
1005         if mobj.group('direct_link'):
1006             url = 'https://vimeo.com/' + video_id
1007
1008         # Retrieve video webpage to extract further information
1009         request = compat_urllib_request.Request(url, None, std_headers)
1010         try:
1011             self.report_download_webpage(video_id)
1012             webpage_bytes = compat_urllib_request.urlopen(request).read()
1013             webpage = webpage_bytes.decode('utf-8')
1014         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1015             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1016             return
1017
1018         # Now we begin extracting as much information as we can from what we
1019         # retrieved. First we extract the information common to all extractors,
1020         # and latter we extract those that are Vimeo specific.
1021         self.report_extraction(video_id)
1022
1023         # Extract the config JSON
1024         try:
1025             config = webpage.split(' = {config:')[1].split(',assets:')[0]
1026             config = json.loads(config)
1027         except:
1028             self._downloader.trouble(u'ERROR: unable to extract info section')
1029             return
1030
1031         # Extract title
1032         video_title = config["video"]["title"]
1033
1034         # Extract uploader and uploader_id
1035         video_uploader = config["video"]["owner"]["name"]
1036         video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1037
1038         # Extract video thumbnail
1039         video_thumbnail = config["video"]["thumbnail"]
1040
1041         # Extract video description
1042         video_description = get_element_by_attribute("itemprop", "description", webpage)
1043         if video_description: video_description = clean_html(video_description)
1044         else: video_description = ''
1045
1046         # Extract upload date
1047         video_upload_date = None
1048         mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1049         if mobj is not None:
1050             video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1051
1052         # Vimeo specific: extract request signature and timestamp
1053         sig = config['request']['signature']
1054         timestamp = config['request']['timestamp']
1055
1056         # Vimeo specific: extract video codec and quality information
1057         # First consider quality, then codecs, then take everything
1058         # TODO bind to format param
1059         codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1060         files = { 'hd': [], 'sd': [], 'other': []}
1061         for codec_name, codec_extension in codecs:
1062             if codec_name in config["video"]["files"]:
1063                 if 'hd' in config["video"]["files"][codec_name]:
1064                     files['hd'].append((codec_name, codec_extension, 'hd'))
1065                 elif 'sd' in config["video"]["files"][codec_name]:
1066                     files['sd'].append((codec_name, codec_extension, 'sd'))
1067                 else:
1068                     files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1069
1070         for quality in ('hd', 'sd', 'other'):
1071             if len(files[quality]) > 0:
1072                 video_quality = files[quality][0][2]
1073                 video_codec = files[quality][0][0]
1074                 video_extension = files[quality][0][1]
1075                 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1076                 break
1077         else:
1078             self._downloader.trouble(u'ERROR: no known codec found')
1079             return
1080
1081         video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1082                     %(video_id, sig, timestamp, video_quality, video_codec.upper())
1083
1084         return [{
1085             'id':       video_id,
1086             'url':      video_url,
1087             'uploader': video_uploader,
1088             'uploader_id': video_uploader_id,
1089             'upload_date':  video_upload_date,
1090             'title':    video_title,
1091             'ext':      video_extension,
1092             'thumbnail':    video_thumbnail,
1093             'description':  video_description,
1094         }]
1095
1096
1097 class ArteTvIE(InfoExtractor):
1098     """arte.tv information extractor."""
1099
1100     _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1101     _LIVE_URL = r'index-[0-9]+\.html$'
1102
1103     IE_NAME = u'arte.tv'
1104
1105     def __init__(self, downloader=None):
1106         InfoExtractor.__init__(self, downloader)
1107
1108     def report_download_webpage(self, video_id):
1109         """Report webpage download."""
1110         self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1111
1112     def report_extraction(self, video_id):
1113         """Report information extraction."""
1114         self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
1115
1116     def fetch_webpage(self, url):
1117         request = compat_urllib_request.Request(url)
1118         try:
1119             self.report_download_webpage(url)
1120             webpage = compat_urllib_request.urlopen(request).read()
1121         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1122             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1123             return
1124         except ValueError as err:
1125             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1126             return
1127         return webpage
1128
1129     def grep_webpage(self, url, regex, regexFlags, matchTuples):
1130         page = self.fetch_webpage(url)
1131         mobj = re.search(regex, page, regexFlags)
1132         info = {}
1133
1134         if mobj is None:
1135             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1136             return
1137
1138         for (i, key, err) in matchTuples:
1139             if mobj.group(i) is None:
1140                 self._downloader.trouble(err)
1141                 return
1142             else:
1143                 info[key] = mobj.group(i)
1144
1145         return info
1146
1147     def extractLiveStream(self, url):
1148         video_lang = url.split('/')[-4]
1149         info = self.grep_webpage(
1150             url,
1151             r'src="(.*?/videothek_js.*?\.js)',
1152             0,
1153             [
1154                 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1155             ]
1156         )
1157         http_host = url.split('/')[2]
1158         next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1159         info = self.grep_webpage(
1160             next_url,
1161             r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1162                 '(http://.*?\.swf).*?' +
1163                 '(rtmp://.*?)\'',
1164             re.DOTALL,
1165             [
1166                 (1, 'path',   u'ERROR: could not extract video path: %s' % url),
1167                 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1168                 (3, 'url',    u'ERROR: could not extract video url: %s' % url)
1169             ]
1170         )
1171         video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1172
1173     def extractPlus7Stream(self, url):
1174         video_lang = url.split('/')[-3]
1175         info = self.grep_webpage(
1176             url,
1177             r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1178             0,
1179             [
1180                 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1181             ]
1182         )
1183         next_url = compat_urllib_parse.unquote(info.get('url'))
1184         info = self.grep_webpage(
1185             next_url,
1186             r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1187             0,
1188             [
1189                 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1190             ]
1191         )
1192         next_url = compat_urllib_parse.unquote(info.get('url'))
1193
1194         info = self.grep_webpage(
1195             next_url,
1196             r'<video id="(.*?)".*?>.*?' +
1197                 '<name>(.*?)</name>.*?' +
1198                 '<dateVideo>(.*?)</dateVideo>.*?' +
1199                 '<url quality="hd">(.*?)</url>',
1200             re.DOTALL,
1201             [
1202                 (1, 'id',    u'ERROR: could not extract video id: %s' % url),
1203                 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1204                 (3, 'date',  u'ERROR: could not extract video date: %s' % url),
1205                 (4, 'url',   u'ERROR: could not extract video url: %s' % url)
1206             ]
1207         )
1208
1209         return {
1210             'id':           info.get('id'),
1211             'url':          compat_urllib_parse.unquote(info.get('url')),
1212             'uploader':     u'arte.tv',
1213             'upload_date':  info.get('date'),
1214             'title':        info.get('title').decode('utf-8'),
1215             'ext':          u'mp4',
1216             'format':       u'NA',
1217             'player_url':   None,
1218         }
1219
1220     def _real_extract(self, url):
1221         video_id = url.split('/')[-1]
1222         self.report_extraction(video_id)
1223
1224         if re.search(self._LIVE_URL, video_id) is not None:
1225             self.extractLiveStream(url)
1226             return
1227         else:
1228             info = self.extractPlus7Stream(url)
1229
1230         return [info]
1231
1232
1233 class GenericIE(InfoExtractor):
1234     """Generic last-resort information extractor."""
1235
1236     _VALID_URL = r'.*'
1237     IE_NAME = u'generic'
1238
1239     def __init__(self, downloader=None):
1240         InfoExtractor.__init__(self, downloader)
1241
1242     def report_download_webpage(self, video_id):
1243         """Report webpage download."""
1244         self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1245         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1246
1247     def report_extraction(self, video_id):
1248         """Report information extraction."""
1249         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1250
1251     def report_following_redirect(self, new_url):
1252         """Report information extraction."""
1253         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1254
1255     def _test_redirect(self, url):
1256         """Check if it is a redirect, like url shorteners, in case restart chain."""
1257         class HeadRequest(compat_urllib_request.Request):
1258             def get_method(self):
1259                 return "HEAD"
1260
1261         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1262             """
1263             Subclass the HTTPRedirectHandler to make it use our
1264             HeadRequest also on the redirected URL
1265             """
1266             def redirect_request(self, req, fp, code, msg, headers, newurl):
1267                 if code in (301, 302, 303, 307):
1268                     newurl = newurl.replace(' ', '%20')
1269                     newheaders = dict((k,v) for k,v in req.headers.items()
1270                                       if k.lower() not in ("content-length", "content-type"))
1271                     return HeadRequest(newurl,
1272                                        headers=newheaders,
1273                                        origin_req_host=req.get_origin_req_host(),
1274                                        unverifiable=True)
1275                 else:
1276                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1277
1278         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1279             """
1280             Fallback to GET if HEAD is not allowed (405 HTTP error)
1281             """
1282             def http_error_405(self, req, fp, code, msg, headers):
1283                 fp.read()
1284                 fp.close()
1285
1286                 newheaders = dict((k,v) for k,v in req.headers.items()
1287                                   if k.lower() not in ("content-length", "content-type"))
1288                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1289                                                  headers=newheaders,
1290                                                  origin_req_host=req.get_origin_req_host(),
1291                                                  unverifiable=True))
1292
1293         # Build our opener
1294         opener = compat_urllib_request.OpenerDirector()
1295         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1296                         HTTPMethodFallback, HEADRedirectHandler,
1297                         compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1298             opener.add_handler(handler())
1299
1300         response = opener.open(HeadRequest(url))
1301         new_url = response.geturl()
1302
1303         if url == new_url:
1304             return False
1305
1306         self.report_following_redirect(new_url)
1307         self._downloader.download([new_url])
1308         return True
1309
1310     def _real_extract(self, url):
1311         if self._test_redirect(url): return
1312
1313         video_id = url.split('/')[-1]
1314         request = compat_urllib_request.Request(url)
1315         try:
1316             self.report_download_webpage(video_id)
1317             webpage = compat_urllib_request.urlopen(request).read()
1318         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1319             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1320             return
1321         except ValueError as err:
1322             # since this is the last-resort InfoExtractor, if
1323             # this error is thrown, it'll be thrown here
1324             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1325             return
1326
1327         self.report_extraction(video_id)
1328         # Start with something easy: JW Player in SWFObject
1329         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1330         if mobj is None:
1331             # Broaden the search a little bit
1332             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1333         if mobj is None:
1334             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1335             return
1336
1337         # It's possible that one of the regexes
1338         # matched, but returned an empty group:
1339         if mobj.group(1) is None:
1340             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1341             return
1342
1343         video_url = compat_urllib_parse.unquote(mobj.group(1))
1344         video_id = os.path.basename(video_url)
1345
1346         # here's a fun little line of code for you:
1347         video_extension = os.path.splitext(video_id)[1][1:]
1348         video_id = os.path.splitext(video_id)[0]
1349
1350         # it's tempting to parse this further, but you would
1351         # have to take into account all the variations like
1352         #   Video Title - Site Name
1353         #   Site Name | Video Title
1354         #   Video Title - Tagline | Site Name
1355         # and so on and so forth; it's just not practical
1356         mobj = re.search(r'<title>(.*)</title>', webpage)
1357         if mobj is None:
1358             self._downloader.trouble(u'ERROR: unable to extract title')
1359             return
1360         video_title = mobj.group(1)
1361
1362         # video uploader is domain name
1363         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1364         if mobj is None:
1365             self._downloader.trouble(u'ERROR: unable to extract title')
1366             return
1367         video_uploader = mobj.group(1)
1368
1369         return [{
1370             'id':       video_id,
1371             'url':      video_url,
1372             'uploader': video_uploader,
1373             'upload_date':  None,
1374             'title':    video_title,
1375             'ext':      video_extension,
1376         }]
1377
1378
1379 class YoutubeSearchIE(InfoExtractor):
1380     """Information Extractor for YouTube search queries."""
1381     _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1382     _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1383     _max_youtube_results = 1000
1384     IE_NAME = u'youtube:search'
1385
1386     def __init__(self, downloader=None):
1387         InfoExtractor.__init__(self, downloader)
1388
1389     def report_download_page(self, query, pagenum):
1390         """Report attempt to download search page with given number."""
1391         query = query.decode(preferredencoding())
1392         self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1393
1394     def _real_extract(self, query):
1395         mobj = re.match(self._VALID_URL, query)
1396         if mobj is None:
1397             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1398             return
1399
1400         prefix, query = query.split(':')
1401         prefix = prefix[8:]
1402         query = query.encode('utf-8')
1403         if prefix == '':
1404             self._download_n_results(query, 1)
1405             return
1406         elif prefix == 'all':
1407             self._download_n_results(query, self._max_youtube_results)
1408             return
1409         else:
1410             try:
1411                 n = int(prefix)
1412                 if n <= 0:
1413                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1414                     return
1415                 elif n > self._max_youtube_results:
1416                     self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1417                     n = self._max_youtube_results
1418                 self._download_n_results(query, n)
1419                 return
1420             except ValueError: # parsing prefix as integer fails
1421                 self._download_n_results(query, 1)
1422                 return
1423
1424     def _download_n_results(self, query, n):
1425         """Downloads a specified number of results for a query"""
1426
1427         video_ids = []
1428         pagenum = 0
1429         limit = n
1430
1431         while (50 * pagenum) < limit:
1432             self.report_download_page(query, pagenum+1)
1433             result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1434             request = compat_urllib_request.Request(result_url)
1435             try:
1436                 data = compat_urllib_request.urlopen(request).read()
1437             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1438                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1439                 return
1440             api_response = json.loads(data)['data']
1441
1442             new_ids = list(video['id'] for video in api_response['items'])
1443             video_ids += new_ids
1444
1445             limit = min(n, api_response['totalItems'])
1446             pagenum += 1
1447
1448         if len(video_ids) > n:
1449             video_ids = video_ids[:n]
1450         for id in video_ids:
1451             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1452         return
1453
1454
1455 class GoogleSearchIE(InfoExtractor):
1456     """Information Extractor for Google Video search queries."""
1457     _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1458     _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1459     _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1460     _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1461     _max_google_results = 1000
1462     IE_NAME = u'video.google:search'
1463
1464     def __init__(self, downloader=None):
1465         InfoExtractor.__init__(self, downloader)
1466
1467     def report_download_page(self, query, pagenum):
1468         """Report attempt to download playlist page with given number."""
1469         query = query.decode(preferredencoding())
1470         self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1471
1472     def _real_extract(self, query):
1473         mobj = re.match(self._VALID_URL, query)
1474         if mobj is None:
1475             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1476             return
1477
1478         prefix, query = query.split(':')
1479         prefix = prefix[8:]
1480         query = query.encode('utf-8')
1481         if prefix == '':
1482             self._download_n_results(query, 1)
1483             return
1484         elif prefix == 'all':
1485             self._download_n_results(query, self._max_google_results)
1486             return
1487         else:
1488             try:
1489                 n = int(prefix)
1490                 if n <= 0:
1491                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1492                     return
1493                 elif n > self._max_google_results:
1494                     self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1495                     n = self._max_google_results
1496                 self._download_n_results(query, n)
1497                 return
1498             except ValueError: # parsing prefix as integer fails
1499                 self._download_n_results(query, 1)
1500                 return
1501
1502     def _download_n_results(self, query, n):
1503         """Downloads a specified number of results for a query"""
1504
1505         video_ids = []
1506         pagenum = 0
1507
1508         while True:
1509             self.report_download_page(query, pagenum)
1510             result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1511             request = compat_urllib_request.Request(result_url)
1512             try:
1513                 page = compat_urllib_request.urlopen(request).read()
1514             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1515                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1516                 return
1517
1518             # Extract video identifiers
1519             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1520                 video_id = mobj.group(1)
1521                 if video_id not in video_ids:
1522                     video_ids.append(video_id)
1523                     if len(video_ids) == n:
1524                         # Specified n videos reached
1525                         for id in video_ids:
1526                             self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1527                         return
1528
1529             if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1530                 for id in video_ids:
1531                     self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1532                 return
1533
1534             pagenum = pagenum + 1
1535
1536
1537 class YahooSearchIE(InfoExtractor):
1538     """Information Extractor for Yahoo! Video search queries."""
1539
1540     _WORKING = False
1541     _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1542     _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1543     _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1544     _MORE_PAGES_INDICATOR = r'\s*Next'
1545     _max_yahoo_results = 1000
1546     IE_NAME = u'video.yahoo:search'
1547
1548     def __init__(self, downloader=None):
1549         InfoExtractor.__init__(self, downloader)
1550
1551     def report_download_page(self, query, pagenum):
1552         """Report attempt to download playlist page with given number."""
1553         query = query.decode(preferredencoding())
1554         self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1555
1556     def _real_extract(self, query):
1557         mobj = re.match(self._VALID_URL, query)
1558         if mobj is None:
1559             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1560             return
1561
1562         prefix, query = query.split(':')
1563         prefix = prefix[8:]
1564         query = query.encode('utf-8')
1565         if prefix == '':
1566             self._download_n_results(query, 1)
1567             return
1568         elif prefix == 'all':
1569             self._download_n_results(query, self._max_yahoo_results)
1570             return
1571         else:
1572             try:
1573                 n = int(prefix)
1574                 if n <= 0:
1575                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1576                     return
1577                 elif n > self._max_yahoo_results:
1578                     self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1579                     n = self._max_yahoo_results
1580                 self._download_n_results(query, n)
1581                 return
1582             except ValueError: # parsing prefix as integer fails
1583                 self._download_n_results(query, 1)
1584                 return
1585
1586     def _download_n_results(self, query, n):
1587         """Downloads a specified number of results for a query"""
1588
1589         video_ids = []
1590         already_seen = set()
1591         pagenum = 1
1592
1593         while True:
1594             self.report_download_page(query, pagenum)
1595             result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1596             request = compat_urllib_request.Request(result_url)
1597             try:
1598                 page = compat_urllib_request.urlopen(request).read()
1599             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1600                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1601                 return
1602
1603             # Extract video identifiers
1604             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1605                 video_id = mobj.group(1)
1606                 if video_id not in already_seen:
1607                     video_ids.append(video_id)
1608                     already_seen.add(video_id)
1609                     if len(video_ids) == n:
1610                         # Specified n videos reached
1611                         for id in video_ids:
1612                             self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1613                         return
1614
1615             if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1616                 for id in video_ids:
1617                     self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1618                 return
1619
1620             pagenum = pagenum + 1
1621
1622
1623 class YoutubePlaylistIE(InfoExtractor):
1624     """Information Extractor for YouTube playlists."""
1625
1626     _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
1627     _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1628     _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
1629     _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1630     IE_NAME = u'youtube:playlist'
1631
1632     def __init__(self, downloader=None):
1633         InfoExtractor.__init__(self, downloader)
1634
1635     def report_download_page(self, playlist_id, pagenum):
1636         """Report attempt to download playlist page with given number."""
1637         self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1638
1639     def _real_extract(self, url):
1640         # Extract playlist id
1641         mobj = re.match(self._VALID_URL, url)
1642         if mobj is None:
1643             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1644             return
1645
1646         # Single video case
1647         if mobj.group(3) is not None:
1648             self._downloader.download([mobj.group(3)])
1649             return
1650
1651         # Download playlist pages
1652         # prefix is 'p' as default for playlists but there are other types that need extra care
1653         playlist_prefix = mobj.group(1)
1654         if playlist_prefix == 'a':
1655             playlist_access = 'artist'
1656         else:
1657             playlist_prefix = 'p'
1658             playlist_access = 'view_play_list'
1659         playlist_id = mobj.group(2)
1660         video_ids = []
1661         pagenum = 1
1662
1663         while True:
1664             self.report_download_page(playlist_id, pagenum)
1665             url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1666             request = compat_urllib_request.Request(url)
1667             try:
1668                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1669             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1670                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1671                 return
1672
1673             # Extract video identifiers
1674             ids_in_page = []
1675             for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1676                 if mobj.group(1) not in ids_in_page:
1677                     ids_in_page.append(mobj.group(1))
1678             video_ids.extend(ids_in_page)
1679
1680             if self._MORE_PAGES_INDICATOR not in page:
1681                 break
1682             pagenum = pagenum + 1
1683
1684         total = len(video_ids)
1685
1686         playliststart = self._downloader.params.get('playliststart', 1) - 1
1687         playlistend = self._downloader.params.get('playlistend', -1)
1688         if playlistend == -1:
1689             video_ids = video_ids[playliststart:]
1690         else:
1691             video_ids = video_ids[playliststart:playlistend]
1692
1693         if len(video_ids) == total:
1694             self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1695         else:
1696             self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))
1697
1698         for id in video_ids:
1699             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1700         return
1701
1702
1703 class YoutubeChannelIE(InfoExtractor):
1704     """Information Extractor for YouTube channels."""
1705
1706     _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1707     _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1708     _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1709     IE_NAME = u'youtube:channel'
1710
1711     def report_download_page(self, channel_id, pagenum):
1712         """Report attempt to download channel page with given number."""
1713         self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1714
1715     def _real_extract(self, url):
1716         # Extract channel id
1717         mobj = re.match(self._VALID_URL, url)
1718         if mobj is None:
1719             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1720             return
1721
1722         # Download channel pages
1723         channel_id = mobj.group(1)
1724         video_ids = []
1725         pagenum = 1
1726
1727         while True:
1728             self.report_download_page(channel_id, pagenum)
1729             url = self._TEMPLATE_URL % (channel_id, pagenum)
1730             request = compat_urllib_request.Request(url)
1731             try:
1732                 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1733             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1734                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1735                 return
1736
1737             # Extract video identifiers
1738             ids_in_page = []
1739             for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1740                 if mobj.group(1) not in ids_in_page:
1741                     ids_in_page.append(mobj.group(1))
1742             video_ids.extend(ids_in_page)
1743
1744             if self._MORE_PAGES_INDICATOR not in page:
1745                 break
1746             pagenum = pagenum + 1
1747
1748         self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1749
1750         for id in video_ids:
1751             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1752         return
1753
1754
1755 class YoutubeUserIE(InfoExtractor):
1756     """Information Extractor for YouTube users."""
1757
1758     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1759     _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1760     _GDATA_PAGE_SIZE = 50
1761     _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1762     _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1763     IE_NAME = u'youtube:user'
1764
1765     def __init__(self, downloader=None):
1766         InfoExtractor.__init__(self, downloader)
1767
1768     def report_download_page(self, username, start_index):
1769         """Report attempt to download user page."""
1770         self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1771                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1772
1773     def _real_extract(self, url):
1774         # Extract username
1775         mobj = re.match(self._VALID_URL, url)
1776         if mobj is None:
1777             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1778             return
1779
1780         username = mobj.group(1)
1781
1782         # Download video ids using YouTube Data API. Result size per
1783         # query is limited (currently to 50 videos) so we need to query
1784         # page by page until there are no video ids - it means we got
1785         # all of them.
1786
1787         video_ids = []
1788         pagenum = 0
1789
1790         while True:
1791             start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1792             self.report_download_page(username, start_index)
1793
1794             request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1795
1796             try:
1797                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1798             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1799                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1800                 return
1801
1802             # Extract video identifiers
1803             ids_in_page = []
1804
1805             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1806                 if mobj.group(1) not in ids_in_page:
1807                     ids_in_page.append(mobj.group(1))
1808
1809             video_ids.extend(ids_in_page)
1810
1811             # A little optimization - if current page is not
1812             # "full", ie. does not contain PAGE_SIZE video ids then
1813             # we can assume that this page is the last one - there
1814             # are no more ids on further pages - no need to query
1815             # again.
1816
1817             if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1818                 break
1819
1820             pagenum += 1
1821
1822         all_ids_count = len(video_ids)
1823         playliststart = self._downloader.params.get('playliststart', 1) - 1
1824         playlistend = self._downloader.params.get('playlistend', -1)
1825
1826         if playlistend == -1:
1827             video_ids = video_ids[playliststart:]
1828         else:
1829             video_ids = video_ids[playliststart:playlistend]
1830
1831         self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1832                 (username, all_ids_count, len(video_ids)))
1833
1834         for video_id in video_ids:
1835             self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1836
1837
1838 class BlipTVUserIE(InfoExtractor):
1839     """Information Extractor for blip.tv users."""
1840
1841     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1842     _PAGE_SIZE = 12
1843     IE_NAME = u'blip.tv:user'
1844
1845     def __init__(self, downloader=None):
1846         InfoExtractor.__init__(self, downloader)
1847
1848     def report_download_page(self, username, pagenum):
1849         """Report attempt to download user page."""
1850         self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1851                 (self.IE_NAME, username, pagenum))
1852
1853     def _real_extract(self, url):
1854         # Extract username
1855         mobj = re.match(self._VALID_URL, url)
1856         if mobj is None:
1857             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1858             return
1859
1860         username = mobj.group(1)
1861
1862         page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1863
1864         request = compat_urllib_request.Request(url)
1865
1866         try:
1867             page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1868             mobj = re.search(r'data-users-id="([^"]+)"', page)
1869             page_base = page_base % mobj.group(1)
1870         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1871             self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1872             return
1873
1874
1875         # Download video ids using BlipTV Ajax calls. Result size per
1876         # query is limited (currently to 12 videos) so we need to query
1877         # page by page until there are no video ids - it means we got
1878         # all of them.
1879
1880         video_ids = []
1881         pagenum = 1
1882
1883         while True:
1884             self.report_download_page(username, pagenum)
1885
1886             request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )
1887
1888             try:
1889                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1890             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1891                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1892                 return
1893
1894             # Extract video identifiers
1895             ids_in_page = []
1896
1897             for mobj in re.finditer(r'href="/([^"]+)"', page):
1898                 if mobj.group(1) not in ids_in_page:
1899                     ids_in_page.append(unescapeHTML(mobj.group(1)))
1900
1901             video_ids.extend(ids_in_page)
1902
1903             # A little optimization - if current page is not
1904             # "full", ie. does not contain PAGE_SIZE video ids then
1905             # we can assume that this page is the last one - there
1906             # are no more ids on further pages - no need to query
1907             # again.
1908
1909             if len(ids_in_page) < self._PAGE_SIZE:
1910                 break
1911
1912             pagenum += 1
1913
1914         all_ids_count = len(video_ids)
1915         playliststart = self._downloader.params.get('playliststart', 1) - 1
1916         playlistend = self._downloader.params.get('playlistend', -1)
1917
1918         if playlistend == -1:
1919             video_ids = video_ids[playliststart:]
1920         else:
1921             video_ids = video_ids[playliststart:playlistend]
1922
1923         self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1924                 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1925
1926         for video_id in video_ids:
1927             self._downloader.download([u'http://blip.tv/'+video_id])
1928
1929
1930 class DepositFilesIE(InfoExtractor):
1931     """Information extractor for depositfiles.com"""
1932
1933     _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1934
1935     def report_download_webpage(self, file_id):
1936         """Report webpage download."""
1937         self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1938
1939     def report_extraction(self, file_id):
1940         """Report information extraction."""
1941         self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1942
1943     def _real_extract(self, url):
1944         file_id = url.split('/')[-1]
1945         # Rebuild url in english locale
1946         url = 'http://depositfiles.com/en/files/' + file_id
1947
1948         # Retrieve file webpage with 'Free download' button pressed
1949         free_download_indication = { 'gateway_result' : '1' }
1950         request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1951         try:
1952             self.report_download_webpage(file_id)
1953             webpage = compat_urllib_request.urlopen(request).read()
1954         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1955             self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1956             return
1957
1958         # Search for the real file URL
1959         mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1960         if (mobj is None) or (mobj.group(1) is None):
1961             # Try to figure out reason of the error.
1962             mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1963             if (mobj is not None) and (mobj.group(1) is not None):
1964                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1965                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1966             else:
1967                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1968             return
1969
1970         file_url = mobj.group(1)
1971         file_extension = os.path.splitext(file_url)[1][1:]
1972
1973         # Search for file title
1974         mobj = re.search(r'<b title="(.*?)">', webpage)
1975         if mobj is None:
1976             self._downloader.trouble(u'ERROR: unable to extract title')
1977             return
1978         file_title = mobj.group(1).decode('utf-8')
1979
1980         return [{
1981             'id':       file_id.decode('utf-8'),
1982             'url':      file_url.decode('utf-8'),
1983             'uploader': None,
1984             'upload_date':  None,
1985             'title':    file_title,
1986             'ext':      file_extension.decode('utf-8'),
1987         }]
1988
1989
1990 class FacebookIE(InfoExtractor):
1991     """Information Extractor for Facebook"""
1992
1993     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1994     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1995     _NETRC_MACHINE = 'facebook'
1996     IE_NAME = u'facebook'
1997
1998     def report_login(self):
1999         """Report attempt to log in."""
2000         self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
2001
2002     def _real_initialize(self):
2003         if self._downloader is None:
2004             return
2005
2006         useremail = None
2007         password = None
2008         downloader_params = self._downloader.params
2009
2010         # Attempt to use provided username and password or .netrc data
2011         if downloader_params.get('username', None) is not None:
2012             useremail = downloader_params['username']
2013             password = downloader_params['password']
2014         elif downloader_params.get('usenetrc', False):
2015             try:
2016                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2017                 if info is not None:
2018                     useremail = info[0]
2019                     password = info[2]
2020                 else:
2021                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2022             except (IOError, netrc.NetrcParseError) as err:
2023                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2024                 return
2025
2026         if useremail is None:
2027             return
2028
2029         # Log in
2030         login_form = {
2031             'email': useremail,
2032             'pass': password,
2033             'login': 'Log+In'
2034             }
2035         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2036         try:
2037             self.report_login()
2038             login_results = compat_urllib_request.urlopen(request).read()
2039             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2040                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2041                 return
2042         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2043             self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2044             return
2045
2046     def _real_extract(self, url):
2047         mobj = re.match(self._VALID_URL, url)
2048         if mobj is None:
2049             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2050             return
2051         video_id = mobj.group('ID')
2052
2053         url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2054         webpage = self._download_webpage(url, video_id)
2055
2056         BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
2057         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2058         m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2059         if not m:
2060             raise ExtractorError(u'Cannot parse data')
2061         data = dict(json.loads(m.group(1)))
2062         params_raw = compat_urllib_parse.unquote(data['params'])
2063         params = json.loads(params_raw)
2064         video_url = params['hd_src']
2065         video_duration = int(params['video_duration'])
2066
2067         m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2068         if not m:
2069             raise ExtractorError(u'Cannot find title in webpage')
2070         video_title = unescapeHTML(m.group(1))
2071
2072         info = {
2073             'id': video_id,
2074             'title': video_title,
2075             'url': video_url,
2076             'ext': 'mp4',
2077             'duration': video_duration,
2078             'thumbnail': params['thumbnail_src'],
2079         }
2080         return [info]
2081
2082
2083 class BlipTVIE(InfoExtractor):
2084     """Information extractor for blip.tv"""
2085
2086     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2087     _URL_EXT = r'^.*\.([a-z0-9]+)$'
2088     IE_NAME = u'blip.tv'
2089
2090     def report_extraction(self, file_id):
2091         """Report information extraction."""
2092         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2093
2094     def report_direct_download(self, title):
2095         """Report information extraction."""
2096         self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2097
2098     def _real_extract(self, url):
2099         mobj = re.match(self._VALID_URL, url)
2100         if mobj is None:
2101             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2102             return
2103
2104         if '?' in url:
2105             cchar = '&'
2106         else:
2107             cchar = '?'
2108         json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2109         request = compat_urllib_request.Request(json_url)
2110         request.add_header('User-Agent', 'iTunes/10.6.1')
2111         self.report_extraction(mobj.group(1))
2112         info = None
2113         try:
2114             urlh = compat_urllib_request.urlopen(request)
2115             if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2116                 basename = url.split('/')[-1]
2117                 title,ext = os.path.splitext(basename)
2118                 title = title.decode('UTF-8')
2119                 ext = ext.replace('.', '')
2120                 self.report_direct_download(title)
2121                 info = {
2122                     'id': title,
2123                     'url': url,
2124                     'uploader': None,
2125                     'upload_date': None,
2126                     'title': title,
2127                     'ext': ext,
2128                     'urlhandle': urlh
2129                 }
2130         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2131             raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2132         if info is None: # Regular URL
2133             try:
2134                 json_code_bytes = urlh.read()
2135                 json_code = json_code_bytes.decode('utf-8')
2136             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2137                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2138                 return
2139
2140             try:
2141                 json_data = json.loads(json_code)
2142                 if 'Post' in json_data:
2143                     data = json_data['Post']
2144                 else:
2145                     data = json_data
2146
2147                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2148                 video_url = data['media']['url']
2149                 umobj = re.match(self._URL_EXT, video_url)
2150                 if umobj is None:
2151                     raise ValueError('Can not determine filename extension')
2152                 ext = umobj.group(1)
2153
2154                 info = {
2155                     'id': data['item_id'],
2156                     'url': video_url,
2157                     'uploader': data['display_name'],
2158                     'upload_date': upload_date,
2159                     'title': data['title'],
2160                     'ext': ext,
2161                     'format': data['media']['mimeType'],
2162                     'thumbnail': data['thumbnailUrl'],
2163                     'description': data['description'],
2164                     'player_url': data['embedUrl'],
2165                     'user_agent': 'iTunes/10.6.1',
2166                 }
2167             except (ValueError,KeyError) as err:
2168                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2169                 return
2170
2171         return [info]
2172
2173
2174 class MyVideoIE(InfoExtractor):
2175     """Information Extractor for myvideo.de."""
2176
2177     _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2178     IE_NAME = u'myvideo'
2179
2180     def __init__(self, downloader=None):
2181         InfoExtractor.__init__(self, downloader)
2182
2183     def report_extraction(self, video_id):
2184         """Report information extraction."""
2185         self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2186
2187     def _real_extract(self,url):
2188         mobj = re.match(self._VALID_URL, url)
2189         if mobj is None:
2190             self._download.trouble(u'ERROR: invalid URL: %s' % url)
2191             return
2192
2193         video_id = mobj.group(1)
2194
2195         # Get video webpage
2196         webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2197         webpage = self._download_webpage(webpage_url, video_id)
2198
2199         self.report_extraction(video_id)
2200         mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2201                  webpage)
2202         if mobj is None:
2203             self._downloader.trouble(u'ERROR: unable to extract media URL')
2204             return
2205         video_url = mobj.group(1) + ('/%s.flv' % video_id)
2206
2207         mobj = re.search('<title>([^<]+)</title>', webpage)
2208         if mobj is None:
2209             self._downloader.trouble(u'ERROR: unable to extract title')
2210             return
2211
2212         video_title = mobj.group(1)
2213
2214         return [{
2215             'id':       video_id,
2216             'url':      video_url,
2217             'uploader': None,
2218             'upload_date':  None,
2219             'title':    video_title,
2220             'ext':      u'flv',
2221         }]
2222
2223 class ComedyCentralIE(InfoExtractor):
2224     """Information extractor for The Daily Show and Colbert Report """
2225
2226     # urls can be abbreviations like :thedailyshow or :colbert
2227     # urls for episodes like:
2228     # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2229     #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2230     #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2231     _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2232                       |(https?://)?(www\.)?
2233                           (?P<showname>thedailyshow|colbertnation)\.com/
2234                          (full-episodes/(?P<episode>.*)|
2235                           (?P<clip>
2236                               (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2237                               |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2238                      $"""
2239
2240     _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2241
2242     _video_extensions = {
2243         '3500': 'mp4',
2244         '2200': 'mp4',
2245         '1700': 'mp4',
2246         '1200': 'mp4',
2247         '750': 'mp4',
2248         '400': 'mp4',
2249     }
2250     _video_dimensions = {
2251         '3500': '1280x720',
2252         '2200': '960x540',
2253         '1700': '768x432',
2254         '1200': '640x360',
2255         '750': '512x288',
2256         '400': '384x216',
2257     }
2258
2259     def suitable(self, url):
2260         """Receives a URL and returns True if suitable for this IE."""
2261         return re.match(self._VALID_URL, url, re.VERBOSE) is not None
2262
2263     def report_extraction(self, episode_id):
2264         self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2265
2266     def report_config_download(self, episode_id, media_id):
2267         self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
2268
2269     def report_index_download(self, episode_id):
2270         self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2271
2272     def _print_formats(self, formats):
2273         print('Available formats:')
2274         for x in formats:
2275             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2276
2277
2278     def _real_extract(self, url):
2279         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2280         if mobj is None:
2281             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2282             return
2283
2284         if mobj.group('shortname'):
2285             if mobj.group('shortname') in ('tds', 'thedailyshow'):
2286                 url = u'http://www.thedailyshow.com/full-episodes/'
2287             else:
2288                 url = u'http://www.colbertnation.com/full-episodes/'
2289             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2290             assert mobj is not None
2291
2292         if mobj.group('clip'):
2293             if mobj.group('showname') == 'thedailyshow':
2294                 epTitle = mobj.group('tdstitle')
2295             else:
2296                 epTitle = mobj.group('cntitle')
2297             dlNewest = False
2298         else:
2299             dlNewest = not mobj.group('episode')
2300             if dlNewest:
2301                 epTitle = mobj.group('showname')
2302             else:
2303                 epTitle = mobj.group('episode')
2304
2305         req = compat_urllib_request.Request(url)
2306         self.report_extraction(epTitle)
2307         try:
2308             htmlHandle = compat_urllib_request.urlopen(req)
2309             html = htmlHandle.read()
2310             webpage = html.decode('utf-8')
2311         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2312             self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2313             return
2314         if dlNewest:
2315             url = htmlHandle.geturl()
2316             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2317             if mobj is None:
2318                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2319                 return
2320             if mobj.group('episode') == '':
2321                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2322                 return
2323             epTitle = mobj.group('episode')
2324
2325         mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2326
2327         if len(mMovieParams) == 0:
2328             # The Colbert Report embeds the information in a without
2329             # a URL prefix; so extract the alternate reference
2330             # and then add the URL prefix manually.
2331
2332             altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2333             if len(altMovieParams) == 0:
2334                 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2335                 return
2336             else:
2337                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2338
2339         uri = mMovieParams[0][1]
2340         indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2341         self.report_index_download(epTitle)
2342         try:
2343             indexXml = compat_urllib_request.urlopen(indexUrl).read()
2344         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2345             self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
2346             return
2347
2348         results = []
2349
2350         idoc = xml.etree.ElementTree.fromstring(indexXml)
2351         itemEls = idoc.findall('.//item')
2352         for partNum,itemEl in enumerate(itemEls):
2353             mediaId = itemEl.findall('./guid')[0].text
2354             shortMediaId = mediaId.split(':')[-1]
2355             showId = mediaId.split(':')[-2].replace('.com', '')
2356             officialTitle = itemEl.findall('./title')[0].text
2357             officialDate = itemEl.findall('./pubDate')[0].text
2358
2359             configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2360                         compat_urllib_parse.urlencode({'uri': mediaId}))
2361             configReq = compat_urllib_request.Request(configUrl)
2362             self.report_config_download(epTitle, shortMediaId)
2363             try:
2364                 configXml = compat_urllib_request.urlopen(configReq).read()
2365             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2366                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2367                 return
2368
2369             cdoc = xml.etree.ElementTree.fromstring(configXml)
2370             turls = []
2371             for rendition in cdoc.findall('.//rendition'):
2372                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2373                 turls.append(finfo)
2374
2375             if len(turls) == 0:
2376                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2377                 continue
2378
2379             if self._downloader.params.get('listformats', None):
2380                 self._print_formats([i[0] for i in turls])
2381                 return
2382
2383             # For now, just pick the highest bitrate
2384             format,rtmp_video_url = turls[-1]
2385
2386             # Get the format arg from the arg stream
2387             req_format = self._downloader.params.get('format', None)
2388
2389             # Select format if we can find one
2390             for f,v in turls:
2391                 if f == req_format:
2392                     format, rtmp_video_url = f, v
2393                     break
2394
2395             m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2396             if not m:
2397                 raise ExtractorError(u'Cannot transform RTMP url')
2398             base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2399             video_url = base + m.group('finalid')
2400
2401             effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2402             info = {
2403                 'id': shortMediaId,
2404                 'url': video_url,
2405                 'uploader': showId,
2406                 'upload_date': officialDate,
2407                 'title': effTitle,
2408                 'ext': 'mp4',
2409                 'format': format,
2410                 'thumbnail': None,
2411                 'description': officialTitle,
2412             }
2413             results.append(info)
2414
2415         return results
2416
2417
2418 class EscapistIE(InfoExtractor):
2419     """Information extractor for The Escapist """
2420
2421     _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2422     IE_NAME = u'escapist'
2423
2424     def report_extraction(self, showName):
2425         self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2426
2427     def report_config_download(self, showName):
2428         self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2429
2430     def _real_extract(self, url):
2431         mobj = re.match(self._VALID_URL, url)
2432         if mobj is None:
2433             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2434             return
2435         showName = mobj.group('showname')
2436         videoId = mobj.group('episode')
2437
2438         self.report_extraction(showName)
2439         try:
2440             webPage = compat_urllib_request.urlopen(url)
2441             webPageBytes = webPage.read()
2442             m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2443             webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2444         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2445             self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
2446             return
2447
2448         descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2449         description = unescapeHTML(descMatch.group(1))
2450         imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2451         imgUrl = unescapeHTML(imgMatch.group(1))
2452         playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2453         playerUrl = unescapeHTML(playerUrlMatch.group(1))
2454         configUrlMatch = re.search('config=(.*)$', playerUrl)
2455         configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2456
2457         self.report_config_download(showName)
2458         try:
2459             configJSON = compat_urllib_request.urlopen(configUrl)
2460             m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2461             configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2462         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2463             self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2464             return
2465
2466         # Technically, it's JavaScript, not JSON
2467         configJSON = configJSON.replace("'", '"')
2468
2469         try:
2470             config = json.loads(configJSON)
2471         except (ValueError,) as err:
2472             self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
2473             return
2474
2475         playlist = config['playlist']
2476         videoUrl = playlist[1]['url']
2477
2478         info = {
2479             'id': videoId,
2480             'url': videoUrl,
2481             'uploader': showName,
2482             'upload_date': None,
2483             'title': showName,
2484             'ext': 'flv',
2485             'thumbnail': imgUrl,
2486             'description': description,
2487             'player_url': playerUrl,
2488         }
2489
2490         return [info]
2491
2492 class CollegeHumorIE(InfoExtractor):
2493     """Information extractor for collegehumor.com"""
2494
2495     _WORKING = False
2496     _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2497     IE_NAME = u'collegehumor'
2498
2499     def report_manifest(self, video_id):
2500         """Report information extraction."""
2501         self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2502
2503     def report_extraction(self, video_id):
2504         """Report information extraction."""
2505         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2506
2507     def _real_extract(self, url):
2508         mobj = re.match(self._VALID_URL, url)
2509         if mobj is None:
2510             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2511             return
2512         video_id = mobj.group('videoid')
2513
2514         info = {
2515             'id': video_id,
2516             'uploader': None,
2517             'upload_date': None,
2518         }
2519
2520         self.report_extraction(video_id)
2521         xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2522         try:
2523             metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2524         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2525             self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2526             return
2527
2528         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2529         try:
2530             videoNode = mdoc.findall('./video')[0]
2531             info['description'] = videoNode.findall('./description')[0].text
2532             info['title'] = videoNode.findall('./caption')[0].text
2533             info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2534             manifest_url = videoNode.findall('./file')[0].text
2535         except IndexError:
2536             self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2537             return
2538
2539         manifest_url += '?hdcore=2.10.3'
2540         self.report_manifest(video_id)
2541         try:
2542             manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2543         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2544             self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2545             return
2546
2547         adoc = xml.etree.ElementTree.fromstring(manifestXml)
2548         try:
2549             media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2550             node_id = media_node.attrib['url']
2551             video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2552         except IndexError as err:
2553             self._downloader.trouble(u'\nERROR: Invalid manifest file')
2554             return
2555
2556         url_pr = compat_urllib_parse_urlparse(manifest_url)
2557         url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2558
2559         info['url'] = url
2560         info['ext'] = 'f4f'
2561         return [info]
2562
2563
2564 class XVideosIE(InfoExtractor):
2565     """Information extractor for xvideos.com"""
2566
2567     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2568     IE_NAME = u'xvideos'
2569
2570     def report_extraction(self, video_id):
2571         """Report information extraction."""
2572         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2573
2574     def _real_extract(self, url):
2575         mobj = re.match(self._VALID_URL, url)
2576         if mobj is None:
2577             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2578             return
2579         video_id = mobj.group(1)
2580
2581         webpage = self._download_webpage(url, video_id)
2582
2583         self.report_extraction(video_id)
2584
2585
2586         # Extract video URL
2587         mobj = re.search(r'flv_url=(.+?)&', webpage)
2588         if mobj is None:
2589             self._downloader.trouble(u'ERROR: unable to extract video url')
2590             return
2591         video_url = compat_urllib_parse.unquote(mobj.group(1))
2592
2593
2594         # Extract title
2595         mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2596         if mobj is None:
2597             self._downloader.trouble(u'ERROR: unable to extract video title')
2598             return
2599         video_title = mobj.group(1)
2600
2601
2602         # Extract video thumbnail
2603         mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2604         if mobj is None:
2605             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2606             return
2607         video_thumbnail = mobj.group(0)
2608
2609         info = {
2610             'id': video_id,
2611             'url': video_url,
2612             'uploader': None,
2613             'upload_date': None,
2614             'title': video_title,
2615             'ext': 'flv',
2616             'thumbnail': video_thumbnail,
2617             'description': None,
2618         }
2619
2620         return [info]
2621
2622
2623 class SoundcloudIE(InfoExtractor):
2624     """Information extractor for soundcloud.com
2625        To access the media, the uid of the song and a stream token
2626        must be extracted from the page source and the script must make
2627        a request to media.soundcloud.com/crossdomain.xml. Then
2628        the media can be grabbed by requesting from an url composed
2629        of the stream token and uid
2630      """
2631
2632     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2633     IE_NAME = u'soundcloud'
2634
2635     def __init__(self, downloader=None):
2636         InfoExtractor.__init__(self, downloader)
2637
2638     def report_resolve(self, video_id):
2639         """Report information extraction."""
2640         self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2641
2642     def report_extraction(self, video_id):
2643         """Report information extraction."""
2644         self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2645
2646     def _real_extract(self, url):
2647         mobj = re.match(self._VALID_URL, url)
2648         if mobj is None:
2649             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2650             return
2651
2652         # extract uploader (which is in the url)
2653         uploader = mobj.group(1)
2654         # extract simple title (uploader + slug of song title)
2655         slug_title =  mobj.group(2)
2656         simple_title = uploader + u'-' + slug_title
2657
2658         self.report_resolve('%s/%s' % (uploader, slug_title))
2659
2660         url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2661         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2662         request = compat_urllib_request.Request(resolv_url)
2663         try:
2664             info_json_bytes = compat_urllib_request.urlopen(request).read()
2665             info_json = info_json_bytes.decode('utf-8')
2666         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2667             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2668             return
2669
2670         info = json.loads(info_json)
2671         video_id = info['id']
2672         self.report_extraction('%s/%s' % (uploader, slug_title))
2673
2674         streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2675         request = compat_urllib_request.Request(streams_url)
2676         try:
2677             stream_json_bytes = compat_urllib_request.urlopen(request).read()
2678             stream_json = stream_json_bytes.decode('utf-8')
2679         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2680             self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2681             return
2682
2683         streams = json.loads(stream_json)
2684         mediaURL = streams['http_mp3_128_url']
2685
2686         return [{
2687             'id':       info['id'],
2688             'url':      mediaURL,
2689             'uploader': info['user']['username'],
2690             'upload_date':  info['created_at'],
2691             'title':    info['title'],
2692             'ext':      u'mp3',
2693             'description': info['description'],
2694         }]
2695
2696
2697 class InfoQIE(InfoExtractor):
2698     """Information extractor for infoq.com"""
2699     _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2700
2701     def report_extraction(self, video_id):
2702         """Report information extraction."""
2703         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2704
2705     def _real_extract(self, url):
2706         mobj = re.match(self._VALID_URL, url)
2707         if mobj is None:
2708             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2709             return
2710
2711         webpage = self._download_webpage(url, video_id=url)
2712         self.report_extraction(url)
2713
2714         # Extract video URL
2715         mobj = re.search(r"jsclassref='([^']*)'", webpage)
2716         if mobj is None:
2717             self._downloader.trouble(u'ERROR: unable to extract video url')
2718             return
2719         real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2720         video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2721
2722         # Extract title
2723         mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2724         if mobj is None:
2725             self._downloader.trouble(u'ERROR: unable to extract video title')
2726             return
2727         video_title = mobj.group(1)
2728
2729         # Extract description
2730         video_description = u'No description available.'
2731         mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2732         if mobj is not None:
2733             video_description = mobj.group(1)
2734
2735         video_filename = video_url.split('/')[-1]
2736         video_id, extension = video_filename.split('.')
2737
2738         info = {
2739             'id': video_id,
2740             'url': video_url,
2741             'uploader': None,
2742             'upload_date': None,
2743             'title': video_title,
2744             'ext': extension, # Extension is always(?) mp4, but seems to be flv
2745             'thumbnail': None,
2746             'description': video_description,
2747         }
2748
2749         return [info]
2750
2751 class MixcloudIE(InfoExtractor):
2752     """Information extractor for www.mixcloud.com"""
2753
2754     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2755     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2756     IE_NAME = u'mixcloud'
2757
2758     def __init__(self, downloader=None):
2759         InfoExtractor.__init__(self, downloader)
2760
2761     def report_download_json(self, file_id):
2762         """Report JSON download."""
2763         self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2764
2765     def report_extraction(self, file_id):
2766         """Report information extraction."""
2767         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2768
2769     def get_urls(self, jsonData, fmt, bitrate='best'):
2770         """Get urls from 'audio_formats' section in json"""
2771         file_url = None
2772         try:
2773             bitrate_list = jsonData[fmt]
2774             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2775                 bitrate = max(bitrate_list) # select highest
2776
2777             url_list = jsonData[fmt][bitrate]
2778         except TypeError: # we have no bitrate info.
2779             url_list = jsonData[fmt]
2780         return url_list
2781
2782     def check_urls(self, url_list):
2783         """Returns 1st active url from list"""
2784         for url in url_list:
2785             try:
2786                 compat_urllib_request.urlopen(url)
2787                 return url
2788             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2789                 url = None
2790
2791         return None
2792
2793     def _print_formats(self, formats):
2794         print('Available formats:')
2795         for fmt in formats.keys():
2796             for b in formats[fmt]:
2797                 try:
2798                     ext = formats[fmt][b][0]
2799                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2800                 except TypeError: # we have no bitrate info
2801                     ext = formats[fmt][0]
2802                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2803                     break
2804
2805     def _real_extract(self, url):
2806         mobj = re.match(self._VALID_URL, url)
2807         if mobj is None:
2808             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2809             return
2810         # extract uploader & filename from url
2811         uploader = mobj.group(1).decode('utf-8')
2812         file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2813
2814         # construct API request
2815         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2816         # retrieve .json file with links to files
2817         request = compat_urllib_request.Request(file_url)
2818         try:
2819             self.report_download_json(file_url)
2820             jsonData = compat_urllib_request.urlopen(request).read()
2821         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2822             self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2823             return
2824
2825         # parse JSON
2826         json_data = json.loads(jsonData)
2827         player_url = json_data['player_swf_url']
2828         formats = dict(json_data['audio_formats'])
2829
2830         req_format = self._downloader.params.get('format', None)
2831         bitrate = None
2832
2833         if self._downloader.params.get('listformats', None):
2834             self._print_formats(formats)
2835             return
2836
2837         if req_format is None or req_format == 'best':
2838             for format_param in formats.keys():
2839                 url_list = self.get_urls(formats, format_param)
2840                 # check urls
2841                 file_url = self.check_urls(url_list)
2842                 if file_url is not None:
2843                     break # got it!
2844         else:
2845             if req_format not in formats:
2846                 self._downloader.trouble(u'ERROR: format is not available')
2847                 return
2848
2849             url_list = self.get_urls(formats, req_format)
2850             file_url = self.check_urls(url_list)
2851             format_param = req_format
2852
2853         return [{
2854             'id': file_id.decode('utf-8'),
2855             'url': file_url.decode('utf-8'),
2856             'uploader': uploader.decode('utf-8'),
2857             'upload_date': None,
2858             'title': json_data['name'],
2859             'ext': file_url.split('.')[-1].decode('utf-8'),
2860             'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2861             'thumbnail': json_data['thumbnail_url'],
2862             'description': json_data['description'],
2863             'player_url': player_url.decode('utf-8'),
2864         }]
2865
2866 class StanfordOpenClassroomIE(InfoExtractor):
2867     """Information extractor for Stanford's Open ClassRoom"""
2868
2869     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2870     IE_NAME = u'stanfordoc'
2871
2872     def report_download_webpage(self, objid):
2873         """Report information extraction."""
2874         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2875
2876     def report_extraction(self, video_id):
2877         """Report information extraction."""
2878         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2879
2880     def _real_extract(self, url):
2881         mobj = re.match(self._VALID_URL, url)
2882         if mobj is None:
2883             raise ExtractorError(u'Invalid URL: %s' % url)
2884
2885         if mobj.group('course') and mobj.group('video'): # A specific video
2886             course = mobj.group('course')
2887             video = mobj.group('video')
2888             info = {
2889                 'id': course + '_' + video,
2890                 'uploader': None,
2891                 'upload_date': None,
2892             }
2893
2894             self.report_extraction(info['id'])
2895             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2896             xmlUrl = baseUrl + video + '.xml'
2897             try:
2898                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2899             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2900                 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2901                 return
2902             mdoc = xml.etree.ElementTree.fromstring(metaXml)
2903             try:
2904                 info['title'] = mdoc.findall('./title')[0].text
2905                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2906             except IndexError:
2907                 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2908                 return
2909             info['ext'] = info['url'].rpartition('.')[2]
2910             return [info]
2911         elif mobj.group('course'): # A course page
2912             course = mobj.group('course')
2913             info = {
2914                 'id': course,
2915                 'type': 'playlist',
2916                 'uploader': None,
2917                 'upload_date': None,
2918             }
2919
2920             coursepage = self._download_webpage(url, info['id'],
2921                                         note='Downloading course info page',
2922                                         errnote='Unable to download course info page')
2923
2924             m = re.search('<h1>([^<]+)</h1>', coursepage)
2925             if m:
2926                 info['title'] = unescapeHTML(m.group(1))
2927             else:
2928                 info['title'] = info['id']
2929
2930             m = re.search('<description>([^<]+)</description>', coursepage)
2931             if m:
2932                 info['description'] = unescapeHTML(m.group(1))
2933
2934             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2935             info['list'] = [
2936                 {
2937                     'type': 'reference',
2938                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2939                 }
2940                     for vpage in links]
2941             results = []
2942             for entry in info['list']:
2943                 assert entry['type'] == 'reference'
2944                 results += self.extract(entry['url'])
2945             return results
2946         else: # Root page
2947             info = {
2948                 'id': 'Stanford OpenClassroom',
2949                 'type': 'playlist',
2950                 'uploader': None,
2951                 'upload_date': None,
2952             }
2953
2954             self.report_download_webpage(info['id'])
2955             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2956             try:
2957                 rootpage = compat_urllib_request.urlopen(rootURL).read()
2958             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2959                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
2960                 return
2961
2962             info['title'] = info['id']
2963
2964             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2965             info['list'] = [
2966                 {
2967                     'type': 'reference',
2968                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2969                 }
2970                     for cpage in links]
2971
2972             results = []
2973             for entry in info['list']:
2974                 assert entry['type'] == 'reference'
2975                 results += self.extract(entry['url'])
2976             return results
2977
2978 class MTVIE(InfoExtractor):
2979     """Information extractor for MTV.com"""
2980
2981     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2982     IE_NAME = u'mtv'
2983
2984     def report_extraction(self, video_id):
2985         """Report information extraction."""
2986         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2987
2988     def _real_extract(self, url):
2989         mobj = re.match(self._VALID_URL, url)
2990         if mobj is None:
2991             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2992             return
2993         if not mobj.group('proto'):
2994             url = 'http://' + url
2995         video_id = mobj.group('videoid')
2996
2997         webpage = self._download_webpage(url, video_id)
2998
2999         mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3000         if mobj is None:
3001             self._downloader.trouble(u'ERROR: unable to extract song name')
3002             return
3003         song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3004         mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3005         if mobj is None:
3006             self._downloader.trouble(u'ERROR: unable to extract performer')
3007             return
3008         performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3009         video_title = performer + ' - ' + song_name
3010
3011         mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3012         if mobj is None:
3013             self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3014             return
3015         mtvn_uri = mobj.group(1)
3016
3017         mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3018         if mobj is None:
3019             self._downloader.trouble(u'ERROR: unable to extract content id')
3020             return
3021         content_id = mobj.group(1)
3022
3023         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3024         self.report_extraction(video_id)
3025         request = compat_urllib_request.Request(videogen_url)
3026         try:
3027             metadataXml = compat_urllib_request.urlopen(request).read()
3028         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3029             self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3030             return
3031
3032         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3033         renditions = mdoc.findall('.//rendition')
3034
3035         # For now, always pick the highest quality.
3036         rendition = renditions[-1]
3037
3038         try:
3039             _,_,ext = rendition.attrib['type'].partition('/')
3040             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3041             video_url = rendition.find('./src').text
3042         except KeyError:
3043             self._downloader.trouble('Invalid rendition field.')
3044             return
3045
3046         info = {
3047             'id': video_id,
3048             'url': video_url,
3049             'uploader': performer,
3050             'upload_date': None,
3051             'title': video_title,
3052             'ext': ext,
3053             'format': format,
3054         }
3055
3056         return [info]
3057
3058
3059 class YoukuIE(InfoExtractor):
3060     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3061
3062     def report_download_webpage(self, file_id):
3063         """Report webpage download."""
3064         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3065
3066     def report_extraction(self, file_id):
3067         """Report information extraction."""
3068         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3069
3070     def _gen_sid(self):
3071         nowTime = int(time.time() * 1000)
3072         random1 = random.randint(1000,1998)
3073         random2 = random.randint(1000,9999)
3074
3075         return "%d%d%d" %(nowTime,random1,random2)
3076
3077     def _get_file_ID_mix_string(self, seed):
3078         mixed = []
3079         source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3080         seed = float(seed)
3081         for i in range(len(source)):
3082             seed  =  (seed * 211 + 30031 ) % 65536
3083             index  =  math.floor(seed / 65536 * len(source) )
3084             mixed.append(source[int(index)])
3085             source.remove(source[int(index)])
3086         #return ''.join(mixed)
3087         return mixed
3088
3089     def _get_file_id(self, fileId, seed):
3090         mixed = self._get_file_ID_mix_string(seed)
3091         ids = fileId.split('*')
3092         realId = []
3093         for ch in ids:
3094             if ch:
3095                 realId.append(mixed[int(ch)])
3096         return ''.join(realId)
3097
3098     def _real_extract(self, url):
3099         mobj = re.match(self._VALID_URL, url)
3100         if mobj is None:
3101             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3102             return
3103         video_id = mobj.group('ID')
3104
3105         info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3106
3107         request = compat_urllib_request.Request(info_url, None, std_headers)
3108         try:
3109             self.report_download_webpage(video_id)
3110             jsondata = compat_urllib_request.urlopen(request).read()
3111         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3112             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3113             return
3114
3115         self.report_extraction(video_id)
3116         try:
3117             jsonstr = jsondata.decode('utf-8')
3118             config = json.loads(jsonstr)
3119
3120             video_title =  config['data'][0]['title']
3121             seed = config['data'][0]['seed']
3122
3123             format = self._downloader.params.get('format', None)
3124             supported_format = list(config['data'][0]['streamfileids'].keys())
3125
3126             if format is None or format == 'best':
3127                 if 'hd2' in supported_format:
3128                     format = 'hd2'
3129                 else:
3130                     format = 'flv'
3131                 ext = u'flv'
3132             elif format == 'worst':
3133                 format = 'mp4'
3134                 ext = u'mp4'
3135             else:
3136                 format = 'flv'
3137                 ext = u'flv'
3138
3139
3140             fileid = config['data'][0]['streamfileids'][format]
3141             keys = [s['k'] for s in config['data'][0]['segs'][format]]
3142         except (UnicodeDecodeError, ValueError, KeyError):
3143             self._downloader.trouble(u'ERROR: unable to extract info section')
3144             return
3145
3146         files_info=[]
3147         sid = self._gen_sid()
3148         fileid = self._get_file_id(fileid, seed)
3149
3150         #column 8,9 of fileid represent the segment number
3151         #fileid[7:9] should be changed
3152         for index, key in enumerate(keys):
3153
3154             temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3155             download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3156
3157             info = {
3158                 'id': '%s_part%02d' % (video_id, index),
3159                 'url': download_url,
3160                 'uploader': None,
3161                 'upload_date': None,
3162                 'title': video_title,
3163                 'ext': ext,
3164             }
3165             files_info.append(info)
3166
3167         return files_info
3168
3169
3170 class XNXXIE(InfoExtractor):
3171     """Information extractor for xnxx.com"""
3172
3173     _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3174     IE_NAME = u'xnxx'
3175     VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3176     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3177     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3178
3179     def report_webpage(self, video_id):
3180         """Report information extraction"""
3181         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3182
3183     def report_extraction(self, video_id):
3184         """Report information extraction"""
3185         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3186
3187     def _real_extract(self, url):
3188         mobj = re.match(self._VALID_URL, url)
3189         if mobj is None:
3190             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3191             return
3192         video_id = mobj.group(1)
3193
3194         self.report_webpage(video_id)
3195
3196         # Get webpage content
3197         try:
3198             webpage_bytes = compat_urllib_request.urlopen(url).read()
3199             webpage = webpage_bytes.decode('utf-8')
3200         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3201             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3202             return
3203
3204         result = re.search(self.VIDEO_URL_RE, webpage)
3205         if result is None:
3206             self._downloader.trouble(u'ERROR: unable to extract video url')
3207             return
3208         video_url = compat_urllib_parse.unquote(result.group(1))
3209
3210         result = re.search(self.VIDEO_TITLE_RE, webpage)
3211         if result is None:
3212             self._downloader.trouble(u'ERROR: unable to extract video title')
3213             return
3214         video_title = result.group(1)
3215
3216         result = re.search(self.VIDEO_THUMB_RE, webpage)
3217         if result is None:
3218             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3219             return
3220         video_thumbnail = result.group(1)
3221
3222         return [{
3223             'id': video_id,
3224             'url': video_url,
3225             'uploader': None,
3226             'upload_date': None,
3227             'title': video_title,
3228             'ext': 'flv',
3229             'thumbnail': video_thumbnail,
3230             'description': None,
3231         }]
3232
3233
3234 class GooglePlusIE(InfoExtractor):
3235     """Information extractor for plus.google.com."""
3236
3237     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3238     IE_NAME = u'plus.google'
3239
3240     def __init__(self, downloader=None):
3241         InfoExtractor.__init__(self, downloader)
3242
3243     def report_extract_entry(self, url):
3244         """Report downloading extry"""
3245         self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3246
3247     def report_date(self, upload_date):
3248         """Report downloading extry"""
3249         self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3250
3251     def report_uploader(self, uploader):
3252         """Report downloading extry"""
3253         self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3254
3255     def report_title(self, video_title):
3256         """Report downloading extry"""
3257         self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3258
3259     def report_extract_vid_page(self, video_page):
3260         """Report information extraction."""
3261         self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3262
3263     def _real_extract(self, url):
3264         # Extract id from URL
3265         mobj = re.match(self._VALID_URL, url)
3266         if mobj is None:
3267             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3268             return
3269
3270         post_url = mobj.group(0)
3271         video_id = mobj.group(1)
3272
3273         video_extension = 'flv'
3274
3275         # Step 1, Retrieve post webpage to extract further information
3276         self.report_extract_entry(post_url)
3277         request = compat_urllib_request.Request(post_url)
3278         try:
3279             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3280         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3281             self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3282             return
3283
3284         # Extract update date
3285         upload_date = None
3286         pattern = 'title="Timestamp">(.*?)</a>'
3287         mobj = re.search(pattern, webpage)
3288         if mobj:
3289             upload_date = mobj.group(1)
3290             # Convert timestring to a format suitable for filename
3291             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3292             upload_date = upload_date.strftime('%Y%m%d')
3293         self.report_date(upload_date)
3294
3295         # Extract uploader
3296         uploader = None
3297         pattern = r'rel\="author".*?>(.*?)</a>'
3298         mobj = re.search(pattern, webpage)
3299         if mobj:
3300             uploader = mobj.group(1)
3301         self.report_uploader(uploader)
3302
3303         # Extract title
3304         # Get the first line for title
3305         video_title = u'NA'
3306         pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3307         mobj = re.search(pattern, webpage)
3308         if mobj:
3309             video_title = mobj.group(1)
3310         self.report_title(video_title)
3311
3312         # Step 2, Stimulate clicking the image box to launch video
3313         pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3314         mobj = re.search(pattern, webpage)
3315         if mobj is None:
3316             self._downloader.trouble(u'ERROR: unable to extract video page URL')
3317
3318         video_page = mobj.group(1)
3319         request = compat_urllib_request.Request(video_page)
3320         try:
3321             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3322         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3323             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3324             return
3325         self.report_extract_vid_page(video_page)
3326
3327
3328         # Extract video links on video page
3329         """Extract video links of all sizes"""
3330         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3331         mobj = re.findall(pattern, webpage)
3332         if len(mobj) == 0:
3333             self._downloader.trouble(u'ERROR: unable to extract video links')
3334
3335         # Sort in resolution
3336         links = sorted(mobj)
3337
3338         # Choose the lowest of the sort, i.e. highest resolution
3339         video_url = links[-1]
3340         # Only get the url. The resolution part in the tuple has no use anymore
3341         video_url = video_url[-1]
3342         # Treat escaped \u0026 style hex
3343         try:
3344             video_url = video_url.decode("unicode_escape")
3345         except AttributeError: # Python 3
3346             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3347
3348
3349         return [{
3350             'id':       video_id,
3351             'url':      video_url,
3352             'uploader': uploader,
3353             'upload_date':  upload_date,
3354             'title':    video_title,
3355             'ext':      video_extension,
3356         }]
3357
3358 class NBAIE(InfoExtractor):
3359     _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3360     IE_NAME = u'nba'
3361
3362     def _real_extract(self, url):
3363         mobj = re.match(self._VALID_URL, url)
3364         if mobj is None:
3365             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3366             return
3367
3368         video_id = mobj.group(1)
3369         if video_id.endswith('/index.html'):
3370             video_id = video_id[:-len('/index.html')]
3371
3372         webpage = self._download_webpage(url, video_id)
3373
3374         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3375         def _findProp(rexp, default=None):
3376             m = re.search(rexp, webpage)
3377             if m:
3378                 return unescapeHTML(m.group(1))
3379             else:
3380                 return default
3381
3382         shortened_video_id = video_id.rpartition('/')[2]
3383         title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3384         info = {
3385             'id': shortened_video_id,
3386             'url': video_url,
3387             'ext': 'mp4',
3388             'title': title,
3389             'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3390             'description': _findProp(r'<div class="description">(.*?)</h1>'),
3391         }
3392         return [info]
3393
3394 class JustinTVIE(InfoExtractor):
3395     """Information extractor for justin.tv and twitch.tv"""
3396     # TODO: One broadcast may be split into multiple videos. The key
3397     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3398     # starts at 1 and increases. Can we treat all parts as one video?
3399
3400     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3401         ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3402     _JUSTIN_PAGE_LIMIT = 100
3403     IE_NAME = u'justin.tv'
3404
3405     def report_extraction(self, file_id):
3406         """Report information extraction."""
3407         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3408
3409     def report_download_page(self, channel, offset):
3410         """Report attempt to download a single page of videos."""
3411         self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3412                 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3413
3414     # Return count of items, list of *valid* items
3415     def _parse_page(self, url):
3416         try:
3417             urlh = compat_urllib_request.urlopen(url)
3418             webpage_bytes = urlh.read()
3419             webpage = webpage_bytes.decode('utf-8', 'ignore')
3420         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3421             self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
3422             return
3423
3424         response = json.loads(webpage)
3425         if type(response) != list:
3426             error_text = response.get('error', 'unknown error')
3427             self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
3428             return
3429         info = []
3430         for clip in response:
3431             video_url = clip['video_file_url']
3432             if video_url:
3433                 video_extension = os.path.splitext(video_url)[1][1:]
3434                 video_date = re.sub('-', '', clip['start_time'][:10])
3435                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3436                 video_id = clip['id']
3437                 video_title = clip.get('title', video_id)
3438                 info.append({
3439                     'id': video_id,
3440                     'url': video_url,
3441                     'title': video_title,
3442                     'uploader': clip.get('channel_name', video_uploader_id),
3443                     'uploader_id': video_uploader_id,
3444                     'upload_date': video_date,
3445                     'ext': video_extension,
3446                 })
3447         return (len(response), info)
3448
3449     def _real_extract(self, url):
3450         mobj = re.match(self._VALID_URL, url)
3451         if mobj is None:
3452             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3453             return
3454
3455         api = 'http://api.justin.tv'
3456         video_id = mobj.group(mobj.lastindex)
3457         paged = False
3458         if mobj.lastindex == 1:
3459             paged = True
3460             api += '/channel/archives/%s.json'
3461         else:
3462             api += '/broadcast/by_archive/%s.json'
3463         api = api % (video_id,)
3464
3465         self.report_extraction(video_id)
3466
3467         info = []
3468         offset = 0
3469         limit = self._JUSTIN_PAGE_LIMIT
3470         while True:
3471             if paged:
3472                 self.report_download_page(video_id, offset)
3473             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3474             page_count, page_info = self._parse_page(page_url)
3475             info.extend(page_info)
3476             if not paged or page_count != limit:
3477                 break
3478             offset += limit
3479         return info
3480
3481 class FunnyOrDieIE(InfoExtractor):
3482     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3483
3484     def _real_extract(self, url):
3485         mobj = re.match(self._VALID_URL, url)
3486         if mobj is None:
3487             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3488             return
3489
3490         video_id = mobj.group('id')
3491         webpage = self._download_webpage(url, video_id)
3492
3493         m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3494         if not m:
3495             self._downloader.trouble(u'ERROR: unable to find video information')
3496         video_url = unescapeHTML(m.group('url'))
3497
3498         m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3499         if not m:
3500             self._downloader.trouble(u'Cannot find video title')
3501         title = unescapeHTML(m.group('title'))
3502
3503         m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3504         if m:
3505             desc = unescapeHTML(m.group('desc'))
3506         else:
3507             desc = None
3508
3509         info = {
3510             'id': video_id,
3511             'url': video_url,
3512             'ext': 'mp4',
3513             'title': title,
3514             'description': desc,
3515         }
3516         return [info]
3517
3518 class TweetReelIE(InfoExtractor):
3519     _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
3520
3521     def _real_extract(self, url):
3522         mobj = re.match(self._VALID_URL, url)
3523         if mobj is None:
3524             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3525             return
3526
3527         video_id = mobj.group('id')
3528         webpage = self._download_webpage(url, video_id)
3529
3530         m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
3531         if not m:
3532             self._downloader.trouble(u'ERROR: Cannot find status ID')
3533         status_id = m.group(1)
3534
3535         m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
3536         if not m:
3537             self._downloader.trouble(u'WARNING: Cannot find description')
3538         desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
3539
3540         m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
3541         if not m:
3542             self._downloader.trouble(u'ERROR: Cannot find uploader')
3543         uploader = unescapeHTML(m.group('uploader'))
3544         uploader_id = unescapeHTML(m.group('uploader_id'))
3545
3546         m = re.search(r'<span unixtime="([0-9]+)"', webpage)
3547         if not m:
3548             self._downloader.trouble(u'ERROR: Cannot find upload date')
3549         upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')
3550
3551         title = desc
3552         video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
3553
3554         info = {
3555             'id': video_id,
3556             'url': video_url,
3557             'ext': 'mov',
3558             'title': title,
3559             'description': desc,
3560             'uploader': uploader,
3561             'uploader_id': uploader_id,
3562             'internal_id': status_id,
3563             'upload_date': upload_date
3564         }
3565         return [info]
3566
3567 class SteamIE(InfoExtractor):
3568     _VALID_URL = r"""http://store.steampowered.com/
3569                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3570                 (?P<gameID>\d+)/?
3571                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3572                 """
3573
3574     def suitable(self, url):
3575         """Receives a URL and returns True if suitable for this IE."""
3576         return re.match(self._VALID_URL, url, re.VERBOSE) is not None
3577
3578     def _real_extract(self, url):
3579         m = re.match(self._VALID_URL, url, re.VERBOSE)
3580         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3581         gameID = m.group('gameID')
3582         videourl = 'http://store.steampowered.com/video/%s/' % gameID
3583         webpage = self._download_webpage(videourl, gameID)
3584         mweb = re.finditer(urlRE, webpage)
3585         namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3586         titles = re.finditer(namesRE, webpage)
3587         videos = []
3588         for vid,vtitle in zip(mweb,titles):
3589             video_id = vid.group('videoID')
3590             title = vtitle.group('videoName')
3591             video_url = vid.group('videoURL')
3592             if not video_url:
3593                 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3594             info = {
3595                 'id':video_id,
3596                 'url':video_url,
3597                 'ext': 'flv',
3598                 'title': unescapeHTML(title)
3599                   }
3600             videos.append(info)
3601         return videos
3602
3603 class UstreamIE(InfoExtractor):
3604     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3605     IE_NAME = u'ustream'
3606
3607     def _real_extract(self, url):
3608         m = re.match(self._VALID_URL, url)
3609         video_id = m.group('videoID')
3610         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3611         webpage = self._download_webpage(url, video_id)
3612         m = re.search(r'data-title="(?P<title>.+)"',webpage)
3613         title = m.group('title')
3614         m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3615         uploader = m.group('uploader')
3616         info = {
3617                 'id':video_id,
3618                 'url':video_url,
3619                 'ext': 'flv',
3620                 'title': title,
3621                 'uploader': uploader
3622                   }
3623         return [info]
3624
3625 class RBMARadioIE(InfoExtractor):
3626     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3627
3628     def _real_extract(self, url):
3629         m = re.match(self._VALID_URL, url)
3630         video_id = m.group('videoID')
3631
3632         webpage = self._download_webpage(url, video_id)
3633         m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3634         if not m:
3635             raise ExtractorError(u'Cannot find metadata')
3636         json_data = m.group(1)
3637
3638         try:
3639             data = json.loads(json_data)
3640         except ValueError as e:
3641             raise ExtractorError(u'Invalid JSON: ' + str(e))
3642
3643         video_url = data['akamai_url'] + '&cbr=256'
3644         url_parts = compat_urllib_parse_urlparse(video_url)
3645         video_ext = url_parts.path.rpartition('.')[2]
3646         info = {
3647                 'id': video_id,
3648                 'url': video_url,
3649                 'ext': video_ext,
3650                 'title': data['title'],
3651                 'description': data.get('teaser_text'),
3652                 'location': data.get('country_of_origin'),
3653                 'uploader': data.get('host', {}).get('name'),
3654                 'uploader_id': data.get('host', {}).get('slug'),
3655                 'thumbnail': data.get('image', {}).get('large_url_2x'),
3656                 'duration': data.get('duration'),
3657         }
3658         return [info]
3659
3660
3661 class YouPornIE(InfoExtractor):
3662     """Information extractor for youporn.com."""
3663     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3664
3665     def _print_formats(self, formats):
3666         """Print all available formats"""
3667         print(u'Available formats:')
3668         print(u'ext\t\tformat')
3669         print(u'---------------------------------')
3670         for format in formats:
3671             print(u'%s\t\t%s'  % (format['ext'], format['format']))
3672
3673     def _specific(self, req_format, formats):
3674         for x in formats:
3675             if(x["format"]==req_format):
3676                 return x
3677         return None
3678
3679     def _real_extract(self, url):
3680         mobj = re.match(self._VALID_URL, url)
3681         if mobj is None:
3682             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3683             return
3684
3685         video_id = mobj.group('videoid')
3686
3687         req = compat_urllib_request.Request(url)
3688         req.add_header('Cookie', 'age_verified=1')
3689         webpage = self._download_webpage(req, video_id)
3690
3691         # Get the video title
3692         result = re.search(r'videoTitleArea">(?P<title>.*)</h1>', webpage)
3693         if result is None:
3694             raise ExtractorError(u'ERROR: unable to extract video title')
3695         video_title = result.group('title').strip()
3696
3697         # Get the video date
3698         result = re.search(r'Date:</b>(?P<date>.*)</li>', webpage)
3699         if result is None:
3700             self._downloader.to_stderr(u'WARNING: unable to extract video date')
3701             upload_date = None
3702         else:
3703             upload_date = result.group('date').strip()
3704
3705         # Get the video uploader
3706         result = re.search(r'Submitted:</b>(?P<uploader>.*)</li>', webpage)
3707         if result is None:
3708             self._downloader.to_stderr(u'ERROR: unable to extract uploader')
3709             video_uploader = None
3710         else:
3711             video_uploader = result.group('uploader').strip()
3712             video_uploader = clean_html( video_uploader )
3713
3714         # Get all of the formats available
3715         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3716         result = re.search(DOWNLOAD_LIST_RE, webpage)
3717         if result is None:
3718             raise ExtractorError(u'Unable to extract download list')
3719         download_list_html = result.group('download_list').strip()
3720
3721         # Get all of the links from the page
3722         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3723         links = re.findall(LINK_RE, download_list_html)
3724         if(len(links) == 0):
3725             raise ExtractorError(u'ERROR: no known formats available for video')
3726
3727         self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3728
3729         formats = []
3730         for link in links:
3731
3732             # A link looks like this:
3733             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3734             # A path looks like this:
3735             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3736             video_url = unescapeHTML( link )
3737             path = compat_urllib_parse_urlparse( video_url ).path
3738             extension = os.path.splitext( path )[1][1:]
3739             format = path.split('/')[4].split('_')[:2]
3740             size = format[0]
3741             bitrate = format[1]
3742             format = "-".join( format )
3743             title = u'%s-%s-%s' % (video_title, size, bitrate)
3744
3745             formats.append({
3746                 'id': video_id,
3747                 'url': video_url,
3748                 'uploader': video_uploader,
3749                 'upload_date': upload_date,
3750                 'title': title,
3751                 'ext': extension,
3752                 'format': format,
3753                 'thumbnail': None,
3754                 'description': None,
3755                 'player_url': None
3756             })
3757
3758         if self._downloader.params.get('listformats', None):
3759             self._print_formats(formats)
3760             return
3761
3762         req_format = self._downloader.params.get('format', None)
3763         self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
3764
3765         if req_format is None or req_format == 'best':
3766             return [formats[0]]
3767         elif req_format == 'worst':
3768             return [formats[-1]]
3769         elif req_format in ('-1', 'all'):
3770             return formats
3771         else:
3772             format = self._specific( req_format, formats )
3773             if result is None:
3774                 self._downloader.trouble(u'ERROR: requested format not available')
3775                 return
3776             return [format]
3777
3778
3779
3780 class PornotubeIE(InfoExtractor):
3781     """Information extractor for pornotube.com."""
3782     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3783
3784     def _real_extract(self, url):
3785         mobj = re.match(self._VALID_URL, url)
3786         if mobj is None:
3787             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3788             return
3789
3790         video_id = mobj.group('videoid')
3791         video_title = mobj.group('title')
3792
3793         # Get webpage content
3794         webpage = self._download_webpage(url, video_id)
3795
3796         # Get the video URL
3797         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3798         result = re.search(VIDEO_URL_RE, webpage)
3799         if result is None:
3800             self._downloader.trouble(u'ERROR: unable to extract video url')
3801             return
3802         video_url = compat_urllib_parse.unquote(result.group('url'))
3803
3804         #Get the uploaded date
3805         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3806         result = re.search(VIDEO_UPLOADED_RE, webpage)
3807         if result is None:
3808             self._downloader.trouble(u'ERROR: unable to extract video title')
3809             return
3810         upload_date = result.group('date')
3811
3812         info = {'id': video_id,
3813                 'url': video_url,
3814                 'uploader': None,
3815                 'upload_date': upload_date,
3816                 'title': video_title,
3817                 'ext': 'flv',
3818                 'format': 'flv'}
3819
3820         return [info]
3821
3822 class YouJizzIE(InfoExtractor):
3823     """Information extractor for youjizz.com."""
3824     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3825
3826     def _real_extract(self, url):
3827         mobj = re.match(self._VALID_URL, url)
3828         if mobj is None:
3829             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3830             return
3831
3832         video_id = mobj.group('videoid')
3833
3834         # Get webpage content
3835         webpage = self._download_webpage(url, video_id)
3836
3837         # Get the video title
3838         result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3839         if result is None:
3840             raise ExtractorError(u'ERROR: unable to extract video title')
3841         video_title = result.group('title').strip()
3842
3843         # Get the embed page
3844         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3845         if result is None:
3846             raise ExtractorError(u'ERROR: unable to extract embed page')
3847
3848         embed_page_url = result.group(0).strip()
3849         video_id = result.group('videoid')
3850
3851         webpage = self._download_webpage(embed_page_url, video_id)
3852
3853         # Get the video URL
3854         result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3855         if result is None:
3856             raise ExtractorError(u'ERROR: unable to extract video url')
3857         video_url = result.group('source')
3858
3859         info = {'id': video_id,
3860                 'url': video_url,
3861                 'title': video_title,
3862                 'ext': 'flv',
3863                 'format': 'flv',
3864                 'player_url': embed_page_url}
3865
3866         return [info]
3867
3868 class EightTracksIE(InfoExtractor):
3869     IE_NAME = '8tracks'
3870     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3871
3872     def _real_extract(self, url):
3873         mobj = re.match(self._VALID_URL, url)
3874         if mobj is None:
3875             raise ExtractorError(u'Invalid URL: %s' % url)
3876         playlist_id = mobj.group('id')
3877
3878         webpage = self._download_webpage(url, playlist_id)
3879
3880         m = re.search(r"new TRAX.Mix\((.*?)\);\n*\s*TRAX.initSearchAutocomplete\('#search'\);", webpage, flags=re.DOTALL)
3881         if not m:
3882             raise ExtractorError(u'Cannot find trax information')
3883         json_like = m.group(1)
3884         data = json.loads(json_like)
3885
3886         session = str(random.randint(0, 1000000000))
3887         mix_id = data['id']
3888         track_count = data['tracks_count']
3889         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3890         next_url = first_url
3891         res = []
3892         for i in itertools.count():
3893             api_json = self._download_webpage(next_url, playlist_id,
3894                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3895                 errnote=u'Failed to download song information')
3896             api_data = json.loads(api_json)
3897             track_data = api_data[u'set']['track']
3898             info = {
3899                 'id': track_data['id'],
3900                 'url': track_data['track_file_stream_url'],
3901                 'title': track_data['performer'] + u' - ' + track_data['name'],
3902                 'raw_title': track_data['name'],
3903                 'uploader_id': data['user']['login'],
3904                 'ext': 'm4a',
3905             }
3906             res.append(info)
3907             if api_data['set']['at_last_track']:
3908                 break
3909             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3910         return res
3911
3912 def gen_extractors():
3913     """ Return a list of an instance of every supported extractor.
3914     The order does matter; the first extractor matched is the one handling the URL.
3915     """
3916     return [
3917         YoutubePlaylistIE(),
3918         YoutubeChannelIE(),
3919         YoutubeUserIE(),
3920         YoutubeSearchIE(),
3921         YoutubeIE(),
3922         MetacafeIE(),
3923         DailymotionIE(),
3924         GoogleSearchIE(),
3925         PhotobucketIE(),
3926         YahooIE(),
3927         YahooSearchIE(),
3928         DepositFilesIE(),
3929         FacebookIE(),
3930         BlipTVUserIE(),
3931         BlipTVIE(),
3932         VimeoIE(),
3933         MyVideoIE(),
3934         ComedyCentralIE(),
3935         EscapistIE(),
3936         CollegeHumorIE(),
3937         XVideosIE(),
3938         SoundcloudIE(),
3939         InfoQIE(),
3940         MixcloudIE(),
3941         StanfordOpenClassroomIE(),
3942         MTVIE(),
3943         YoukuIE(),
3944         XNXXIE(),
3945         YouJizzIE(),
3946         PornotubeIE(),
3947         YouPornIE(),
3948         GooglePlusIE(),
3949         ArteTvIE(),
3950         NBAIE(),
3951         JustinTVIE(),
3952         FunnyOrDieIE(),
3953         TweetReelIE(),
3954         SteamIE(),
3955         UstreamIE(),
3956         RBMARadioIE(),
3957         EightTracksIE(),
3958         GenericIE()
3959     ]
3960
3961