Prepare urllib references for 2/3 compatibility
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import datetime
5 import HTMLParser
6 import httplib
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import email.utils
13 import xml.etree.ElementTree
14 import random
15 import math
16 from urlparse import parse_qs
17
18 try:
19         import cStringIO as StringIO
20 except ImportError:
21         import StringIO
22
23 from utils import *
24
25
class InfoExtractor(object):
    """Base class for all site-specific information extractors.

    An information extractor takes a URL and produces a list of
    dictionaries describing the video(s) behind it; the FileDownloader
    then processes that information, possibly downloading the video to
    the file system, among other possible outcomes.

    Mandatory dictionary fields:

    id:             Video identifier.
    url:            Final video URL.
    uploader:       Nickname of the video uploader, unescaped.
    upload_date:    Video upload date (YYYYMMDD).
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional fields:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses should re-define _real_initialize() and _real_extract()
    and define a _VALID_URL regexp; _real_extract() must return a *list*
    of information dictionaries as described above. Broken IEs should
    set _WORKING to False so users are warned and the tests skip them.
    Probably, new subclasses should also be added to the list of
    extractors.
    """

    # Class-level defaults; instances shadow _ready and _downloader.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Create the extractor, optionally wiring up a downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def set_downloader(self, downloader):
        """Attach the FileDownloader this IE reports to."""
        self._downloader = downloader

    def suitable(self, url):
        """Return True when this IE can handle the given URL."""
        return bool(re.match(self._VALID_URL, url))

    def working(self):
        """Return whether this IE is known to work (_WORKING)."""
        return self._WORKING

    def initialize(self):
        """Run the one-time initialization (authentication, etc.)."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then extract and return info dicts."""
        self.initialize()
        return self._real_extract(url)

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
109
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regex (compiled with re.VERBOSE in suitable/_real_extract):
    # group(1) matches everything preceding the ID, group(2) is the video ID.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; anything not listed falls back to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> frame size (presumably height x width, e.g. '22' -> 720p;
    # '???' where unknown) — used only for --list-formats display.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overrides the base class to apply re.VERBOSE, which _VALID_URL needs.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube timedtext XML (as a unicode string) to SRT text.

        Missing dur attributes default to 4 seconds; entities are
        unescaped twice because the feed is double-escaped.
        """
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'
            start = float(start)
            end = start + float(dur)
            # Render as SRT timestamps: HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _print_formats(self, formats):
        """Print each available itag with its extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the site language to English and, if credentials are
        available (params or .netrc), log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language (best effort: a failure only warns and aborts init)
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':         '/',
                'action_login': 'Log In',
                'username':     username,
                'password':     password,
                }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':             '/',
                'action_confirm':       'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Download the watch page and get_video_info, pick the format(s)
        to download, and return a list of info dictionaries."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Attempt to extract SWF player URL (optional, used for rtmpdump)
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' variants until one yields a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage = compat_urllib_request.urlopen(request).read()
                # NOTE(review): on Python 3 urlopen().read() is bytes but
                # urlparse.parse_qs expects str — revisit during 2/3 migration.
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scraped from the watch page and tried against
        # several human-readable date formats.
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    # NOTE(review): bare except deliberately skips formats
                    # that don't parse; once one succeeds the reformatted
                    # value simply fails the remaining attempts.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # closed captions: any Trouble raised below is reported as a
        # warning and leaves video_subtitles as None.
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            try:
                self.report_video_subtitles_download(video_id)
                request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                try:
                    srt_list = compat_urllib_request.urlopen(request).read()
                except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                # Map language code -> track name for the timedtext API call.
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    srt_lang = 'en'
                else:
                    # NOTE(review): dict.keys()[0] is Python-2-only; on
                    # Python 3 dict views are not subscriptable.
                    srt_lang = srt_lang_list.keys()[0]
                if not srt_lang in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                try:
                    srt_xml = compat_urllib_request.urlopen(request).read()
                except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                if not srt_xml:
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                # NOTE(review): indexing an exception (trouble[0]) relies on
                # Python 2 semantics; use trouble.args[0] under Python 3.
                self._downloader.trouble(trouble[0])

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            # RTMP stream: a single URL, no itag.
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Build itag -> signed URL map from the stream map.
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        # Build one info dict per selected (itag, url) pair.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{} - {}'.format(format_param.decode('utf-8') if format_param else video_extension.decode('utf-8'),
                                                self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':           video_id.decode('utf-8'),
                'url':          video_real_url.decode('utf-8'),
                'uploader':     video_uploader.decode('utf-8'),
                'upload_date':  upload_date,
                'title':        video_title,
                'ext':          video_extension.decode('utf-8'),
                'format':       video_format,
                'thumbnail':    video_thumbnail.decode('utf-8'),
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':             video_duration
            })
        return results
510
511 class MetacafeIE(InfoExtractor):
512         """Information Extractor for metacafe.com."""
513
514         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
515         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
516         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
517         IE_NAME = u'metacafe'
518
519         def __init__(self, downloader=None):
520                 InfoExtractor.__init__(self, downloader)
521
522         def report_disclaimer(self):
523                 """Report disclaimer retrieval."""
524                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
525
526         def report_age_confirmation(self):
527                 """Report attempt to confirm age."""
528                 self._downloader.to_screen(u'[metacafe] Confirming age')
529
530         def report_download_webpage(self, video_id):
531                 """Report webpage download."""
532                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
533
534         def report_extraction(self, video_id):
535                 """Report information extraction."""
536                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
537
538         def _real_initialize(self):
539                 # Retrieve disclaimer
540                 request = compat_urllib_request.Request(self._DISCLAIMER)
541                 try:
542                         self.report_disclaimer()
543                         disclaimer = compat_urllib_request.urlopen(request).read()
544                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
545                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
546                         return
547
548                 # Confirm age
549                 disclaimer_form = {
550                         'filters': '0',
551                         'submit': "Continue - I'm over 18",
552                         }
553                 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
554                 try:
555                         self.report_age_confirmation()
556                         disclaimer = compat_urllib_request.urlopen(request).read()
557                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
558                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
559                         return
560
	def _real_extract(self, url):
		"""Extract video information from a Metacafe watch URL.

		Returns a one-element list with the info dictionary, delegates
		'yt-' prefixed ids to the YouTube extractor via the downloader,
		or returns None after reporting an error.
		"""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Check if video comes from YouTube
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			# Hand the embedded YouTube id back to the downloader's normal
			# dispatch and stop processing here.
			self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
			return

		# Retrieve video webpage to extract further information
		request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = compat_urllib_request.urlopen(request).read()
		except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			# Page layout A: media URL appears as a plain query parameter.
			mediaURL = compat_urllib_parse.unquote(mobj.group(1))
			video_extension = mediaURL[-3:]

			# Extract gdaKey if available
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
			if mobj is None:
				video_url = mediaURL
			else:
				gdaKey = mobj.group(1)
				video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
		else:
			# Page layout B: media data is JSON embedded in the flashvars value.
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			# Undo the JSON escaping of forward slashes before using the URL.
			mediaURL = mobj.group(1).replace('\\/', '/')
			video_extension = mediaURL[-3:]
			video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')

		mobj = re.search(r'submitter=(.*?);', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		return [{
			'id':		video_id.decode('utf-8'),
			'url':		video_url.decode('utf-8'),
			'uploader':	video_uploader.decode('utf-8'),
			'upload_date':	None,
			'title':	video_title,
			'ext':		video_extension.decode('utf-8'),
		}]
636
637
class DailymotionIE(InfoExtractor):
	"""Information Extractor for Dailymotion"""

	_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
	IE_NAME = u'dailymotion'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		"""Extract video information from a Dailymotion URL.

		Returns a one-element list with the info dictionary, or None
		after reporting an error through the downloader.
		"""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Drop the "_title" suffix and any query string from the id part.
		video_id = mobj.group(1).split('_')[0].split('?')[0]

		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = compat_urllib_request.Request(url)
		# Disable family filtering so age-restricted pages are served.
		request.add_header('Cookie', 'family_filter=off')
		try:
			self.report_download_webpage(video_id)
			webpage = compat_urllib_request.urlopen(request).read()
		except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'\s*var flashvars = (.*)', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		flashvars = compat_urllib_parse.unquote(mobj.group(1))

		# Pick the best quality available, highest first.
		for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
			if key in flashvars:
				max_quality = key
				self._downloader.to_screen(u'[dailymotion] Using %s' % key)
				break
		else:
			self._downloader.trouble(u'ERROR: unable to extract video URL')
			return

		mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video URL')
			return

		video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

		# TODO: support choosing qualities

		mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

		video_uploader = None
		mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
		if mobj is None:
			# Fall back to looking for an official user.
			mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
			if mobj_official is None:
				self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
			else:
				video_uploader = mobj_official.group(1)
		else:
			video_uploader = mobj.group(1)

		video_upload_date = None
		mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
		if mobj is not None:
			video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

		# BUG FIX: the uploader lookup above only issues a WARNING when it
		# fails, so video_uploader may still be None here; the previous
		# unconditional .decode('utf-8') raised AttributeError in that case.
		if video_uploader is not None:
			video_uploader = video_uploader.decode('utf-8')

		return [{
			'id':		video_id.decode('utf-8'),
			'url':		video_url.decode('utf-8'),
			'uploader':	video_uploader,
			'upload_date':	video_upload_date,
			'title':	video_title,
			'ext':		video_extension.decode('utf-8'),
		}]
733
734
class GoogleIE(InfoExtractor):
	"""Information extractor for video.google.com."""

	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
	IE_NAME = u'video.google'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		"""Extract video information from a Google Video URL.

		Returns a one-element list with the info dictionary, or None
		after reporting an error through the downloader.
		"""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = compat_urllib_request.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = compat_urllib_request.urlopen(request).read()
		except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
			return

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r"download_url:'([^']+)'", webpage)
		if mobj is None:
			# Fall back to the hex-escaped flash URL embedded in the page.
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = compat_urllib_parse.unquote(mobj.group(1))
		# Undo the page's hex escaping of '=' and '&'.
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')

		# Extract video description
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video description')
			return
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail
		if self._downloader.params.get('forcethumbnail', False):
			request = compat_urllib_request.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
			try:
				webpage = compat_urllib_request.urlopen(request).read()
			except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
				return
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
				return
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''

		# BUG FIX: description and thumbnail were extracted above but never
		# returned; include them as the optional fields they are.
		return [{
			'id':		video_id.decode('utf-8'),
			'url':		video_url.decode('utf-8'),
			'uploader':	None,
			'upload_date':	None,
			'title':	video_title,
			'ext':		video_extension.decode('utf-8'),
			'thumbnail':	video_thumbnail.decode('utf-8'),
			'description':	video_description,
		}]
826
827
class PhotobucketIE(InfoExtractor):
	"""Information extractor for photobucket.com."""

	_VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
	IE_NAME = u'photobucket'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		message = u'[photobucket] %s: Downloading webpage' % video_id
		self._downloader.to_screen(message)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		message = u'[photobucket] %s: Extracting information' % video_id
		self._downloader.to_screen(message)

	def _real_extract(self, url):
		"""Extract the media URL, title and uploader from a Photobucket page."""
		# The video id is the .flv file name captured by _VALID_URL.
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return
		video_id = mobj.group(1)

		video_extension = 'flv'

		# Fetch the page that embeds the player.
		request = compat_urllib_request.Request(url)
		self.report_download_webpage(video_id)
		try:
			webpage = compat_urllib_request.urlopen(request).read()
		except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
			return

		# The media URL is carried by the "video_src" link tag.
		self.report_extraction(video_id)
		mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		video_url = compat_urllib_parse.unquote(mobj.group(1))

		# Both title and uploader live in the page <title>.
		mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_uploader = mobj.group(2).decode('utf-8')

		return [{
			'id':		video_id.decode('utf-8'),
			'url':		video_url.decode('utf-8'),
			'uploader':	video_uploader,
			'upload_date':	None,
			'title':	video_title,
			'ext':		video_extension.decode('utf-8'),
		}]
891
892
class YahooIE(InfoExtractor):
	"""Information extractor for video.yahoo.com."""

	# _VALID_URL matches all Yahoo! Video URLs
	# _VPAGE_URL matches only the extractable '/watch/' URLs
	_VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
	_VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
	IE_NAME = u'video.yahoo'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

	def _real_extract(self, url, new_video=True):
		"""Extract video information from a Yahoo! Video URL.

		Valid but non-extractable URLs are first rewritten to their
		English-language '/watch/' form and re-extracted once
		(new_video=False on the second pass).  Returns a one-element list
		with the info dictionary, or None after reporting an error.
		"""
		# Extract ID from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		video_id = mobj.group(2)
		video_extension = 'flv'

		# Rewrite valid but non-extractable URLs as
		# extractable English language /watch/ URLs
		if re.match(self._VPAGE_URL, url) is None:
			request = compat_urllib_request.Request(url)
			try:
				webpage = compat_urllib_request.urlopen(request).read()
			except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
				return

			mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Unable to extract id field')
				return
			yahoo_id = mobj.group(1)

			mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Unable to extract vid field')
				return
			yahoo_vid = mobj.group(1)

			url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
			return self._real_extract(url, new_video=False)

		# Retrieve video webpage to extract further information
		request = compat_urllib_request.Request(url)
		try:
			self.report_download_webpage(video_id)
			webpage = compat_urllib_request.urlopen(request).read()
		except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
			return

		# Extract uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = mobj.group(1).decode('utf-8')

		mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video uploader')
			return
		# BUG FIX: group(1) is the (people|profile) alternation, not the
		# uploader; the nickname is captured by the second group.
		video_uploader = mobj.group(2).decode('utf-8')

		# Extract video thumbnail
		mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
			return
		video_thumbnail = mobj.group(1).decode('utf-8')

		# Extract video description
		mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video description')
			return
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video height and width
		mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video height')
			return
		yv_video_height = mobj.group(1)

		mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video width')
			return
		yv_video_width = mobj.group(1)

		# Retrieve video playlist to extract media URL
		# I'm not completely sure what all these options are, but we
		# seem to need most of them, otherwise the server sends a 401.
		yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
		yv_bitrate = '700'  # according to Wikipedia this is hard-coded
		request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
				'&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
				'&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
		try:
			self.report_download_webpage(video_id)
			webpage = compat_urllib_request.urlopen(request).read()
		except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
			return

		# Extract media URL from playlist XML
		mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Unable to extract media URL')
			return
		video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
		video_url = unescapeHTML(video_url)

		# BUG FIX: video_thumbnail was already decoded above; decoding it a
		# second time made Python 2 implicitly ascii-encode the unicode
		# value first, raising UnicodeEncodeError for non-ASCII thumbnails.
		return [{
			'id':		video_id.decode('utf-8'),
			'url':		video_url,
			'uploader':	video_uploader,
			'upload_date':	None,
			'title':	video_title,
			'ext':		video_extension.decode('utf-8'),
			'thumbnail':	video_thumbnail,
			'description':	video_description,
		}]
1033
1034
class VimeoIE(InfoExtractor):
	"""Information extractor for vimeo.com."""

	# _VALID_URL matches Vimeo URLs
	_VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
	IE_NAME = u'vimeo'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

	def _real_extract(self, url, new_video=True):
		"""Extract video information from a Vimeo URL.

		Parses the embedded player config JSON and selects the best
		available codec/quality pair.  Returns a one-element list with the
		info dictionary, or None after reporting an error.
		"""
		# Extract ID from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Retrieve video webpage to extract further information
		request = compat_urllib_request.Request(url, None, std_headers)
		try:
			self.report_download_webpage(video_id)
			webpage = compat_urllib_request.urlopen(request).read()
		except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
			return

		# Now we begin extracting as much information as we can from what we
		# retrieved. First we extract the information common to all extractors,
		# and latter we extract those that are Vimeo specific.
		self.report_extraction(video_id)

		# Extract the config JSON.
		# BUG FIX: the marker lookup can raise IndexError (previously
		# uncaught) and json.loads raises ValueError; the old bare
		# 'except:' also swallowed KeyboardInterrupt/SystemExit.
		try:
			config = webpage.split(' = {config:')[1].split(',assets:')[0]
			config = json.loads(config)
		except (IndexError, ValueError):
			self._downloader.trouble(u'ERROR: unable to extract info section')
			return

		# Extract title
		video_title = config["video"]["title"]

		# Extract uploader
		video_uploader = config["video"]["owner"]["name"]

		# Extract video thumbnail
		video_thumbnail = config["video"]["thumbnail"]

		# Extract video description
		video_description = get_element_by_id("description", webpage.decode('utf8'))
		if video_description:
			video_description = clean_html(video_description)
		else:
			video_description = ''

		# Extract upload date
		video_upload_date = None
		mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
		if mobj is not None:
			video_upload_date = mobj.group(1)

		# Vimeo specific: extract request signature and timestamp
		sig = config['request']['signature']
		timestamp = config['request']['timestamp']

		# Vimeo specific: extract video codec and quality information
		# First consider quality, then codecs, then take everything
		# TODO bind to format param
		codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
		files = { 'hd': [], 'sd': [], 'other': []}
		for codec_name, codec_extension in codecs:
			if codec_name in config["video"]["files"]:
				if 'hd' in config["video"]["files"][codec_name]:
					files['hd'].append((codec_name, codec_extension, 'hd'))
				elif 'sd' in config["video"]["files"][codec_name]:
					files['sd'].append((codec_name, codec_extension, 'sd'))
				else:
					files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

		for quality in ('hd', 'sd', 'other'):
			if len(files[quality]) > 0:
				video_quality = files[quality][0][2]
				video_codec = files[quality][0][0]
				video_extension = files[quality][0][1]
				self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
				break
		else:
			self._downloader.trouble(u'ERROR: no known codec found')
			return

		video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
					%(video_id, sig, timestamp, video_quality, video_codec.upper())

		return [{
			'id':		video_id,
			'url':		video_url,
			'uploader':	video_uploader,
			'upload_date':	video_upload_date,
			'title':	video_title,
			'ext':		video_extension,
			'thumbnail':	video_thumbnail,
			'description':	video_description,
		}]
1146
1147
class ArteTvIE(InfoExtractor):
	"""arte.tv information extractor.

	Handles two URL flavours: live-stream index pages (file names matching
	_LIVE_URL) and regular catch-up ("plus7") videos.  All page scraping is
	funnelled through grep_webpage(), which applies one regex and maps
	numbered groups onto dictionary keys.
	"""

	_VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
	_LIVE_URL = r'index-[0-9]+\.html$'

	IE_NAME = u'arte.tv'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

	def fetch_webpage(self, url):
		"""Download url and return the raw page body, or None after
		reporting trouble on network or URL errors."""
		self._downloader.increment_downloads()
		request = compat_urllib_request.Request(url)
		try:
			self.report_download_webpage(url)
			webpage = compat_urllib_request.urlopen(request).read()
		except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
			return
		except ValueError as err:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return
		return webpage

	def grep_webpage(self, url, regex, regexFlags, matchTuples):
		"""Fetch url, apply regex with regexFlags, and return a dict built
		from matchTuples -- a list of (group_index, key, error_message)
		triples.  Returns None (after reporting the supplied error) when
		the regex or any listed group fails to match.
		"""
		page = self.fetch_webpage(url)
		# NOTE(review): if fetch_webpage() failed, page is None here and
		# re.search() raises TypeError -- confirm whether that is intended.
		mobj = re.search(regex, page, regexFlags)
		info = {}

		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		for (i, key, err) in matchTuples:
			if mobj.group(i) is None:
				self._downloader.trouble(err)
				return
			else:
				info[key] = mobj.group(i)

		return info

	def extractLiveStream(self, url):
		"""Scrape the live-stream player chain (videothek JS -> SWF player
		-> rtmp URL) for url.

		NOTE(review): video_url is computed but never returned or stored,
		so live streams currently yield no download -- looks unfinished;
		confirm before relying on this path.
		"""
		video_lang = url.split('/')[-4]
		info = self.grep_webpage(
			url,
			r'src="(.*?/videothek_js.*?\.js)',
			0,
			[
				(1, 'url', u'ERROR: Invalid URL: %s' % url)
			]
		)
		http_host = url.split('/')[2]
		next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
		info = self.grep_webpage(
			next_url,
			r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
				'(http://.*?\.swf).*?' +
				'(rtmp://.*?)\'',
			re.DOTALL,
			[
				(1, 'path',   u'ERROR: could not extract video path: %s' % url),
				(2, 'player', u'ERROR: could not extract video player: %s' % url),
				(3, 'url',    u'ERROR: could not extract video url: %s' % url)
			]
		)
		video_url = u'%s/%s' % (info.get('url'), info.get('path'))

	def extractPlus7Stream(self, url):
		"""Resolve a catch-up ("plus7") video by following the chain of
		reference documents and return its info dictionary."""
		video_lang = url.split('/')[-3]
		info = self.grep_webpage(
			url,
			r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
			0,
			[
				(1, 'url', u'ERROR: Invalid URL: %s' % url)
			]
		)
		next_url = compat_urllib_parse.unquote(info.get('url'))
		info = self.grep_webpage(
			next_url,
			r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
			0,
			[
				(1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
			]
		)
		next_url = compat_urllib_parse.unquote(info.get('url'))

		info = self.grep_webpage(
			next_url,
			r'<video id="(.*?)".*?>.*?' +
				'<name>(.*?)</name>.*?' +
				'<dateVideo>(.*?)</dateVideo>.*?' +
				'<url quality="hd">(.*?)</url>',
			re.DOTALL,
			[
				(1, 'id',    u'ERROR: could not extract video id: %s' % url),
				(2, 'title', u'ERROR: could not extract video title: %s' % url),
				(3, 'date',  u'ERROR: could not extract video date: %s' % url),
				(4, 'url',   u'ERROR: could not extract video url: %s' % url)
			]
		)

		return {
			'id':           info.get('id'),
			'url':          compat_urllib_parse.unquote(info.get('url')),
			'uploader':     u'arte.tv',
			'upload_date':  info.get('date'),
			'title':        info.get('title'),
			'ext':          u'mp4',
			'format':       u'NA',
			'player_url':   None,
		}

	def _real_extract(self, url):
		video_id = url.split('/')[-1]
		self.report_extraction(video_id)

		# Live pages (index-NN.html) take the live path; note that
		# extractLiveStream() returns nothing, so no info list is produced.
		if re.search(self._LIVE_URL, video_id) is not None:
			self.extractLiveStream(url)
			return
		else:
			info = self.extractPlus7Stream(url)

		return [info]
1283
1284
class GenericIE(InfoExtractor):
	"""Generic last-resort information extractor.

	First checks whether the URL is merely a redirect (URL shortener) and,
	if so, restarts the extraction chain on the target.  Otherwise it
	scrapes the page for a JW-Player/SWFObject style "file=" video URL.
	"""

	_VALID_URL = r'.*'
	IE_NAME = u'generic'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
		self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

	def report_following_redirect(self, new_url):
		"""Report information extraction."""
		self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

	def _test_redirect(self, url):
		"""Check if it is a redirect, like url shorteners, in case restart chain.

		Returns True after scheduling a download of the final URL when the
		request was redirected, False when the URL resolves to itself.
		"""
		class HeadRequest(compat_urllib_request.Request):
			# Issue HEAD instead of GET so only headers are transferred.
			def get_method(self):
				return "HEAD"

		class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
			"""
			Subclass the HTTPRedirectHandler to make it use our
			HeadRequest also on the redirected URL
			"""
			def redirect_request(self, req, fp, code, msg, headers, newurl):
				if code in (301, 302, 303, 307):
					newurl = newurl.replace(' ', '%20')
					# Drop body-describing headers: a HEAD request has no body.
					newheaders = dict((k,v) for k,v in req.headers.items()
									  if k.lower() not in ("content-length", "content-type"))
					return HeadRequest(newurl,
									   headers=newheaders,
									   origin_req_host=req.get_origin_req_host(),
									   unverifiable=True)
				else:
					raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

		class HTTPMethodFallback(compat_urllib_request.BaseHandler):
			"""
			Fallback to GET if HEAD is not allowed (405 HTTP error)
			"""
			def http_error_405(self, req, fp, code, msg, headers):
				# Drain and close the failed response before retrying as GET.
				fp.read()
				fp.close()

				newheaders = dict((k,v) for k,v in req.headers.items()
								  if k.lower() not in ("content-length", "content-type"))
				return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
												 headers=newheaders,
												 origin_req_host=req.get_origin_req_host(),
												 unverifiable=True))

		# Build our opener
		opener = compat_urllib_request.OpenerDirector()
		for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
						HTTPMethodFallback, HEADRedirectHandler,
						compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
			opener.add_handler(handler())

		response = opener.open(HeadRequest(url))
		new_url = response.geturl()

		# Same URL after following redirects means there was no redirect.
		if url == new_url:
			return False

		self.report_following_redirect(new_url)
		self._downloader.download([new_url])
		return True

	def _real_extract(self, url):
		if self._test_redirect(url): return

		video_id = url.split('/')[-1]
		request = compat_urllib_request.Request(url)
		try:
			self.report_download_webpage(video_id)
			webpage = compat_urllib_request.urlopen(request).read()
		except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
			return
		except ValueError as err:
			# since this is the last-resort InfoExtractor, if
			# this error is thrown, it'll be thrown here
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		self.report_extraction(video_id)
		# Start with something easy: JW Player in SWFObject
		mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
		if mobj is None:
			# Broaden the search a little bit
			mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		# It's possible that one of the regexes
		# matched, but returned an empty group:
		if mobj.group(1) is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		video_url = compat_urllib_parse.unquote(mobj.group(1))
		video_id = os.path.basename(video_url)

		# here's a fun little line of code for you:
		video_extension = os.path.splitext(video_id)[1][1:]
		video_id = os.path.splitext(video_id)[0]

		# it's tempting to parse this further, but you would
		# have to take into account all the variations like
		#   Video Title - Site Name
		#   Site Name | Video Title
		#   Video Title - Tagline | Site Name
		# and so on and so forth; it's just not practical
		mobj = re.search(r'<title>(.*)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		# NOTE(review): .decode('utf-8') implies the page is still bytes
		# here -- Python 2 behaviour; breaks once webpage is unicode.
		video_title = mobj.group(1).decode('utf-8')

		# video uploader is domain name
		mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_uploader = mobj.group(1).decode('utf-8')

		return [{
			'id':           video_id.decode('utf-8'),
			'url':          video_url.decode('utf-8'),
			'uploader':     video_uploader,
			'upload_date':  None,
			'title':        video_title,
			'ext':          video_extension.decode('utf-8'),
		}]
1429
1430
class YoutubeSearchIE(InfoExtractor):
	"""Information Extractor for YouTube search queries.

	Accepts pseudo-URLs of the form ytsearch[N|all]:terms and queues the
	first N matching videos (default 1) via the GData JSON-C API.
	"""
	_VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
	_API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
	_max_youtube_results = 1000
	IE_NAME = u'youtube:search'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_page(self, query, pagenum):
		"""Report attempt to download search page with given number."""
		query = query.decode(preferredencoding())
		self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

	def _real_extract(self, query):
		"""Parse the ytsearch prefix and dispatch to _download_n_results."""
		mobj = re.match(self._VALID_URL, query)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
			return

		# Split on the FIRST ':' only -- the search terms themselves may
		# legitimately contain colons (e.g. "ytsearch:foo:bar"), and a
		# plain split(':') would raise ValueError on unpacking.
		prefix, query = query.split(':', 1)
		prefix = prefix[8:]  # drop the literal 'ytsearch'
		query = query.encode('utf-8')
		if prefix == '':
			self._download_n_results(query, 1)
			return
		elif prefix == 'all':
			self._download_n_results(query, self._max_youtube_results)
			return
		else:
			try:
				n = int(prefix)
				if n <= 0:
					self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
					return
				elif n > self._max_youtube_results:
					self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
					n = self._max_youtube_results
				self._download_n_results(query, n)
				return
			except ValueError: # parsing prefix as integer fails
				self._download_n_results(query, 1)
				return

	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query"""

		video_ids = []
		pagenum = 0
		limit = n

		while (50 * pagenum) < limit:
			self.report_download_page(query, pagenum+1)
			result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
			request = compat_urllib_request.Request(result_url)
			try:
				data = compat_urllib_request.urlopen(request).read()
			except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
				self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
				return
			api_response = json.loads(data)['data']

			# An empty result set has no 'items' key at all; bail out
			# cleanly instead of raising KeyError below.
			if 'items' not in api_response:
				self._downloader.trouble(u'[youtube] No video results')
				return

			new_ids = list(video['id'] for video in api_response['items'])
			video_ids += new_ids

			limit = min(n, api_response['totalItems'])
			pagenum += 1

		if len(video_ids) > n:
			video_ids = video_ids[:n]
		for id in video_ids:
			self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
		return
1505
1506
class GoogleSearchIE(InfoExtractor):
	"""Information Extractor for Google Video search queries.

	Accepts pseudo-URLs of the form gvsearch[N|all]:terms and queues the
	first N matching videos (default 1) by scraping the result pages.
	"""
	_VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
	_TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
	_VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
	_MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
	_max_google_results = 1000
	IE_NAME = u'video.google:search'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_page(self, query, pagenum):
		"""Report attempt to download playlist page with given number."""
		query = query.decode(preferredencoding())
		self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

	def _real_extract(self, query):
		"""Parse the gvsearch prefix and dispatch to _download_n_results."""
		mobj = re.match(self._VALID_URL, query)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
			return

		# Split on the FIRST ':' only -- the search terms may contain
		# colons; a plain split(':') would raise ValueError on unpacking.
		prefix, query = query.split(':', 1)
		prefix = prefix[8:]  # drop the literal 'gvsearch'
		query = query.encode('utf-8')
		if prefix == '':
			self._download_n_results(query, 1)
			return
		elif prefix == 'all':
			self._download_n_results(query, self._max_google_results)
			return
		else:
			try:
				n = int(prefix)
				if n <= 0:
					self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
					return
				elif n > self._max_google_results:
					self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
					n = self._max_google_results
				self._download_n_results(query, n)
				return
			except ValueError: # parsing prefix as integer fails
				self._download_n_results(query, 1)
				return

	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query"""

		video_ids = []
		pagenum = 0

		while True:
			self.report_download_page(query, pagenum)
			result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
			request = compat_urllib_request.Request(result_url)
			try:
				page = compat_urllib_request.urlopen(request).read()
			except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
				return

			# Extract video identifiers
			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				video_id = mobj.group(1)
				if video_id not in video_ids:
					video_ids.append(video_id)
					if len(video_ids) == n:
						# Specified n videos reached
						for id in video_ids:
							self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
						return

			# No next-page link: queue whatever was found and stop.
			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				for id in video_ids:
					self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
				return

			pagenum = pagenum + 1
1587
1588
class YahooSearchIE(InfoExtractor):
	"""Information Extractor for Yahoo! Video search queries.

	Accepts pseudo-URLs of the form yvsearch[N|all]:terms and queues the
	first N matching videos (default 1) by scraping the result pages.
	"""
	_VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
	_TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
	_VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
	_MORE_PAGES_INDICATOR = r'\s*Next'
	_max_yahoo_results = 1000
	IE_NAME = u'video.yahoo:search'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_page(self, query, pagenum):
		"""Report attempt to download playlist page with given number."""
		query = query.decode(preferredencoding())
		self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

	def _real_extract(self, query):
		"""Parse the yvsearch prefix and dispatch to _download_n_results."""
		mobj = re.match(self._VALID_URL, query)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
			return

		# Split on the FIRST ':' only -- the search terms may contain
		# colons; a plain split(':') would raise ValueError on unpacking.
		prefix, query = query.split(':', 1)
		prefix = prefix[8:]  # drop the literal 'yvsearch'
		query = query.encode('utf-8')
		if prefix == '':
			self._download_n_results(query, 1)
			return
		elif prefix == 'all':
			self._download_n_results(query, self._max_yahoo_results)
			return
		else:
			try:
				n = int(prefix)
				if n <= 0:
					self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
					return
				elif n > self._max_yahoo_results:
					self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
					n = self._max_yahoo_results
				self._download_n_results(query, n)
				return
			except ValueError: # parsing prefix as integer fails
				self._download_n_results(query, 1)
				return

	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query"""

		video_ids = []
		already_seen = set()
		pagenum = 1

		while True:
			self.report_download_page(query, pagenum)
			result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
			request = compat_urllib_request.Request(result_url)
			try:
				page = compat_urllib_request.urlopen(request).read()
			except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
				return

			# Extract video identifiers
			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				video_id = mobj.group(1)
				if video_id not in already_seen:
					video_ids.append(video_id)
					already_seen.add(video_id)
					if len(video_ids) == n:
						# Specified n videos reached
						for id in video_ids:
							self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
						return

			# No next-page link: queue whatever was found and stop.
			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				for id in video_ids:
					self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
				return

			pagenum = pagenum + 1
1671
1672
class YoutubePlaylistIE(InfoExtractor):
	"""Information Extractor for YouTube playlists."""

	_VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
	_TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
	_VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
	_MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
	IE_NAME = u'youtube:playlist'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_page(self, playlist_id, pagenum):
		"""Report attempt to download playlist page with given number."""
		self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

	def _real_extract(self, url):
		# Work out what kind of playlist URL this is.
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
			return

		# A single video embedded in a playlist-style URL: hand it
		# straight back to the downloader.
		single_video = mobj.group(3)
		if single_video is not None:
			self._downloader.download([single_video])
			return

		# 'p' is the default prefix for playlists; artist pages are the
		# one variant that needs a different access point.
		list_prefix = mobj.group(1)
		if list_prefix == 'a':
			list_access = 'artist'
		else:
			list_prefix = 'p'
			list_access = 'view_play_list'
		list_id = mobj.group(2)

		# Walk the playlist pages, collecting video ids as we go.
		collected_ids = []
		page_index = 1
		while True:
			self.report_download_page(list_id, page_index)
			url = self._TEMPLATE_URL % (list_access, list_prefix, list_id, page_index)
			request = compat_urllib_request.Request(url)
			try:
				page = compat_urllib_request.urlopen(request).read()
			except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
				return

			# First occurrence of each id on this page wins.
			page_ids = []
			for match in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % list_id, page):
				candidate = match.group(1)
				if candidate not in page_ids:
					page_ids.append(candidate)
			collected_ids.extend(page_ids)

			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				break
			page_index += 1

		# Honour the --playlist-start / --playlist-end window.
		first_index = self._downloader.params.get('playliststart', 1) - 1
		last_index = self._downloader.params.get('playlistend', -1)
		if last_index == -1:
			collected_ids = collected_ids[first_index:]
		else:
			collected_ids = collected_ids[first_index:last_index]

		for vid in collected_ids:
			self._downloader.download(['http://www.youtube.com/watch?v=%s' % vid])
		return
1744
1745
class YoutubeChannelIE(InfoExtractor):
	"""Information Extractor for YouTube channels."""

	_VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
	_TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
	_MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
	IE_NAME = u'youtube:channel'

	def report_download_page(self, channel_id, pagenum):
		"""Report attempt to download channel page with given number."""
		self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

	def _real_extract(self, url):
		"""Collect all video ids of a channel and queue each for download."""
		# Make sure the URL really refers to a channel.
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
			return

		channel_id = mobj.group(1)

		# Walk the channel's paginated video list until the "Next"
		# button no longer appears in the markup.
		video_ids = []
		pagenum = 1
		while True:
			self.report_download_page(channel_id, pagenum)
			request = compat_urllib_request.Request(self._TEMPLATE_URL % (channel_id, pagenum))
			try:
				page = compat_urllib_request.urlopen(request).read()
			except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
				return

			# Gather this page's video ids, skipping repeats that occur
			# within the same page.
			page_ids = []
			for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
				candidate = match.group(1)
				if candidate not in page_ids:
					page_ids.append(candidate)
			video_ids.extend(page_ids)

			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				break
			pagenum += 1

		# Hand every collected id to the downloader.
		for video_id in video_ids:
			self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
		return
1794
1795
class YoutubeUserIE(InfoExtractor):
	"""Information Extractor for YouTube users."""

	_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
	_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
	_GDATA_PAGE_SIZE = 50
	_GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
	_VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
	IE_NAME = u'youtube:user'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_page(self, username, start_index):
		"""Report attempt to download user page."""
		self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
				(username, start_index, start_index + self._GDATA_PAGE_SIZE))

	def _real_extract(self, url):
		"""Collect all uploads of a user via the GData API and queue them."""
		# Pull the username out of the URL.
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
			return

		username = mobj.group(1)

		# The GData API caps each response at _GDATA_PAGE_SIZE entries,
		# so request successive windows until one comes back short -
		# that window is necessarily the last one.
		video_ids = []
		pagenum = 0
		while True:
			start_index = pagenum * self._GDATA_PAGE_SIZE + 1
			self.report_download_page(username, start_index)

			request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
			try:
				page = compat_urllib_request.urlopen(request).read()
			except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
				return

			# Gather this page's video ids, skipping repeats within the page.
			ids_in_page = []
			for match in re.finditer(self._VIDEO_INDICATOR, page):
				if match.group(1) not in ids_in_page:
					ids_in_page.append(match.group(1))
			video_ids.extend(ids_in_page)

			# A page that is not "full" must be the last one; no need to
			# query again.
			if len(ids_in_page) < self._GDATA_PAGE_SIZE:
				break
			pagenum += 1

		all_ids_count = len(video_ids)

		# Honour the playliststart/playlistend options.
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)
		if playlistend == -1:
			video_ids = video_ids[playliststart:]
		else:
			video_ids = video_ids[playliststart:playlistend]

		self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
				(username, all_ids_count, len(video_ids)))

		for video_id in video_ids:
			self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1877
1878
class BlipTVUserIE(InfoExtractor):
	"""Information Extractor for blip.tv users."""

	_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
	_PAGE_SIZE = 12
	IE_NAME = u'blip.tv:user'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_page(self, username, pagenum):
		"""Report attempt to download user page."""
		self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
				(self.IE_NAME, username, pagenum))

	def _real_extract(self, url):
		"""Collect every video posted by a blip.tv user and queue it for download."""
		# Extract username
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
			return

		username = mobj.group(1)

		page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

		# The AJAX endpoint is keyed on the numeric users_id, not the
		# username, so fetch the profile page first to look it up.
		request = compat_urllib_request.Request(url)

		try:
			page = compat_urllib_request.urlopen(request).read().decode('utf-8')
		except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
			return

		mobj = re.search(r'data-users-id="([^"]+)"', page)
		if mobj is None:
			# Previously an unmatched page raised an unhandled
			# AttributeError on mobj.group; report the failure instead.
			self._downloader.trouble(u'ERROR: unable to extract user id from webpage: %s' % url)
			return
		page_base = page_base % mobj.group(1)

		# Download video ids using BlipTV Ajax calls. Result size per
		# query is limited (currently to 12 videos) so we need to query
		# page by page until there are no video ids - it means we got
		# all of them.

		video_ids = []
		pagenum = 1

		while True:
			self.report_download_page(username, pagenum)

			request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

			try:
				page = compat_urllib_request.urlopen(request).read().decode('utf-8')
			except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
				# Use compat_str (not str) so the message renders the same
				# way on Python 2 and 3, matching the other extractors.
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
				return

			# Extract video identifiers, skipping repeats within the page.
			ids_in_page = []

			for mobj in re.finditer(r'href="/([^"]+)"', page):
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(unescapeHTML(mobj.group(1)))

			video_ids.extend(ids_in_page)

			# A little optimization - if current page is not
			# "full", ie. does not contain PAGE_SIZE video ids then
			# we can assume that this page is the last one - there
			# are no more ids on further pages - no need to query
			# again.

			if len(ids_in_page) < self._PAGE_SIZE:
				break

			pagenum += 1

		all_ids_count = len(video_ids)

		# Honour the playliststart/playlistend options.
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)

		if playlistend == -1:
			video_ids = video_ids[playliststart:]
		else:
			video_ids = video_ids[playliststart:playlistend]

		self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
				(self.IE_NAME, username, all_ids_count, len(video_ids)))

		for video_id in video_ids:
			self._downloader.download([u'http://blip.tv/'+video_id])
1969
1970
class DepositFilesIE(InfoExtractor):
	"""Information extractor for depositfiles.com"""

	_VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
	IE_NAME = u'DepositFiles'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, file_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

	def _real_extract(self, url):
		"""Resolve a depositfiles.com page to its direct download URL."""
		file_id = url.split('/')[-1]
		# Rebuild url in english locale
		url = 'http://depositfiles.com/en/files/' + file_id

		# Retrieve file webpage with 'Free download' button pressed
		free_download_indication = { 'gateway_result' : '1' }
		request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
		try:
			self.report_download_webpage(file_id)
			webpage = compat_urllib_request.urlopen(request).read()
		except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
			return

		# Search for the real file URL
		match = re.search(r'<form action="(http://fileshare.+?)"', webpage)
		if match is None or match.group(1) is None:
			# Try to figure out reason of the error.
			reason = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
			if reason is not None and reason.group(1) is not None:
				restriction_message = re.sub('\s+', ' ', reason.group(1)).strip()
				self._downloader.trouble(u'ERROR: %s' % restriction_message)
			else:
				self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
			return

		file_url = match.group(1)
		file_extension = os.path.splitext(file_url)[1][1:]

		# Search for file title
		match = re.search(r'<b title="(.*?)">', webpage)
		if match is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		file_title = match.group(1).decode('utf-8')

		return [{
			'id':		file_id.decode('utf-8'),
			'url':		file_url.decode('utf-8'),
			'uploader':	None,
			'upload_date':	None,
			'title':	file_title,
			'ext':		file_extension.decode('utf-8'),
		}]
2033
2034
class FacebookIE(InfoExtractor):
	"""Information Extractor for Facebook

	Logs in with the user's credentials (from options or .netrc) during
	initialization, then scrapes the video page's inline Javascript for
	metadata and per-format source URLs.
	"""

	_WORKING = False
	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
	_LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
	_NETRC_MACHINE = 'facebook'
	_available_formats = ['video', 'highqual', 'lowqual']
	_video_extensions = {
		'video': 'mp4',
		'highqual': 'mp4',
		'lowqual': 'mp4',
	}
	IE_NAME = u'facebook'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def _reporter(self, message):
		"""Add header and report message."""
		self._downloader.to_screen(u'[facebook] %s' % message)

	def report_login(self):
		"""Report attempt to log in."""
		self._reporter(u'Logging in')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._reporter(u'%s: Downloading video webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._reporter(u'%s: Extracting video information' % video_id)

	def _parse_page(self, video_webpage):
		"""Extract video information from page"""
		# General data
		data = {'title': r'\("video_title", "(.*?)"\)',
			'description': r'<div class="datawrap">(.*?)</div>',
			'owner': r'\("video_owner_name", "(.*?)"\)',
			'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
			}
		video_info = {}
		for piece in data.keys():
			mobj = re.search(data[piece], video_webpage)
			if mobj is not None:
				video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))

		# Video urls
		video_urls = {}
		for fmt in self._available_formats:
			mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
			if mobj is not None:
				# URL is in a Javascript segment inside an escaped Unicode format within
				# the generally utf-8 page
				video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
		video_info['video_urls'] = video_urls

		return video_info

	def _real_initialize(self):
		"""Log in to Facebook before any extraction, if credentials exist."""
		if self._downloader is None:
			return

		useremail = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			useremail = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					useremail = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError) as err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
				return

		# Without credentials extraction proceeds anonymously.
		if useremail is None:
			return

		# Log in
		login_form = {
			'email': useremail,
			'pass': password,
			'login': 'Log+In'
			}
		request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
		try:
			self.report_login()
			login_results = compat_urllib_request.urlopen(request).read()
			# A login form in the response means the POST did not sign us in.
			if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
				return
		except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
			return

	def _real_extract(self, url):
		"""Extract metadata and download URLs for a Facebook video page."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group('ID')

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
		try:
			page = compat_urllib_request.urlopen(request)
			video_webpage = page.read()
		except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# Extract information
		video_info = self._parse_page(video_webpage)

		# uploader
		if 'owner' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = video_info['owner']

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = video_info['title']
		video_title = video_title.decode('utf-8')

		# thumbnail image
		if 'thumbnail' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:
			video_thumbnail = video_info['thumbnail']

		# upload date
		upload_date = None
		if 'upload_date' in video_info:
			upload_time = video_info['upload_date']
			timetuple = email.utils.parsedate_tz(upload_time)
			if timetuple is not None:
				try:
					upload_date = time.strftime('%Y%m%d', timetuple[0:9])
				except Exception:
					# Malformed date tuple - leave upload_date as None.
					pass

		# description
		video_description = video_info.get('description', 'No description available.')

		url_map = video_info['video_urls']
		# BUGFIX: video_url_list used to be assigned only inside the
		# if-block below, so an empty url_map caused a NameError in the
		# results loop. Default to "no formats found".
		video_url_list = []
		if len(url_map.keys()) > 0:
			# Decide which formats to download
			req_format = self._downloader.params.get('format', None)
			format_limit = self._downloader.params.get('format_limit', None)

			if format_limit is not None and format_limit in self._available_formats:
				format_list = self._available_formats[self._available_formats.index(format_limit):]
			else:
				format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if req_format is None:
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == 'worst':
				video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
			elif req_format == '-1':
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			else:
				# Specific format
				if req_format not in url_map:
					self._downloader.trouble(u'ERROR: requested format not available')
					return
				video_url_list = [(req_format, url_map[req_format])] # Specific format

		results = []
		for format_param, video_real_url in video_url_list:
			# Extension
			video_extension = self._video_extensions.get(format_param, 'mp4')

			results.append({
				'id':		video_id.decode('utf-8'),
				'url':		video_real_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	upload_date,
				'title':	video_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
				'thumbnail':	video_thumbnail.decode('utf-8'),
				'description':	video_description.decode('utf-8'),
			})
		return results
2240
class BlipTVIE(InfoExtractor):
	"""Information extractor for blip.tv"""

	# Matches any blip.tv path; group(1) is used only for status reporting.
	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
	# Captures the filename extension at the end of a media URL.
	_URL_EXT = r'^.*\.([a-z0-9]+)$'
	IE_NAME = u'blip.tv'

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

	def report_direct_download(self, title):
		"""Report direct download."""
		self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

	def _real_extract(self, url):
		"""Resolve a blip.tv URL to video metadata.

		Requests the page with JSON-API parameters appended; if the server
		instead answers with the media itself (Content-Type video/*), falls
		back to a direct download reusing the already-open handle.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Append the JSON-API parameters with '?' or '&' as appropriate.
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = compat_urllib_request.Request(json_url.encode('utf-8'))
		self.report_extraction(mobj.group(1))
		info = None
		try:
			urlh = compat_urllib_request.urlopen(request)
			if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
				# The URL's last path component doubles as id and title.
				basename = url.split('/')[-1]
				title,ext = os.path.splitext(basename)
				title = title.decode('UTF-8')
				ext = ext.replace('.', '')
				self.report_direct_download(title)
				info = {
					'id': title,
					'url': url,
					'uploader': None,
					'upload_date': None,
					'title': title,
					'ext': ext,
					# Reuse the open response so the body is not fetched twice.
					'urlhandle': urlh
				}
		except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
			return
		if info is None: # Regular URL
			try:
				# urlh is still open from the successful request above.
				json_code = urlh.read()
			except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
				self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
				return

			try:
				json_data = json.loads(json_code)
				# Some responses wrap the payload in a 'Post' object.
				if 'Post' in json_data:
					data = json_data['Post']
				else:
					data = json_data

				# 'datestamp' arrives as '%m-%d-%y %H:%M%p'; normalize to YYYYMMDD.
				upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
				video_url = data['media']['url']
				umobj = re.match(self._URL_EXT, video_url)
				if umobj is None:
					raise ValueError('Can not determine filename extension')
				ext = umobj.group(1)

				info = {
					'id': data['item_id'],
					'url': video_url,
					'uploader': data['display_name'],
					'upload_date': upload_date,
					'title': data['title'],
					'ext': ext,
					'format': data['media']['mimeType'],
					'thumbnail': data['thumbnailUrl'],
					'description': data['description'],
					'player_url': data['embedUrl']
				}
			except (ValueError,KeyError) as err:
				self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
				return

		# NOTE(review): switches the global User-Agent before download -
		# presumably required by blip.tv's media servers; confirm.
		std_headers['User-Agent'] = 'iTunes/10.6.1'
		return [info]
2329
2330
class MyVideoIE(InfoExtractor):
	"""Information Extractor for myvideo.de."""

	_VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
	IE_NAME = u'myvideo'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		"""Extract the video info dictionary from a myvideo.de watch URL.

		Returns a one-element list with the info dict, or None (after
		reporting trouble) on any failure.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			# Bug fix: this used to call self._download.trouble, which does
			# not exist, raising AttributeError on every invalid URL.
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Get video webpage
		request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = compat_urllib_request.urlopen(request).read()
		except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
			return

		self.report_extraction(video_id)
		# The thumbnail link carries the media base URL; the FLV sits next to
		# the thumbs directory under the same path.
		mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
				 webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		video_url = mobj.group(1) + ('/%s.flv' % video_id)

		mobj = re.search('<title>([^<]+)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return

		video_title = mobj.group(1)

		return [{
			'id':		video_id,
			'url':		video_url,
			'uploader':	None,
			'upload_date':	None,
			'title':	video_title,
			'ext':		u'flv',
		}]
2388
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Matches either a shortname alias (":tds", ":colbertreport", ...) or a
	# full-episodes URL on thedailyshow.com / colbertnation.com.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	# Bitrate identifiers, highest first.
	_available_formats = ['3500', '2200', '1700', '1200', '750', '400']

	# Bitrate -> container extension (shown by --list-formats).
	_video_extensions = {
		'3500': 'mp4',
		'2200': 'mp4',
		'1700': 'mp4',
		'1200': 'mp4',
		'750': 'mp4',
		'400': 'mp4',
	}
	# Bitrate -> frame dimensions (shown by --list-formats).
	_video_dimensions = {
		'3500': '1280x720',
		'2200': '960x540',
		'1700': '768x432',
		'1200': '640x360',
		'750': '512x288',
		'400': '384x216',
	}

	def report_extraction(self, episode_id):
		"""Report that information extraction has started."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report download of the per-media player configuration."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report download of the episode's MRSS index feed."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report resolution of the SWF player URL."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)


	def _print_formats(self, formats):
		"""Print the available bitrate variants for --list-formats."""
		print('Available formats:')
		for x in formats:
			print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


	def _real_extract(self, url):
		"""Extract one info dict per video part of an episode.

		Returns a list of info dictionaries, or None (after reporting
		trouble) on failure.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# A ":shortname" alias is rewritten to the show's full-episodes
		# index page, then re-matched against _VALID_URL.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = u'http://www.thedailyshow.com/full-episodes/'
			else:
				url = u'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# No explicit episode means "download the newest"; the server's
		# redirect (followed below) identifies which episode that is.
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = compat_urllib_request.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = compat_urllib_request.urlopen(req)
			html = htmlHandle.read()
		except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
			return
		if dlNewest:
			# geturl() reflects the redirect target, which should now name
			# a concrete episode.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# Locate the embedded mtvnservices player URL and its media URI.
		mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)

		if len(mMovieParams) == 0:
			# The Colbert Report embeds the information in a without
			# a URL prefix; so extract the alternate reference
			# and then add the URL prefix manually.

			altMovieParams = re.findall('data-mgid="([^"]*episode.*?:.*?)"', html)
			if len(altMovieParams) == 0:
				self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
				return
			else:
				mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			# Follow redirects so playerUrl is the final SWF location.
			urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
			return

		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = compat_urllib_request.urlopen(indexUrl).read()
		except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
			return

		results = []

		# One <item> per video part (episodes are split into acts).
		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						compat_urllib_parse.urlencode({'uri': mediaId}))
			configReq = compat_urllib_request.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = compat_urllib_request.urlopen(configReq).read()
			except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
				return

			# Collect (bitrate, url) pairs from the <rendition> elements.
			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			if self._downloader.params.get('listformats', None):
				self._print_formats([i[0] for i in turls])
				return

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			# Get the format arg from the arg stream
			req_format = self._downloader.params.get('format', None)

			# Select format if we can find one
			for f,v in turls:
				if f == req_format:
					format, video_url = f, v
					break

			# Patch to download from alternative CDN, which does not
			# break on current RTMPDump builds
			broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
			better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

			if video_url.startswith(broken_cdn):
				video_url = video_url.replace(broken_cdn, better_cdn)

			effTitle = showId + u'-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': None #playerUrl
			}

			results.append(info)

		return results
2576
2577
class EscapistIE(InfoExtractor):
	"""Information extractor for The Escapist """

	_VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
	IE_NAME = u'escapist'

	def report_extraction(self, showName):
		"""Report that information extraction has started."""
		self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

	def report_config_download(self, showName):
		"""Report download of the player configuration."""
		self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

	def _real_extract(self, url):
		"""Extract the video info dictionary from an escapistmagazine.com URL.

		Returns a one-element list with the info dict, or None (after
		reporting trouble) on any failure.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		showName = mobj.group('showname')
		videoId = mobj.group('episode')

		self.report_extraction(showName)
		try:
			webPage = compat_urllib_request.urlopen(url)
			webPageBytes = webPage.read()
			# Decode with the charset from the Content-Type header, if any.
			m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
			webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
		except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
			return

		# Robustness fix: each of these searches previously dereferenced the
		# match object unconditionally, crashing with AttributeError when a
		# pattern was absent instead of reporting trouble like the rest of
		# the file does.
		descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
		if descMatch is None:
			self._downloader.trouble(u'ERROR: unable to extract description')
			return
		description = unescapeHTML(descMatch.group(1))

		imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
		if imgMatch is None:
			self._downloader.trouble(u'ERROR: unable to extract thumbnail')
			return
		imgUrl = unescapeHTML(imgMatch.group(1))

		playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
		if playerUrlMatch is None:
			self._downloader.trouble(u'ERROR: unable to extract player URL')
			return
		playerUrl = unescapeHTML(playerUrlMatch.group(1))

		configUrlMatch = re.search('config=(.*)$', playerUrl)
		if configUrlMatch is None:
			self._downloader.trouble(u'ERROR: unable to extract configuration URL')
			return
		configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

		self.report_config_download(showName)
		try:
			configJSON = compat_urllib_request.urlopen(configUrl).read()
		except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
			return

		# Technically, it's JavaScript, not JSON
		configJSON = configJSON.replace("'", '"')

		try:
			config = json.loads(configJSON)
		except (ValueError,) as err:
			self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
			return

		playlist = config['playlist']
		# The second playlist entry holds the actual media URL.
		videoUrl = playlist[1]['url']

		info = {
			'id': videoId,
			'url': videoUrl,
			'uploader': showName,
			'upload_date': None,
			'title': showName,
			'ext': 'flv',
			'thumbnail': imgUrl,
			'description': description,
			'player_url': playerUrl,
		}

		return [info]
2649
2650
class CollegeHumorIE(InfoExtractor):
	"""Information extractor for collegehumor.com"""

	_VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
	IE_NAME = u'collegehumor'

	def report_webpage(self, video_id):
		"""Announce that the video page is being downloaded."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

	def report_extraction(self, video_id):
		"""Announce that metadata extraction has started."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Resolve a collegehumor.com watch URL into an info dictionary list."""
		url_match = re.match(self._VALID_URL, url)
		if url_match is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = url_match.group('videoid')

		self.report_webpage(video_id)
		page_request = compat_urllib_request.Request(url)
		try:
			webpage = compat_urllib_request.urlopen(page_request).read()
		except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
			return

		# The page embeds a second, internal id that the metadata feed keys on.
		id_match = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
		if id_match is None:
			self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
			return
		internal_video_id = id_match.group('internalvideoid')

		info = {
			'id': video_id,
			'internal_id': internal_video_id,
			'uploader': None,
			'upload_date': None,
		}

		self.report_extraction(video_id)
		xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
		try:
			metaXml = compat_urllib_request.urlopen(xmlUrl).read()
		except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
			return

		# Pull the remaining fields out of the metadata document; a missing
		# element surfaces as IndexError and aborts the extraction.
		mdoc = xml.etree.ElementTree.fromstring(metaXml)
		try:
			videoNode = mdoc.findall('./video')[0]
			for info_key, tag_path in (('description', './description'),
						   ('title', './caption'),
						   ('url', './file'),
						   ('thumbnail', './thumbnail')):
				info[info_key] = videoNode.findall(tag_path)[0].text
			info['ext'] = info['url'].rpartition('.')[2]
		except IndexError:
			self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
			return

		return [info]
2714
2715
class XVideosIE(InfoExtractor):
	"""Information extractor for xvideos.com"""

	_VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
	IE_NAME = u'xvideos'

	def report_webpage(self, video_id):
		"""Announce that the video page is being downloaded."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

	def report_extraction(self, video_id):
		"""Announce that metadata extraction has started."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Turn an xvideos.com video URL into an info dictionary list."""
		match = re.match(self._VALID_URL, url)
		if match is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = match.group(1).decode('utf-8')

		self.report_webpage(video_id)

		page_request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
		try:
			webpage = compat_urllib_request.urlopen(page_request).read()
		except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
			return

		self.report_extraction(video_id)

		# The media URL is percent-encoded inside the flashvars string.
		match = re.search(r'flv_url=(.+?)&', webpage)
		if match is None:
			self._downloader.trouble(u'ERROR: unable to extract video url')
			return
		video_url = compat_urllib_parse.unquote(match.group(1).decode('utf-8'))

		# Title comes from the page <title>, minus the site suffix.
		match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
		if match is None:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = match.group(1).decode('utf-8')

		# Thumbnail: take the whole matched image URL.
		match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
		if match is None:
			self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
			return
		video_thumbnail = match.group(0).decode('utf-8')

		return [{
			'id': video_id,
			'url': video_url,
			'uploader': None,
			'upload_date': None,
			'title': video_title,
			'ext': 'flv',
			'thumbnail': video_thumbnail,
			'description': None,
		}]
2784
2785
class SoundcloudIE(InfoExtractor):
	"""Information extractor for soundcloud.com
	   To access the media, the uid of the song and a stream token
	   must be extracted from the page source and the script must make
	   a request to media.soundcloud.com/crossdomain.xml. Then
	   the media can be grabbed by requesting from an url composed
	   of the stream token and uid
	 """

	_VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
	IE_NAME = u'soundcloud'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_webpage(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Extract the track info dictionary from a soundcloud.com URL.

		Returns a one-element list with the info dict, or None (after
		reporting trouble) on failure.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# extract uploader (which is in the url)
		uploader = mobj.group(1).decode('utf-8')
		# extract simple title (uploader + slug of song title)
		slug_title = mobj.group(2).decode('utf-8')
		simple_title = uploader + u'-' + slug_title

		self.report_webpage('%s/%s' % (uploader, slug_title))

		request = compat_urllib_request.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
		try:
			webpage = compat_urllib_request.urlopen(request).read()
		except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
			return

		self.report_extraction('%s/%s' % (uploader, slug_title))

		# extract uid and stream token that soundcloud hands out for access
		mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
		if mobj is None:
			# Robustness fix: video_id/stream_token used to stay unbound when
			# this search failed, producing a NameError further down instead
			# of a clean error message.
			self._downloader.trouble(u'ERROR: unable to extract uid and stream token')
			return
		video_id = mobj.group(1)
		stream_token = mobj.group(2)

		# extract unsimplified title
		mobj = re.search('"title":"(.*?)",', webpage)
		if mobj:
			title = mobj.group(1).decode('utf-8')
		else:
			title = simple_title

		# construct media url (with uid/token)
		mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
		mediaURL = mediaURL % (video_id, stream_token)

		# description
		description = u'No description available'
		mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
		if mobj:
			description = mobj.group(1)

		# upload date
		upload_date = None
		mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
		if mobj:
			try:
				upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
			except Exception as e:
				# was `except Exception, e:` -- Python-2-only syntax,
				# inconsistent with the `as err` form used everywhere else
				self._downloader.to_stderr(compat_str(e))

		# for soundcloud, a request to a cross domain is required for cookies
		request = compat_urllib_request.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)

		return [{
			'id':		video_id.decode('utf-8'),
			'url':		mediaURL,
			'uploader':	uploader.decode('utf-8'),
			'upload_date':	upload_date,
			'title':	title,
			'ext':		u'mp3',
			'description': description.decode('utf-8')
		}]
2876
2877
class InfoQIE(InfoExtractor):
	"""Information extractor for infoq.com"""

	_VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
	IE_NAME = u'infoq'

	def report_webpage(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Extract the video info dictionary from an infoq.com URL.

		Returns a one-element list with the info dict, or None (after
		reporting trouble) on failure.
		"""
		# Local import so this 2/3-compat fix stays self-contained.
		import base64

		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		self.report_webpage(url)

		request = compat_urllib_request.Request(url)
		try:
			webpage = compat_urllib_request.urlopen(request).read()
		except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
			return

		self.report_extraction(url)


		# Extract video URL; the page stores it base64-encoded.
		mobj = re.search(r"jsclassref='([^']*)'", webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video url')
			return
		# 2/3-compat fix: str.decode('base64') only exists on Python 2;
		# base64.b64decode yields the same bytes on both versions.
		video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(base64.b64decode(mobj.group(1)))


		# Extract title
		mobj = re.search(r'contentTitle = "(.*?)";', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = mobj.group(1).decode('utf-8')

		# Extract description
		video_description = u'No description available.'
		mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
		if mobj is not None:
			video_description = mobj.group(1).decode('utf-8')

		video_filename = video_url.split('/')[-1]
		video_id, extension = video_filename.split('.')

		info = {
			'id': video_id,
			'url': video_url,
			'uploader': None,
			'upload_date': None,
			'title': video_title,
			'ext': extension, # Extension is always(?) mp4, but seems to be flv
			'thumbnail': None,
			'description': video_description,
		}

		return [info]
2946
class MixcloudIE(InfoExtractor):
	"""Information extractor for www.mixcloud.com"""
	_VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
	IE_NAME = u'mixcloud'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_json(self, file_id):
		"""Report JSON download."""
		self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

	def get_urls(self, jsonData, fmt, bitrate='best'):
		"""Get urls from 'audio_formats' section in json.

		When the format entry maps bitrates to url lists, pick the requested
		bitrate, falling back to the highest one available; when there is no
		bitrate level (entry is a plain list), return it as-is.
		"""
		try:
			bitrate_list = jsonData[fmt]
			if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
				bitrate = max(bitrate_list) # select highest

			url_list = jsonData[fmt][bitrate]
		except TypeError: # we have no bitrate info.
			url_list = jsonData[fmt]
		return url_list

	def check_urls(self, url_list):
		"""Returns 1st active url from list"""
		for url in url_list:
			try:
				# close the probe connection instead of leaking it
				compat_urllib_request.urlopen(url).close()
				return url
			except (compat_urllib_error.URLError, httplib.HTTPException, socket.error):
				pass # dead link, try the next one

		return None

	def _print_formats(self, formats):
		"""Print every available format (and bitrate, if any) to stdout."""
		print('Available formats:')
		for fmt in formats.keys():
			for b in formats[fmt]:
				try:
					ext = formats[fmt][b][0]
					print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
				except TypeError: # we have no bitrate info
					ext = formats[fmt][0]
					print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
					break

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		# extract uploader & filename from url
		uploader = mobj.group(1)
		file_id = uploader + "-" + mobj.group(2)

		# construct API request
		file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
		# retrieve .json file with links to files
		request = compat_urllib_request.Request(file_url)
		try:
			self.report_download_json(file_url)
			jsonData = compat_urllib_request.urlopen(request).read()
		except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
			return

		# parse JSON; decode explicitly since read() returns bytes on Python 3
		json_data = json.loads(jsonData.decode('utf-8'))
		player_url = json_data['player_swf_url']
		formats = dict(json_data['audio_formats'])

		req_format = self._downloader.params.get('format', None)

		if self._downloader.params.get('listformats', None):
			self._print_formats(formats)
			return

		if req_format is None or req_format == 'best':
			for format_param in formats.keys():
				url_list = self.get_urls(formats, format_param)
				# check urls
				file_url = self.check_urls(url_list)
				if file_url is not None:
					break # got it!
		else:
			if req_format not in formats.keys():
				self._downloader.trouble(u'ERROR: format is not available')
				return

			url_list = self.get_urls(formats, req_format)
			file_url = self.check_urls(url_list)
			format_param = req_format

		return [{
			'id': file_id,
			'url': file_url,
			'uploader': uploader,
			'upload_date': None,
			'title': json_data['name'],
			'ext': file_url.split('.')[-1],
			'format': (format_param is None and u'NA' or format_param),
			'thumbnail': json_data['thumbnail_url'],
			'description': json_data['description'],
			'player_url': player_url,
		}]
3059
class StanfordOpenClassroomIE(InfoExtractor):
	"""Information extractor for Stanford's Open ClassRoom"""

	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
	IE_NAME = u'stanfordoc'

	def report_download_webpage(self, objid):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Dispatch on the URL shape: a single video, a course playlist,
		or the root page listing all courses."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('course') and mobj.group('video'): # A specific video
			course = mobj.group('course')
			video = mobj.group('video')
			info = {
				'id': course + '_' + video,
				'uploader': None,
				'upload_date': None,
			}

			self.report_extraction(info['id'])
			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
			xmlUrl = baseUrl + video + '.xml'
			try:
				# bytes are fine here: ElementTree.fromstring accepts them
				metaXml = compat_urllib_request.urlopen(xmlUrl).read()
			except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
				return
			mdoc = xml.etree.ElementTree.fromstring(metaXml)
			try:
				info['title'] = mdoc.findall('./title')[0].text
				info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
			except IndexError:
				self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
				return
			info['ext'] = info['url'].rpartition('.')[2]
			return [info]
		elif mobj.group('course'): # A course page
			course = mobj.group('course')
			info = {
				'id': course,
				'type': 'playlist',
				'uploader': None,
				'upload_date': None,
			}

			self.report_download_webpage(info['id'])
			try:
				# decode: read() returns bytes on Python 3 and the page is
				# searched with text regular expressions below
				coursepage = compat_urllib_request.urlopen(url).read().decode('utf-8')
			except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
				return

			m = re.search('<h1>([^<]+)</h1>', coursepage)
			if m:
				info['title'] = unescapeHTML(m.group(1))
			else:
				info['title'] = info['id']

			m = re.search('<description>([^<]+)</description>', coursepage)
			if m:
				info['description'] = unescapeHTML(m.group(1))

			links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
				}
					for vpage in links]
			# recursively extract every video page of the course
			results = []
			for entry in info['list']:
				assert entry['type'] == 'reference'
				results += self.extract(entry['url'])
			return results
		else: # Root page
			info = {
				'id': 'Stanford OpenClassroom',
				'type': 'playlist',
				'uploader': None,
				'upload_date': None,
			}

			self.report_download_webpage(info['id'])
			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
			try:
				rootpage = compat_urllib_request.urlopen(rootURL).read().decode('utf-8')
			except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
				return

			info['title'] = info['id']

			links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
				}
					for cpage in links]

			# recursively extract every course page found on the root page
			results = []
			for entry in info['list']:
				assert entry['type'] == 'reference'
				results += self.extract(entry['url'])
			return results
3176
class MTVIE(InfoExtractor):
	"""Information extractor for MTV.com"""

	_VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
	IE_NAME = u'mtv'

	def report_webpage(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		if not mobj.group('proto'):
			url = 'http://' + url
		video_id = mobj.group('videoid')
		self.report_webpage(video_id)

		request = compat_urllib_request.Request(url)
		try:
			# decode here once: read() returns bytes on Python 3, and the
			# page is searched with text regular expressions below
			webpage = compat_urllib_request.urlopen(request).read().decode('iso-8859-1')
		except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
			return

		mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract song name')
			return
		song_name = unescapeHTML(mobj.group(1))
		mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract performer')
			return
		performer = unescapeHTML(mobj.group(1))
		video_title = performer + ' - ' + song_name

		mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
			return
		mtvn_uri = mobj.group(1)

		mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract content id')
			return
		content_id = mobj.group(1)

		videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
		self.report_extraction(video_id)
		request = compat_urllib_request.Request(videogen_url)
		try:
			# bytes are fine here: ElementTree.fromstring accepts them
			metadataXml = compat_urllib_request.urlopen(request).read()
		except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
			return

		mdoc = xml.etree.ElementTree.fromstring(metadataXml)
		renditions = mdoc.findall('.//rendition')

		# For now, always pick the highest quality.
		rendition = renditions[-1]

		try:
			_,_,ext = rendition.attrib['type'].partition('/')
			format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
			video_url = rendition.find('./src').text
		except KeyError:
			self._downloader.trouble('Invalid rendition field.')
			return

		info = {
			'id': video_id,
			'url': video_url,
			'uploader': performer,
			'upload_date': None,
			'title': video_title,
			'ext': ext,
			'format': format,
		}

		return [info]
3266
3267
class YoukuIE(InfoExtractor):
	"""Information extractor for v.youku.com"""

	_VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
	IE_NAME = u'Youku'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, file_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

	def _gen_sid(self):
		"""Build a session id: millisecond timestamp plus two random parts."""
		nowTime = int(time.time() * 1000)
		random1 = random.randint(1000,1998)
		random2 = random.randint(1000,9999)

		return "%d%d%d" %(nowTime,random1,random2)

	def _get_file_ID_mix_string(self, seed):
		"""Shuffle Youku's alphabet with its seeded linear-congruential PRNG."""
		mixed = []
		source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
		seed = float(seed)
		for i in range(len(source)):
			seed  =  (seed * 211 + 30031 ) % 65536
			index  =  math.floor(seed / 65536 * len(source) )
			mixed.append(source[int(index)])
			source.remove(source[int(index)])
		return mixed

	def _get_file_id(self, fileId, seed):
		"""Decode the obfuscated file id: each '*'-separated token is an
		index into the seed-shuffled alphabet."""
		mixed = self._get_file_ID_mix_string(seed)
		ids = fileId.split('*')
		realId = []
		for ch in ids:
			if ch:
				realId.append(mixed[int(ch)])
		return ''.join(realId)

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group('ID')

		info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

		request = compat_urllib_request.Request(info_url, None, std_headers)
		try:
			self.report_download_webpage(video_id)
			jsondata = compat_urllib_request.urlopen(request).read()
		except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
			return

		self.report_extraction(video_id)
		try:
			# decode explicitly: read() returns bytes on Python 3
			config = json.loads(jsondata.decode('utf-8'))

			video_title =  config['data'][0]['title']
			seed = config['data'][0]['seed']

			format = self._downloader.params.get('format', None)
			supported_format = config['data'][0]['streamfileids'].keys()

			if format is None or format == 'best':
				if 'hd2' in supported_format:
					format = 'hd2'
				else:
					format = 'flv'
				ext = u'flv'
			elif format == 'worst':
				format = 'mp4'
				ext = u'mp4'
			else:
				format = 'flv'
				ext = u'flv'


			fileid = config['data'][0]['streamfileids'][format]
			keys = [s['k'] for s in config['data'][0]['segs'][format]]

			#TODO check error
			#youku only could be viewed from mainland china
		except Exception: # malformed/denied API answer; never trap KeyboardInterrupt
			self._downloader.trouble(u'ERROR: unable to extract info section')
			return

		files_info=[]
		sid = self._gen_sid()
		fileid = self._get_file_id(fileid, seed)

		#column 8,9 of fileid represent the segment number
		#fileid[7:9] should be changed
		for index, key in enumerate(keys):

			temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
			download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

			info = {
				'id': '%s_part%02d' % (video_id, index),
				'url': download_url,
				'uploader': None,
				'upload_date': None,
				'title': video_title,
				'ext': ext,
			}
			files_info.append(info)

		return files_info
3388
3389
class XNXXIE(InfoExtractor):
	"""Information extractor for xnxx.com"""

	_VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
	IE_NAME = u'xnxx'
	VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
	VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
	VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

	def report_webpage(self, video_id):
		"""Report information extraction"""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

	def report_extraction(self, video_id):
		"""Report information extraction"""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(1)

		self.report_webpage(video_id)

		# Get webpage content; decode since read() returns bytes on Python 3
		try:
			webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
		except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
			return

		result = re.search(self.VIDEO_URL_RE, webpage)
		if result is None:
			self._downloader.trouble(u'ERROR: unable to extract video url')
			return
		video_url = compat_urllib_parse.unquote(result.group(1))

		result = re.search(self.VIDEO_TITLE_RE, webpage)
		if result is None:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = result.group(1)

		result = re.search(self.VIDEO_THUMB_RE, webpage)
		if result is None:
			self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
			return
		video_thumbnail = result.group(1)

		return [{
			'id': video_id,
			'url': video_url,
			'uploader': None,
			'upload_date': None,
			'title': video_title,
			'ext': 'flv',
			'thumbnail': video_thumbnail,
			'description': None,
		}]
3451
3452
class GooglePlusIE(InfoExtractor):
	"""Information extractor for plus.google.com."""

	_VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
	IE_NAME = u'plus.google'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_extract_entry(self, url):
		"""Report downloading entry"""
		self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

	def report_date(self, upload_date):
		"""Report entry date"""
		self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

	def report_uploader(self, uploader):
		"""Report entry uploader"""
		self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

	def report_title(self, video_title):
		"""Report entry title"""
		self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

	def report_extract_vid_page(self, video_page):
		"""Report information extraction."""
		self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

	def _real_extract(self, url):
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		post_url = mobj.group(0)
		video_id = mobj.group(2)

		video_extension = 'flv'

		# Step 1, Retrieve post webpage to extract further information
		self.report_extract_entry(post_url)
		request = compat_urllib_request.Request(post_url)
		try:
			# decode: read() returns bytes on Python 3 and the page is
			# searched with text regular expressions below
			webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
		except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
			return

		# Extract update date; stays None when the timestamp is absent
		upload_date = None
		pattern = 'title="Timestamp">(.*?)</a>'
		mobj = re.search(pattern, webpage)
		if mobj:
			upload_date = mobj.group(1)
			# Convert timestring to a format suitable for filename
			upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
			upload_date = upload_date.strftime('%Y%m%d')
		self.report_date(upload_date)

		# Extract uploader; stays None when no author link is found
		uploader = None
		pattern = r'rel\="author".*?>(.*?)</a>'
		mobj = re.search(pattern, webpage)
		if mobj:
			uploader = mobj.group(1)
		self.report_uploader(uploader)

		# Extract title
		# Get the first line for title
		video_title = u'NA'
		pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
		mobj = re.search(pattern, webpage)
		if mobj:
			video_title = mobj.group(1)
		self.report_title(video_title)

		# Step 2, Stimulate clicking the image box to launch video
		pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
		mobj = re.search(pattern, webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video page URL')
			# was falling through and crashing on mobj.group(1) below
			return

		video_page = mobj.group(1)
		request = compat_urllib_request.Request(video_page)
		try:
			webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
		except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
			return
		self.report_extract_vid_page(video_page)


		# Extract video links of all sizes
		pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
		mobj = re.findall(pattern, webpage)
		if len(mobj) == 0:
			self._downloader.trouble(u'ERROR: unable to extract video links')
			# was falling through and crashing on links[-1] below
			return

		# Sort in resolution
		links = sorted(mobj)

		# Choose the lowest of the sort, i.e. highest resolution
		video_url = links[-1]
		# Only get the url. The resolution part in the tuple has no use anymore
		video_url = video_url[-1]
		# Treat escaped \u0026 style hex; the unicode() builtin is gone on
		# Python 3, so round-trip through the unicode_escape codec instead
		video_url = video_url.encode('ascii').decode('unicode_escape')


		return [{
			'id':           video_id,
			'url':          video_url,
			'uploader':     uploader,
			'upload_date':  upload_date,
			'title':        video_title,
			'ext':          video_extension,
		}]