git.bitcoin.ninja Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import datetime
   5 import httplib
   6 import netrc
   7 import os
   8 import re
   9 import socket
  10 import time
  11 import email.utils
  12 import xml.etree.ElementTree
  13 import random
  14 import math
  15 from urlparse import parse_qs
  16
  17 from utils import *
  18
  19
  20 class InfoExtractor(object):
  21         """Information Extractor class.
  22
  23         Information extractors are the classes that, given a URL, extract
  24         information about the video (or videos) the URL refers to. This
  25         information includes the real video URL, the video title, author and
  26         others. The information is stored in a dictionary which is then
  27         passed to the FileDownloader. The FileDownloader processes this
  28         information possibly downloading the video to the file system, among
  29         other possible outcomes.
  30
  31         The dictionaries must include the following fields:
  32
  33         id:             Video identifier.
  34         url:            Final video URL.
  35         uploader:       Nickname of the video uploader, unescaped.
  36         upload_date:    Video upload date (YYYYMMDD).
  37         title:          Video title, unescaped.
  38         ext:            Video filename extension.
  39
  40         The following fields are optional:
  41
  42         format:         The video format, defaults to ext (used for --get-format)
  43         thumbnail:      Full URL to a video thumbnail image.
  44         description:    One-line video description.
  45         player_url:     SWF Player URL (used for rtmpdump).
  46         subtitles:      The .srt file contents.
  47         urlhandle:              [internal] The urlHandle to be used to download the file,
  48                         like returned by urllib.request.urlopen
  49
  50         The fields should all be Unicode strings.
  51
  52         Subclasses of this one should re-define the _real_initialize() and
  53         _real_extract() methods and define a _VALID_URL regexp.
  54         Probably, they should also be added to the list of extractors.
  55
  56         _real_extract() must return a *list* of information dictionaries as
  57         described above.
  58
  59         Finally, the _WORKING attribute should be set to False for broken IEs
  60         in order to warn the users and skip the tests.
  61         """
  62
  63         _ready = False
  64         _downloader = None
  65         _WORKING = True
  66
  67         def __init__(self, downloader=None):
  68                 """Constructor. Receives an optional downloader."""
  69                 self._ready = False
  70                 self.set_downloader(downloader)
  71
  72         def suitable(self, url):
  73                 """Receives a URL and returns True if suitable for this IE."""
  74                 return re.match(self._VALID_URL, url) is not None
  75
  76         def working(self):
  77                 """Getter method for _WORKING."""
  78                 return self._WORKING
  79
  80         def initialize(self):
  81                 """Initializes an instance (authentication, etc)."""
  82                 if not self._ready:
  83                         self._real_initialize()
  84                         self._ready = True
  85
  86         def extract(self, url):
  87                 """Extracts URL information and returns it in list of dicts."""
  88                 self.initialize()
  89                 return self._real_extract(url)
  90
  91         def set_downloader(self, downloader):
  92                 """Sets the downloader for this IE."""
  93                 self._downloader = downloader
  94
  95         def _real_initialize(self):
  96                 """Real initialization process. Redefine in subclasses."""
  97                 pass
  98
  99         def _real_extract(self, url):
 100                 """Real extraction process. Redefine in subclasses."""
 101                 pass
 102
 103
 104 class YoutubeIE(InfoExtractor):
 105         """Information extractor for youtube.com."""
 106
 107         _VALID_URL = r"""^
 108                          (
 109                              (?:https?://)?                                       # http(s):// (optional)
 110                              (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 111                                 tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
 112                              (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 113                              (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
 114                              (?:                                                  # the various things that can precede the ID:
 115                                  (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 116                                  |(?:                                             # or the v= param in all its forms
 117                                      (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 118                                      (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 119                                      (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
 120                                      v=
 121                                  )
 122                              )?                                                   # optional -> youtube.com/xxxx is OK
 123                          )?                                                       # all until now is optional -> you can pass the naked ID
 124                          ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
 125                          (?(1).+)?                                                # if we found the ID, everything can follow
 126                          $"""
 127         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 128         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
 129         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 130         _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 131         _NETRC_MACHINE = 'youtube'
 132         # Listed in order of quality
 133         _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 134         _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 135         _video_extensions = {
 136                 '13': '3gp',
 137                 '17': 'mp4',
 138                 '18': 'mp4',
 139                 '22': 'mp4',
 140                 '37': 'mp4',
 141                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 142                 '43': 'webm',
 143                 '44': 'webm',
 144                 '45': 'webm',
 145                 '46': 'webm',
 146         }
 147         _video_dimensions = {
 148                 '5': '240x400',
 149                 '6': '???',
 150                 '13': '???',
 151                 '17': '144x176',
 152                 '18': '360x640',
 153                 '22': '720x1280',
 154                 '34': '360x640',
 155                 '35': '480x854',
 156                 '37': '1080x1920',
 157                 '38': '3072x4096',
 158                 '43': '360x640',
 159                 '44': '480x854',
 160                 '45': '720x1280',
 161                 '46': '1080x1920',
 162         }
 163         IE_NAME = u'youtube'
 164
 165         def suitable(self, url):
 166                 """Receives a URL and returns True if suitable for this IE."""
 167                 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
 168
 169         def report_lang(self):
 170                 """Report attempt to set language."""
 171                 self._downloader.to_screen(u'[youtube] Setting language')
 172
 173         def report_login(self):
 174                 """Report attempt to log in."""
 175                 self._downloader.to_screen(u'[youtube] Logging in')
 176
 177         def report_age_confirmation(self):
 178                 """Report attempt to confirm age."""
 179                 self._downloader.to_screen(u'[youtube] Confirming age')
 180
 181         def report_video_webpage_download(self, video_id):
 182                 """Report attempt to download video webpage."""
 183                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
 184
 185         def report_video_info_webpage_download(self, video_id):
 186                 """Report attempt to download video info webpage."""
 187                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
 188
 189         def report_video_subtitles_download(self, video_id):
 190                 """Report attempt to download video info webpage."""
 191                 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
 192
 193         def report_information_extraction(self, video_id):
 194                 """Report attempt to extract video information."""
 195                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
 196
 197         def report_unavailable_format(self, video_id, format):
 198                 """Report extracted video URL."""
 199                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
 200
 201         def report_rtmp_download(self):
 202                 """Indicate the download will use the RTMP protocol."""
 203                 self._downloader.to_screen(u'[youtube] RTMP download detected')
 204
 205         def _closed_captions_xml_to_srt(self, xml_string):
 206                 srt = ''
 207                 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
 208                 # TODO parse xml instead of regex
 209                 for n, (start, dur_tag, dur, caption) in enumerate(texts):
 210                         if not dur: dur = '4'
 211                         start = float(start)
 212                         end = start + float(dur)
 213                         start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
 214                         end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
 215                         caption = unescapeHTML(caption)
 216                         caption = unescapeHTML(caption) # double cycle, intentional
 217                         srt += str(n+1) + '\n'
 218                         srt += start + ' --> ' + end + '\n'
 219                         srt += caption + '\n\n'
 220                 return srt
 221
 222         def _print_formats(self, formats):
 223                 print('Available formats:')
 224                 for x in formats:
 225                         print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
 226
 227         def _real_initialize(self):
 228                 if self._downloader is None:
 229                         return
 230
 231                 username = None
 232                 password = None
 233                 downloader_params = self._downloader.params
 234
 235                 # Attempt to use provided username and password or .netrc data
 236                 if downloader_params.get('username', None) is not None:
 237                         username = downloader_params['username']
 238                         password = downloader_params['password']
 239                 elif downloader_params.get('usenetrc', False):
 240                         try:
 241                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 242                                 if info is not None:
 243                                         username = info[0]
 244                                         password = info[2]
 245                                 else:
 246                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 247                         except (IOError, netrc.NetrcParseError) as err:
 248                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
 249                                 return
 250
 251                 # Set language
 252                 request = compat_urllib_request.Request(self._LANG_URL)
 253                 try:
 254                         self.report_lang()
 255                         compat_urllib_request.urlopen(request).read()
 256                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
 257                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
 258                         return
 259
 260                 # No authentication to be performed
 261                 if username is None:
 262                         return
 263
 264                 # Log in
 265                 login_form = {
 266                                 'current_form': 'loginForm',
 267                                 'next':         '/',
 268                                 'action_login': 'Log In',
 269                                 'username':     username,
 270                                 'password':     password,
 271                                 }
 272                 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
 273                 try:
 274                         self.report_login()
 275                         login_results = compat_urllib_request.urlopen(request).read()
 276                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 277                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 278                                 return
 279                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
 280                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
 281                         return
 282
 283                 # Confirm age
 284                 age_form = {
 285                                 'next_url':             '/',
 286                                 'action_confirm':       'Confirm',
 287                                 }
 288                 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
 289                 try:
 290                         self.report_age_confirmation()
 291                         age_results = compat_urllib_request.urlopen(request).read()
 292                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
 293                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
 294                         return
 295
 296         def _real_extract(self, url):
 297                 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 298                 mobj = re.search(self._NEXT_URL_RE, url)
 299                 if mobj:
 300                         url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 301
 302                 # Extract video id from URL
 303                 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
 304                 if mobj is None:
 305                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 306                         return
 307                 video_id = mobj.group(2)
 308
 309                 # Get video webpage
 310                 self.report_video_webpage_download(video_id)
 311                 request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
 312                 try:
 313                         video_webpage = compat_urllib_request.urlopen(request).read()
 314                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
 315                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
 316                         return
 317
 318                 # Attempt to extract SWF player URL
 319                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 320                 if mobj is not None:
 321                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 322                 else:
 323                         player_url = None
 324
 325                 # Get video info
 326                 self.report_video_info_webpage_download(video_id)
 327                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 328                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 329                                         % (video_id, el_type))
 330                         request = compat_urllib_request.Request(video_info_url)
 331                         try:
 332                                 video_info_webpage = compat_urllib_request.urlopen(request).read()
 333                                 video_info = parse_qs(video_info_webpage)
 334                                 if 'token' in video_info:
 335                                         break
 336                         except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
 337                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
 338                                 return
 339                 if 'token' not in video_info:
 340                         if 'reason' in video_info:
 341                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
 342                         else:
 343                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
 344                         return
 345
 346                 # Check for "rental" videos
 347                 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 348                         self._downloader.trouble(u'ERROR: "rental" videos not supported')
 349                         return
 350
 351                 # Start extracting information
 352                 self.report_information_extraction(video_id)
 353
 354                 # uploader
 355                 if 'author' not in video_info:
 356                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 357                         return
 358                 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 359
 360                 # title
 361                 if 'title' not in video_info:
 362                         self._downloader.trouble(u'ERROR: unable to extract video title')
 363                         return
 364                 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
 365                 video_title = video_title.decode('utf-8')
 366
 367                 # thumbnail image
 368                 if 'thumbnail_url' not in video_info:
 369                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
 370                         video_thumbnail = ''
 371                 else:   # don't panic if we can't find it
 372                         video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 373
 374                 # upload date
 375                 upload_date = None
 376                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 377                 if mobj is not None:
 378                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 379                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
 380                         for expression in format_expressions:
 381                                 try:
 382                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
 383                                 except:
 384                                         pass
 385
 386                 # description
 387                 video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
 388                 if video_description: video_description = clean_html(video_description)
 389                 else: video_description = ''
 390
 391                 # closed captions
 392                 video_subtitles = None
 393                 if self._downloader.params.get('writesubtitles', False):
 394                         try:
 395                                 self.report_video_subtitles_download(video_id)
 396                                 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 397                                 try:
 398                                         srt_list = compat_urllib_request.urlopen(request).read()
 399                                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
 400                                         raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
 401                                 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
 402                                 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
 403                                 if not srt_lang_list:
 404                                         raise Trouble(u'WARNING: video has no closed captions')
 405                                 if self._downloader.params.get('subtitleslang', False):
 406                                         srt_lang = self._downloader.params.get('subtitleslang')
 407                                 elif 'en' in srt_lang_list:
 408                                         srt_lang = 'en'
 409                                 else:
 410                                         srt_lang = srt_lang_list.keys()[0]
 411                                 if not srt_lang in srt_lang_list:
 412                                         raise Trouble(u'WARNING: no closed captions found in the specified language')
 413                                 request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
 414                                 try:
 415                                         srt_xml = compat_urllib_request.urlopen(request).read()
 416                                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
 417                                         raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
 418                                 if not srt_xml:
 419                                         raise Trouble(u'WARNING: unable to download video subtitles')
 420                                 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
 421                         except Trouble as trouble:
 422                                 self._downloader.trouble(trouble[0])
 423
 424                 if 'length_seconds' not in video_info:
 425                         self._downloader.trouble(u'WARNING: unable to extract video duration')
 426                         video_duration = ''
 427                 else:
 428                         video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
 429
 430                 # token
 431                 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
 432
 433                 # Decide which formats to download
 434                 req_format = self._downloader.params.get('format', None)
 435
 436                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 437                         self.report_rtmp_download()
 438                         video_url_list = [(None, video_info['conn'][0])]
 439                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 440                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
 441                         url_data = [parse_qs(uds) for uds in url_data_strs]
 442                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
 443                         url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
 444
 445                         format_limit = self._downloader.params.get('format_limit', None)
 446                         available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 447                         if format_limit is not None and format_limit in available_formats:
 448                                 format_list = available_formats[available_formats.index(format_limit):]
 449                         else:
 450                                 format_list = available_formats
 451                         existing_formats = [x for x in format_list if x in url_map]
 452                         if len(existing_formats) == 0:
 453                                 self._downloader.trouble(u'ERROR: no known formats available for video')
 454                                 return
 455                         if self._downloader.params.get('listformats', None):
 456                                 self._print_formats(existing_formats)
 457                                 return
 458                         if req_format is None or req_format == 'best':
 459                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 460                         elif req_format == 'worst':
 461                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 462                         elif req_format in ('-1', 'all'):
 463                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 464                         else:
 465                                 # Specific formats. We pick the first in a slash-delimeted sequence.
 466                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 467                                 req_formats = req_format.split('/')
 468                                 video_url_list = None
 469                                 for rf in req_formats:
 470                                         if rf in url_map:
 471                                                 video_url_list = [(rf, url_map[rf])]
 472                                                 break
 473                                 if video_url_list is None:
 474                                         self._downloader.trouble(u'ERROR: requested format not available')
 475                                         return
 476                 else:
 477                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
 478                         return
 479
 480                 results = []
 481                 for format_param, video_real_url in video_url_list:
 482                         # Extension
 483                         video_extension = self._video_extensions.get(format_param, 'flv')
 484
 485                         video_format = '{} - {}'.format(format_param.decode('utf-8') if format_param else video_extension.decode('utf-8'),
 486                                                             self._video_dimensions.get(format_param, '???'))
 487
 488                         results.append({
 489                                 'id':           video_id.decode('utf-8'),
 490                                 'url':          video_real_url.decode('utf-8'),
 491                                 'uploader':     video_uploader.decode('utf-8'),
 492                                 'upload_date':  upload_date,
 493                                 'title':        video_title,
 494                                 'ext':          video_extension.decode('utf-8'),
 495                                 'format':       video_format,
 496                                 'thumbnail':    video_thumbnail.decode('utf-8'),
 497                                 'description':  video_description,
 498                                 'player_url':   player_url,
 499                                 'subtitles':    video_subtitles,
 500                                 'duration':             video_duration
 501                         })
 502                 return results
 503
 504
 505 class MetacafeIE(InfoExtractor):
 506         """Information Extractor for metacafe.com."""
 507
 508         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 509         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 510         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 511         IE_NAME = u'metacafe'
 512
 513         def __init__(self, downloader=None):
 514                 InfoExtractor.__init__(self, downloader)
 515
 516         def report_disclaimer(self):
 517                 """Report disclaimer retrieval."""
 518                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
 519
 520         def report_age_confirmation(self):
 521                 """Report attempt to confirm age."""
 522                 self._downloader.to_screen(u'[metacafe] Confirming age')
 523
 524         def report_download_webpage(self, video_id):
 525                 """Report webpage download."""
 526                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
 527
 528         def report_extraction(self, video_id):
 529                 """Report information extraction."""
 530                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
 531
 532         def _real_initialize(self):
 533                 # Retrieve disclaimer
 534                 request = compat_urllib_request.Request(self._DISCLAIMER)
 535                 try:
 536                         self.report_disclaimer()
 537                         disclaimer = compat_urllib_request.urlopen(request).read()
 538                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
 539                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
 540                         return
 541
 542                 # Confirm age
 543                 disclaimer_form = {
 544                         'filters': '0',
 545                         'submit': "Continue - I'm over 18",
 546                         }
 547                 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
 548                 try:
 549                         self.report_age_confirmation()
 550                         disclaimer = compat_urllib_request.urlopen(request).read()
 551                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
 552                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
 553                         return
 554
 555         def _real_extract(self, url):
 556                 # Extract id and simplified title from URL
 557                 mobj = re.match(self._VALID_URL, url)
 558                 if mobj is None:
 559                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 560                         return
 561
 562                 video_id = mobj.group(1)
 563
 564                 # Check if video comes from YouTube
 565                 mobj2 = re.match(r'^yt-(.*)$', video_id)
 566                 if mobj2 is not None:
 567                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
 568                         return
 569
 570                 # Retrieve video webpage to extract further information
 571                 request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
 572                 try:
 573                         self.report_download_webpage(video_id)
 574                         webpage = compat_urllib_request.urlopen(request).read()
 575                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
 576                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
 577                         return
 578
 579                 # Extract URL, uploader and title from webpage
 580                 self.report_extraction(video_id)
 581                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 582                 if mobj is not None:
 583                         mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 584                         video_extension = mediaURL[-3:]
 585
 586                         # Extract gdaKey if available
 587                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 588                         if mobj is None:
 589                                 video_url = mediaURL
 590                         else:
 591                                 gdaKey = mobj.group(1)
 592                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 593                 else:
 594                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 595                         if mobj is None:
 596                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 597                                 return
 598                         vardict = parse_qs(mobj.group(1))
 599                         if 'mediaData' not in vardict:
 600                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 601                                 return
 602                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
 603                         if mobj is None:
 604                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 605                                 return
 606                         mediaURL = mobj.group(1).replace('\\/', '/')
 607                         video_extension = mediaURL[-3:]
 608                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
 609
 610                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 611                 if mobj is None:
 612                         self._downloader.trouble(u'ERROR: unable to extract title')
 613                         return
 614                 video_title = mobj.group(1).decode('utf-8')
 615
 616                 mobj = re.search(r'submitter=(.*?);', webpage)
 617                 if mobj is None:
 618                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 619                         return
 620                 video_uploader = mobj.group(1)
 621
 622                 return [{
 623                         'id':           video_id.decode('utf-8'),
 624                         'url':          video_url.decode('utf-8'),
 625                         'uploader':     video_uploader.decode('utf-8'),
 626                         'upload_date':  None,
 627                         'title':        video_title,
 628                         'ext':          video_extension.decode('utf-8'),
 629                 }]
 630
 631
 632 class DailymotionIE(InfoExtractor):
 633         """Information Extractor for Dailymotion"""
 634
 635         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 636         IE_NAME = u'dailymotion'
 637
 638         def __init__(self, downloader=None):
 639                 InfoExtractor.__init__(self, downloader)
 640
 641         def report_download_webpage(self, video_id):
 642                 """Report webpage download."""
 643                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
 644
 645         def report_extraction(self, video_id):
 646                 """Report information extraction."""
 647                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
 648
 649         def _real_extract(self, url):
 650                 # Extract id and simplified title from URL
 651                 mobj = re.match(self._VALID_URL, url)
 652                 if mobj is None:
 653                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 654                         return
 655
 656                 video_id = mobj.group(1).split('_')[0].split('?')[0]
 657
 658                 video_extension = 'mp4'
 659
 660                 # Retrieve video webpage to extract further information
 661                 request = compat_urllib_request.Request(url)
 662                 request.add_header('Cookie', 'family_filter=off')
 663                 try:
 664                         self.report_download_webpage(video_id)
 665                         webpage = compat_urllib_request.urlopen(request).read()
 666                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
 667                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
 668                         return
 669
 670                 # Extract URL, uploader and title from webpage
 671                 self.report_extraction(video_id)
 672                 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 673                 if mobj is None:
 674                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 675                         return
 676                 flashvars = compat_urllib_parse.unquote(mobj.group(1))
 677
 678                 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
 679                         if key in flashvars:
 680                                 max_quality = key
 681                                 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
 682                                 break
 683                 else:
 684                         self._downloader.trouble(u'ERROR: unable to extract video URL')
 685                         return
 686
 687                 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 688                 if mobj is None:
 689                         self._downloader.trouble(u'ERROR: unable to extract video URL')
 690                         return
 691
 692                 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
 693
 694                 # TODO: support choosing qualities
 695
 696                 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 697                 if mobj is None:
 698                         self._downloader.trouble(u'ERROR: unable to extract title')
 699                         return
 700                 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
 701
 702                 video_uploader = None
 703                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
 704                 if mobj is None:
 705                         # lookin for official user
 706                         mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
 707                         if mobj_official is None:
 708                                 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
 709                         else:
 710                                 video_uploader = mobj_official.group(1)
 711                 else:
 712                         video_uploader = mobj.group(1)
 713
 714                 video_upload_date = None
 715                 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 716                 if mobj is not None:
 717                         video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 718
 719                 return [{
 720                         'id':           video_id.decode('utf-8'),
 721                         'url':          video_url.decode('utf-8'),
 722                         'uploader':     video_uploader.decode('utf-8'),
 723                         'upload_date':  video_upload_date,
 724                         'title':        video_title,
 725                         'ext':          video_extension.decode('utf-8'),
 726                 }]
 727
 728
 729 class GoogleIE(InfoExtractor):
 730         """Information extractor for video.google.com."""
 731
 732         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
 733         IE_NAME = u'video.google'
 734
 735         def __init__(self, downloader=None):
 736                 InfoExtractor.__init__(self, downloader)
 737
 738         def report_download_webpage(self, video_id):
 739                 """Report webpage download."""
 740                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
 741
 742         def report_extraction(self, video_id):
 743                 """Report information extraction."""
 744                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
 745
 746         def _real_extract(self, url):
 747                 # Extract id from URL
 748                 mobj = re.match(self._VALID_URL, url)
 749                 if mobj is None:
 750                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 751                         return
 752
 753                 video_id = mobj.group(1)
 754
 755                 video_extension = 'mp4'
 756
 757                 # Retrieve video webpage to extract further information
 758                 request = compat_urllib_request.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
 759                 try:
 760                         self.report_download_webpage(video_id)
 761                         webpage = compat_urllib_request.urlopen(request).read()
 762                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
 763                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 764                         return
 765
 766                 # Extract URL, uploader, and title from webpage
 767                 self.report_extraction(video_id)
 768                 mobj = re.search(r"download_url:'([^']+)'", webpage)
 769                 if mobj is None:
 770                         video_extension = 'flv'
 771                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
 772                 if mobj is None:
 773                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 774                         return
 775                 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 776                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
 777                 mediaURL = mediaURL.replace('\\x26', '\x26')
 778
 779                 video_url = mediaURL
 780
 781                 mobj = re.search(r'<title>(.*)</title>', webpage)
 782                 if mobj is None:
 783                         self._downloader.trouble(u'ERROR: unable to extract title')
 784                         return
 785                 video_title = mobj.group(1).decode('utf-8')
 786
 787                 # Extract video description
 788                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
 789                 if mobj is None:
 790                         self._downloader.trouble(u'ERROR: unable to extract video description')
 791                         return
 792                 video_description = mobj.group(1).decode('utf-8')
 793                 if not video_description:
 794                         video_description = 'No description available.'
 795
 796                 # Extract video thumbnail
 797                 if self._downloader.params.get('forcethumbnail', False):
 798                         request = compat_urllib_request.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
 799                         try:
 800                                 webpage = compat_urllib_request.urlopen(request).read()
 801                         except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
 802                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 803                                 return
 804                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
 805                         if mobj is None:
 806                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 807                                 return
 808                         video_thumbnail = mobj.group(1)
 809                 else:   # we need something to pass to process_info
 810                         video_thumbnail = ''
 811
 812                 return [{
 813                         'id':           video_id.decode('utf-8'),
 814                         'url':          video_url.decode('utf-8'),
 815                         'uploader':     None,
 816                         'upload_date':  None,
 817                         'title':        video_title,
 818                         'ext':          video_extension.decode('utf-8'),
 819                 }]
 820
 821
 822 class PhotobucketIE(InfoExtractor):
 823         """Information extractor for photobucket.com."""
 824
 825         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 826         IE_NAME = u'photobucket'
 827
 828         def __init__(self, downloader=None):
 829                 InfoExtractor.__init__(self, downloader)
 830
 831         def report_download_webpage(self, video_id):
 832                 """Report webpage download."""
 833                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
 834
 835         def report_extraction(self, video_id):
 836                 """Report information extraction."""
 837                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
 838
 839         def _real_extract(self, url):
 840                 # Extract id from URL
 841                 mobj = re.match(self._VALID_URL, url)
 842                 if mobj is None:
 843                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 844                         return
 845
 846                 video_id = mobj.group(1)
 847
 848                 video_extension = 'flv'
 849
 850                 # Retrieve video webpage to extract further information
 851                 request = compat_urllib_request.Request(url)
 852                 try:
 853                         self.report_download_webpage(video_id)
 854                         webpage = compat_urllib_request.urlopen(request).read()
 855                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
 856                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 857                         return
 858
 859                 # Extract URL, uploader, and title from webpage
 860                 self.report_extraction(video_id)
 861                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
 862                 if mobj is None:
 863                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 864                         return
 865                 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 866
 867                 video_url = mediaURL
 868
 869                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
 870                 if mobj is None:
 871                         self._downloader.trouble(u'ERROR: unable to extract title')
 872                         return
 873                 video_title = mobj.group(1).decode('utf-8')
 874
 875                 video_uploader = mobj.group(2).decode('utf-8')
 876
 877                 return [{
 878                         'id':           video_id.decode('utf-8'),
 879                         'url':          video_url.decode('utf-8'),
 880                         'uploader':     video_uploader,
 881                         'upload_date':  None,
 882                         'title':        video_title,
 883                         'ext':          video_extension.decode('utf-8'),
 884                 }]
 885
 886
 887 class YahooIE(InfoExtractor):
 888         """Information extractor for video.yahoo.com."""
 889
 890         # _VALID_URL matches all Yahoo! Video URLs
 891         # _VPAGE_URL matches only the extractable '/watch/' URLs
 892         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
 893         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
 894         IE_NAME = u'video.yahoo'
 895
 896         def __init__(self, downloader=None):
 897                 InfoExtractor.__init__(self, downloader)
 898
 899         def report_download_webpage(self, video_id):
 900                 """Report webpage download."""
 901                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
 902
 903         def report_extraction(self, video_id):
 904                 """Report information extraction."""
 905                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
 906
 907         def _real_extract(self, url, new_video=True):
 908                 # Extract ID from URL
 909                 mobj = re.match(self._VALID_URL, url)
 910                 if mobj is None:
 911                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 912                         return
 913
 914                 video_id = mobj.group(2)
 915                 video_extension = 'flv'
 916
 917                 # Rewrite valid but non-extractable URLs as
 918                 # extractable English language /watch/ URLs
 919                 if re.match(self._VPAGE_URL, url) is None:
 920                         request = compat_urllib_request.Request(url)
 921                         try:
 922                                 webpage = compat_urllib_request.urlopen(request).read()
 923                         except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
 924                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 925                                 return
 926
 927                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
 928                         if mobj is None:
 929                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
 930                                 return
 931                         yahoo_id = mobj.group(1)
 932
 933                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
 934                         if mobj is None:
 935                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
 936                                 return
 937                         yahoo_vid = mobj.group(1)
 938
 939                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
 940                         return self._real_extract(url, new_video=False)
 941
 942                 # Retrieve video webpage to extract further information
 943                 request = compat_urllib_request.Request(url)
 944                 try:
 945                         self.report_download_webpage(video_id)
 946                         webpage = compat_urllib_request.urlopen(request).read()
 947                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
 948                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 949                         return
 950
 951                 # Extract uploader and title from webpage
 952                 self.report_extraction(video_id)
 953                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
 954                 if mobj is None:
 955                         self._downloader.trouble(u'ERROR: unable to extract video title')
 956                         return
 957                 video_title = mobj.group(1).decode('utf-8')
 958
 959                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
 960                 if mobj is None:
 961                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
 962                         return
 963                 video_uploader = mobj.group(1).decode('utf-8')
 964
 965                 # Extract video thumbnail
 966                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
 967                 if mobj is None:
 968                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 969                         return
 970                 video_thumbnail = mobj.group(1).decode('utf-8')
 971
 972                 # Extract video description
 973                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
 974                 if mobj is None:
 975                         self._downloader.trouble(u'ERROR: unable to extract video description')
 976                         return
 977                 video_description = mobj.group(1).decode('utf-8')
 978                 if not video_description:
 979                         video_description = 'No description available.'
 980
 981                 # Extract video height and width
 982                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
 983                 if mobj is None:
 984                         self._downloader.trouble(u'ERROR: unable to extract video height')
 985                         return
 986                 yv_video_height = mobj.group(1)
 987
 988                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
 989                 if mobj is None:
 990                         self._downloader.trouble(u'ERROR: unable to extract video width')
 991                         return
 992                 yv_video_width = mobj.group(1)
 993
 994                 # Retrieve video playlist to extract media URL
 995                 # I'm not completely sure what all these options are, but we
 996                 # seem to need most of them, otherwise the server sends a 401.
 997                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
 998                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
 999                 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1000                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1001                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1002                 try:
1003                         self.report_download_webpage(video_id)
1004                         webpage = compat_urllib_request.urlopen(request).read()
1005                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
1006                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1007                         return
1008
1009                 # Extract media URL from playlist XML
1010                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1011                 if mobj is None:
1012                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1013                         return
1014                 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1015                 video_url = unescapeHTML(video_url)
1016
1017                 return [{
1018                         'id':           video_id.decode('utf-8'),
1019                         'url':          video_url,
1020                         'uploader':     video_uploader,
1021                         'upload_date':  None,
1022                         'title':        video_title,
1023                         'ext':          video_extension.decode('utf-8'),
1024                         'thumbnail':    video_thumbnail.decode('utf-8'),
1025                         'description':  video_description,
1026                 }]
1027
1028
1029 class VimeoIE(InfoExtractor):
1030         """Information extractor for vimeo.com."""
1031
1032         # _VALID_URL matches Vimeo URLs
1033         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
1034         IE_NAME = u'vimeo'
1035
1036         def __init__(self, downloader=None):
1037                 InfoExtractor.__init__(self, downloader)
1038
1039         def report_download_webpage(self, video_id):
1040                 """Report webpage download."""
1041                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1042
1043         def report_extraction(self, video_id):
1044                 """Report information extraction."""
1045                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1046
1047         def _real_extract(self, url, new_video=True):
1048                 # Extract ID from URL
1049                 mobj = re.match(self._VALID_URL, url)
1050                 if mobj is None:
1051                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1052                         return
1053
1054                 video_id = mobj.group(1)
1055
1056                 # Retrieve video webpage to extract further information
1057                 request = compat_urllib_request.Request(url, None, std_headers)
1058                 try:
1059                         self.report_download_webpage(video_id)
1060                         webpage = compat_urllib_request.urlopen(request).read()
1061                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
1062                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1063                         return
1064
1065                 # Now we begin extracting as much information as we can from what we
1066                 # retrieved. First we extract the information common to all extractors,
1067                 # and latter we extract those that are Vimeo specific.
1068                 self.report_extraction(video_id)
1069
1070                 # Extract the config JSON
1071                 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1072                 try:
1073                         config = json.loads(config)
1074                 except:
1075                         self._downloader.trouble(u'ERROR: unable to extract info section')
1076                         return
1077
1078                 # Extract title
1079                 video_title = config["video"]["title"]
1080
1081                 # Extract uploader
1082                 video_uploader = config["video"]["owner"]["name"]
1083
1084                 # Extract video thumbnail
1085                 video_thumbnail = config["video"]["thumbnail"]
1086
1087                 # Extract video description
1088                 video_description = get_element_by_id("description", webpage.decode('utf8'))
1089                 if video_description: video_description = clean_html(video_description)
1090                 else: video_description = ''
1091
1092                 # Extract upload date
1093                 video_upload_date = None
1094                 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1095                 if mobj is not None:
1096                         video_upload_date = mobj.group(1)
1097
1098                 # Vimeo specific: extract request signature and timestamp
1099                 sig = config['request']['signature']
1100                 timestamp = config['request']['timestamp']
1101
1102                 # Vimeo specific: extract video codec and quality information
1103                 # First consider quality, then codecs, then take everything
1104                 # TODO bind to format param
1105                 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1106                 files = { 'hd': [], 'sd': [], 'other': []}
1107                 for codec_name, codec_extension in codecs:
1108                         if codec_name in config["video"]["files"]:
1109                                 if 'hd' in config["video"]["files"][codec_name]:
1110                                         files['hd'].append((codec_name, codec_extension, 'hd'))
1111                                 elif 'sd' in config["video"]["files"][codec_name]:
1112                                         files['sd'].append((codec_name, codec_extension, 'sd'))
1113                                 else:
1114                                         files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1115
1116                 for quality in ('hd', 'sd', 'other'):
1117                         if len(files[quality]) > 0:
1118                                 video_quality = files[quality][0][2]
1119                                 video_codec = files[quality][0][0]
1120                                 video_extension = files[quality][0][1]
1121                                 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1122                                 break
1123                 else:
1124                         self._downloader.trouble(u'ERROR: no known codec found')
1125                         return
1126
1127                 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1128                                         %(video_id, sig, timestamp, video_quality, video_codec.upper())
1129
1130                 return [{
1131                         'id':           video_id,
1132                         'url':          video_url,
1133                         'uploader':     video_uploader,
1134                         'upload_date':  video_upload_date,
1135                         'title':        video_title,
1136                         'ext':          video_extension,
1137                         'thumbnail':    video_thumbnail,
1138                         'description':  video_description,
1139                 }]
1140
1141
1142 class ArteTvIE(InfoExtractor):
1143         """arte.tv information extractor."""
1144
1145         _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1146         _LIVE_URL = r'index-[0-9]+\.html$'
1147
1148         IE_NAME = u'arte.tv'
1149
1150         def __init__(self, downloader=None):
1151                 InfoExtractor.__init__(self, downloader)
1152
1153         def report_download_webpage(self, video_id):
1154                 """Report webpage download."""
1155                 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1156
1157         def report_extraction(self, video_id):
1158                 """Report information extraction."""
1159                 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
1160
1161         def fetch_webpage(self, url):
1162                 self._downloader.increment_downloads()
1163                 request = compat_urllib_request.Request(url)
1164                 try:
1165                         self.report_download_webpage(url)
1166                         webpage = compat_urllib_request.urlopen(request).read()
1167                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
1168                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1169                         return
1170                 except ValueError as err:
1171                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1172                         return
1173                 return webpage
1174
1175         def grep_webpage(self, url, regex, regexFlags, matchTuples):
1176                 page = self.fetch_webpage(url)
1177                 mobj = re.search(regex, page, regexFlags)
1178                 info = {}
1179
1180                 if mobj is None:
1181                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1182                         return
1183
1184                 for (i, key, err) in matchTuples:
1185                         if mobj.group(i) is None:
1186                                 self._downloader.trouble(err)
1187                                 return
1188                         else:
1189                                 info[key] = mobj.group(i)
1190
1191                 return info
1192
1193         def extractLiveStream(self, url):
1194                 video_lang = url.split('/')[-4]
1195                 info = self.grep_webpage(
1196                         url,
1197                         r'src="(.*?/videothek_js.*?\.js)',
1198                         0,
1199                         [
1200                                 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1201                         ]
1202                 )
1203                 http_host = url.split('/')[2]
1204                 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1205                 info = self.grep_webpage(
1206                         next_url,
1207                         r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1208                                 '(http://.*?\.swf).*?' +
1209                                 '(rtmp://.*?)\'',
1210                         re.DOTALL,
1211                         [
1212                                 (1, 'path',   u'ERROR: could not extract video path: %s' % url),
1213                                 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1214                                 (3, 'url',    u'ERROR: could not extract video url: %s' % url)
1215                         ]
1216                 )
1217                 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1218
1219         def extractPlus7Stream(self, url):
1220                 video_lang = url.split('/')[-3]
1221                 info = self.grep_webpage(
1222                         url,
1223                         r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1224                         0,
1225                         [
1226                                 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1227                         ]
1228                 )
1229                 next_url = compat_urllib_parse.unquote(info.get('url'))
1230                 info = self.grep_webpage(
1231                         next_url,
1232                         r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1233                         0,
1234                         [
1235                                 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1236                         ]
1237                 )
1238                 next_url = compat_urllib_parse.unquote(info.get('url'))
1239
1240                 info = self.grep_webpage(
1241                         next_url,
1242                         r'<video id="(.*?)".*?>.*?' +
1243                                 '<name>(.*?)</name>.*?' +
1244                                 '<dateVideo>(.*?)</dateVideo>.*?' +
1245                                 '<url quality="hd">(.*?)</url>',
1246                         re.DOTALL,
1247                         [
1248                                 (1, 'id',    u'ERROR: could not extract video id: %s' % url),
1249                                 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1250                                 (3, 'date',  u'ERROR: could not extract video date: %s' % url),
1251                                 (4, 'url',   u'ERROR: could not extract video url: %s' % url)
1252                         ]
1253                 )
1254
1255                 return {
1256                         'id':           info.get('id'),
1257                         'url':          compat_urllib_parse.unquote(info.get('url')),
1258                         'uploader':     u'arte.tv',
1259                         'upload_date':  info.get('date'),
1260                         'title':        info.get('title'),
1261                         'ext':          u'mp4',
1262                         'format':       u'NA',
1263                         'player_url':   None,
1264                 }
1265
1266         def _real_extract(self, url):
1267                 video_id = url.split('/')[-1]
1268                 self.report_extraction(video_id)
1269
1270                 if re.search(self._LIVE_URL, video_id) is not None:
1271                         self.extractLiveStream(url)
1272                         return
1273                 else:
1274                         info = self.extractPlus7Stream(url)
1275
1276                 return [info]
1277
1278
1279 class GenericIE(InfoExtractor):
1280         """Generic last-resort information extractor."""
1281
1282         _VALID_URL = r'.*'
1283         IE_NAME = u'generic'
1284
1285         def __init__(self, downloader=None):
1286                 InfoExtractor.__init__(self, downloader)
1287
1288         def report_download_webpage(self, video_id):
1289                 """Report webpage download."""
1290                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1291                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1292
1293         def report_extraction(self, video_id):
1294                 """Report information extraction."""
1295                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1296
1297         def report_following_redirect(self, new_url):
1298                 """Report information extraction."""
1299                 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1300
1301         def _test_redirect(self, url):
1302                 """Check if it is a redirect, like url shorteners, in case restart chain."""
1303                 class HeadRequest(compat_urllib_request.Request):
1304                         def get_method(self):
1305                                 return "HEAD"
1306
1307                 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1308                         """
1309                         Subclass the HTTPRedirectHandler to make it use our
1310                         HeadRequest also on the redirected URL
1311                         """
1312                         def redirect_request(self, req, fp, code, msg, headers, newurl):
1313                                 if code in (301, 302, 303, 307):
1314                                         newurl = newurl.replace(' ', '%20')
1315                                         newheaders = dict((k,v) for k,v in req.headers.items()
1316                                                                           if k.lower() not in ("content-length", "content-type"))
1317                                         return HeadRequest(newurl,
1318                                                                            headers=newheaders,
1319                                                                            origin_req_host=req.get_origin_req_host(),
1320                                                                            unverifiable=True)
1321                                 else:
1322                                         raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1323
1324                 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1325                         """
1326                         Fallback to GET if HEAD is not allowed (405 HTTP error)
1327                         """
1328                         def http_error_405(self, req, fp, code, msg, headers):
1329                                 fp.read()
1330                                 fp.close()
1331
1332                                 newheaders = dict((k,v) for k,v in req.headers.items()
1333                                                                   if k.lower() not in ("content-length", "content-type"))
1334                                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1335                                                                                                  headers=newheaders,
1336                                                                                                  origin_req_host=req.get_origin_req_host(),
1337                                                                                                  unverifiable=True))
1338
1339                 # Build our opener
1340                 opener = compat_urllib_request.OpenerDirector()
1341                 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1342                                                 HTTPMethodFallback, HEADRedirectHandler,
1343                                                 compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1344                         opener.add_handler(handler())
1345
1346                 response = opener.open(HeadRequest(url))
1347                 new_url = response.geturl()
1348
1349                 if url == new_url:
1350                         return False
1351
1352                 self.report_following_redirect(new_url)
1353                 self._downloader.download([new_url])
1354                 return True
1355
1356         def _real_extract(self, url):
1357                 if self._test_redirect(url): return
1358
1359                 video_id = url.split('/')[-1]
1360                 request = compat_urllib_request.Request(url)
1361                 try:
1362                         self.report_download_webpage(video_id)
1363                         webpage = compat_urllib_request.urlopen(request).read()
1364                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
1365                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1366                         return
1367                 except ValueError as err:
1368                         # since this is the last-resort InfoExtractor, if
1369                         # this error is thrown, it'll be thrown here
1370                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1371                         return
1372
1373                 self.report_extraction(video_id)
1374                 # Start with something easy: JW Player in SWFObject
1375                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1376                 if mobj is None:
1377                         # Broaden the search a little bit
1378                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1379                 if mobj is None:
1380                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1381                         return
1382
1383                 # It's possible that one of the regexes
1384                 # matched, but returned an empty group:
1385                 if mobj.group(1) is None:
1386                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1387                         return
1388
1389                 video_url = compat_urllib_parse.unquote(mobj.group(1))
1390                 video_id = os.path.basename(video_url)
1391
1392                 # here's a fun little line of code for you:
1393                 video_extension = os.path.splitext(video_id)[1][1:]
1394                 video_id = os.path.splitext(video_id)[0]
1395
1396                 # it's tempting to parse this further, but you would
1397                 # have to take into account all the variations like
1398                 #   Video Title - Site Name
1399                 #   Site Name | Video Title
1400                 #   Video Title - Tagline | Site Name
1401                 # and so on and so forth; it's just not practical
1402                 mobj = re.search(r'<title>(.*)</title>', webpage)
1403                 if mobj is None:
1404                         self._downloader.trouble(u'ERROR: unable to extract title')
1405                         return
1406                 video_title = mobj.group(1).decode('utf-8')
1407
1408                 # video uploader is domain name
1409                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1410                 if mobj is None:
1411                         self._downloader.trouble(u'ERROR: unable to extract title')
1412                         return
1413                 video_uploader = mobj.group(1).decode('utf-8')
1414
1415                 return [{
1416                         'id':           video_id.decode('utf-8'),
1417                         'url':          video_url.decode('utf-8'),
1418                         'uploader':     video_uploader,
1419                         'upload_date':  None,
1420                         'title':        video_title,
1421                         'ext':          video_extension.decode('utf-8'),
1422                 }]
1423
1424
1425 class YoutubeSearchIE(InfoExtractor):
1426         """Information Extractor for YouTube search queries."""
1427         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1428         _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1429         _max_youtube_results = 1000
1430         IE_NAME = u'youtube:search'
1431
1432         def __init__(self, downloader=None):
1433                 InfoExtractor.__init__(self, downloader)
1434
1435         def report_download_page(self, query, pagenum):
1436                 """Report attempt to download search page with given number."""
1437                 query = query.decode(preferredencoding())
1438                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1439
1440         def _real_extract(self, query):
1441                 mobj = re.match(self._VALID_URL, query)
1442                 if mobj is None:
1443                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1444                         return
1445
1446                 prefix, query = query.split(':')
1447                 prefix = prefix[8:]
1448                 query = query.encode('utf-8')
1449                 if prefix == '':
1450                         self._download_n_results(query, 1)
1451                         return
1452                 elif prefix == 'all':
1453                         self._download_n_results(query, self._max_youtube_results)
1454                         return
1455                 else:
1456                         try:
1457                                 n = int(prefix)
1458                                 if n <= 0:
1459                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1460                                         return
1461                                 elif n > self._max_youtube_results:
1462                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1463                                         n = self._max_youtube_results
1464                                 self._download_n_results(query, n)
1465                                 return
1466                         except ValueError: # parsing prefix as integer fails
1467                                 self._download_n_results(query, 1)
1468                                 return
1469
1470         def _download_n_results(self, query, n):
1471                 """Downloads a specified number of results for a query"""
1472
1473                 video_ids = []
1474                 pagenum = 0
1475                 limit = n
1476
1477                 while (50 * pagenum) < limit:
1478                         self.report_download_page(query, pagenum+1)
1479                         result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1480                         request = compat_urllib_request.Request(result_url)
1481                         try:
1482                                 data = compat_urllib_request.urlopen(request).read()
1483                         except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
1484                                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1485                                 return
1486                         api_response = json.loads(data)['data']
1487
1488                         new_ids = list(video['id'] for video in api_response['items'])
1489                         video_ids += new_ids
1490
1491                         limit = min(n, api_response['totalItems'])
1492                         pagenum += 1
1493
1494                 if len(video_ids) > n:
1495                         video_ids = video_ids[:n]
1496                 for id in video_ids:
1497                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1498                 return
1499
1500
1501 class GoogleSearchIE(InfoExtractor):
1502         """Information Extractor for Google Video search queries."""
1503         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1504         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1505         _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1506         _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1507         _max_google_results = 1000
1508         IE_NAME = u'video.google:search'
1509
1510         def __init__(self, downloader=None):
1511                 InfoExtractor.__init__(self, downloader)
1512
1513         def report_download_page(self, query, pagenum):
1514                 """Report attempt to download playlist page with given number."""
1515                 query = query.decode(preferredencoding())
1516                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1517
1518         def _real_extract(self, query):
1519                 mobj = re.match(self._VALID_URL, query)
1520                 if mobj is None:
1521                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1522                         return
1523
1524                 prefix, query = query.split(':')
1525                 prefix = prefix[8:]
1526                 query = query.encode('utf-8')
1527                 if prefix == '':
1528                         self._download_n_results(query, 1)
1529                         return
1530                 elif prefix == 'all':
1531                         self._download_n_results(query, self._max_google_results)
1532                         return
1533                 else:
1534                         try:
1535                                 n = int(prefix)
1536                                 if n <= 0:
1537                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1538                                         return
1539                                 elif n > self._max_google_results:
1540                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1541                                         n = self._max_google_results
1542                                 self._download_n_results(query, n)
1543                                 return
1544                         except ValueError: # parsing prefix as integer fails
1545                                 self._download_n_results(query, 1)
1546                                 return
1547
1548         def _download_n_results(self, query, n):
1549                 """Downloads a specified number of results for a query"""
1550
1551                 video_ids = []
1552                 pagenum = 0
1553
1554                 while True:
1555                         self.report_download_page(query, pagenum)
1556                         result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1557                         request = compat_urllib_request.Request(result_url)
1558                         try:
1559                                 page = compat_urllib_request.urlopen(request).read()
1560                         except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
1561                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1562                                 return
1563
1564                         # Extract video identifiers
1565                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1566                                 video_id = mobj.group(1)
1567                                 if video_id not in video_ids:
1568                                         video_ids.append(video_id)
1569                                         if len(video_ids) == n:
1570                                                 # Specified n videos reached
1571                                                 for id in video_ids:
1572                                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1573                                                 return
1574
1575                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1576                                 for id in video_ids:
1577                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1578                                 return
1579
1580                         pagenum = pagenum + 1
1581
1582
1583 class YahooSearchIE(InfoExtractor):
1584         """Information Extractor for Yahoo! Video search queries."""
1585         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1586         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1587         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1588         _MORE_PAGES_INDICATOR = r'\s*Next'
1589         _max_yahoo_results = 1000
1590         IE_NAME = u'video.yahoo:search'
1591
1592         def __init__(self, downloader=None):
1593                 InfoExtractor.__init__(self, downloader)
1594
1595         def report_download_page(self, query, pagenum):
1596                 """Report attempt to download playlist page with given number."""
1597                 query = query.decode(preferredencoding())
1598                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1599
1600         def _real_extract(self, query):
1601                 mobj = re.match(self._VALID_URL, query)
1602                 if mobj is None:
1603                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1604                         return
1605
1606                 prefix, query = query.split(':')
1607                 prefix = prefix[8:]
1608                 query = query.encode('utf-8')
1609                 if prefix == '':
1610                         self._download_n_results(query, 1)
1611                         return
1612                 elif prefix == 'all':
1613                         self._download_n_results(query, self._max_yahoo_results)
1614                         return
1615                 else:
1616                         try:
1617                                 n = int(prefix)
1618                                 if n <= 0:
1619                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1620                                         return
1621                                 elif n > self._max_yahoo_results:
1622                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1623                                         n = self._max_yahoo_results
1624                                 self._download_n_results(query, n)
1625                                 return
1626                         except ValueError: # parsing prefix as integer fails
1627                                 self._download_n_results(query, 1)
1628                                 return
1629
1630         def _download_n_results(self, query, n):
1631                 """Downloads a specified number of results for a query"""
1632
1633                 video_ids = []
1634                 already_seen = set()
1635                 pagenum = 1
1636
1637                 while True:
1638                         self.report_download_page(query, pagenum)
1639                         result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1640                         request = compat_urllib_request.Request(result_url)
1641                         try:
1642                                 page = compat_urllib_request.urlopen(request).read()
1643                         except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
1644                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1645                                 return
1646
1647                         # Extract video identifiers
1648                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1649                                 video_id = mobj.group(1)
1650                                 if video_id not in already_seen:
1651                                         video_ids.append(video_id)
1652                                         already_seen.add(video_id)
1653                                         if len(video_ids) == n:
1654                                                 # Specified n videos reached
1655                                                 for id in video_ids:
1656                                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1657                                                 return
1658
1659                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1660                                 for id in video_ids:
1661                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1662                                 return
1663
1664                         pagenum = pagenum + 1
1665
1666
1667 class YoutubePlaylistIE(InfoExtractor):
1668         """Information Extractor for YouTube playlists."""
1669
1670         _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1671         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1672         _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
1673         _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1674         IE_NAME = u'youtube:playlist'
1675
1676         def __init__(self, downloader=None):
1677                 InfoExtractor.__init__(self, downloader)
1678
1679         def report_download_page(self, playlist_id, pagenum):
1680                 """Report attempt to download playlist page with given number."""
1681                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1682
1683         def _real_extract(self, url):
1684                 # Extract playlist id
1685                 mobj = re.match(self._VALID_URL, url)
1686                 if mobj is None:
1687                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1688                         return
1689
1690                 # Single video case
1691                 if mobj.group(3) is not None:
1692                         self._downloader.download([mobj.group(3)])
1693                         return
1694
1695                 # Download playlist pages
1696                 # prefix is 'p' as default for playlists but there are other types that need extra care
1697                 playlist_prefix = mobj.group(1)
1698                 if playlist_prefix == 'a':
1699                         playlist_access = 'artist'
1700                 else:
1701                         playlist_prefix = 'p'
1702                         playlist_access = 'view_play_list'
1703                 playlist_id = mobj.group(2)
1704                 video_ids = []
1705                 pagenum = 1
1706
1707                 while True:
1708                         self.report_download_page(playlist_id, pagenum)
1709                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1710                         request = compat_urllib_request.Request(url)
1711                         try:
1712                                 page = compat_urllib_request.urlopen(request).read()
1713                         except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
1714                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1715                                 return
1716
1717                         # Extract video identifiers
1718                         ids_in_page = []
1719                         for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1720                                 if mobj.group(1) not in ids_in_page:
1721                                         ids_in_page.append(mobj.group(1))
1722                         video_ids.extend(ids_in_page)
1723
1724                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1725                                 break
1726                         pagenum = pagenum + 1
1727
1728                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1729                 playlistend = self._downloader.params.get('playlistend', -1)
1730                 if playlistend == -1:
1731                         video_ids = video_ids[playliststart:]
1732                 else:
1733                         video_ids = video_ids[playliststart:playlistend]
1734
1735                 for id in video_ids:
1736                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1737                 return
1738
1739
1740 class YoutubeChannelIE(InfoExtractor):
1741         """Information Extractor for YouTube channels."""
1742
1743         _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1744         _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1745         _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
1746         IE_NAME = u'youtube:channel'
1747
1748         def report_download_page(self, channel_id, pagenum):
1749                 """Report attempt to download channel page with given number."""
1750                 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1751
1752         def _real_extract(self, url):
1753                 # Extract channel id
1754                 mobj = re.match(self._VALID_URL, url)
1755                 if mobj is None:
1756                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1757                         return
1758
1759                 # Download channel pages
1760                 channel_id = mobj.group(1)
1761                 video_ids = []
1762                 pagenum = 1
1763
1764                 while True:
1765                         self.report_download_page(channel_id, pagenum)
1766                         url = self._TEMPLATE_URL % (channel_id, pagenum)
1767                         request = compat_urllib_request.Request(url)
1768                         try:
1769                                 page = compat_urllib_request.urlopen(request).read()
1770                         except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
1771                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1772                                 return
1773
1774                         # Extract video identifiers
1775                         ids_in_page = []
1776                         for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1777                                 if mobj.group(1) not in ids_in_page:
1778                                         ids_in_page.append(mobj.group(1))
1779                         video_ids.extend(ids_in_page)
1780
1781                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1782                                 break
1783                         pagenum = pagenum + 1
1784
1785                 for id in video_ids:
1786                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1787                 return
1788
1789
1790 class YoutubeUserIE(InfoExtractor):
1791         """Information Extractor for YouTube users."""
1792
1793         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1794         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1795         _GDATA_PAGE_SIZE = 50
1796         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1797         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1798         IE_NAME = u'youtube:user'
1799
1800         def __init__(self, downloader=None):
1801                 InfoExtractor.__init__(self, downloader)
1802
1803         def report_download_page(self, username, start_index):
1804                 """Report attempt to download user page."""
1805                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1806                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1807
1808         def _real_extract(self, url):
1809                 # Extract username
1810                 mobj = re.match(self._VALID_URL, url)
1811                 if mobj is None:
1812                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1813                         return
1814
1815                 username = mobj.group(1)
1816
1817                 # Download video ids using YouTube Data API. Result size per
1818                 # query is limited (currently to 50 videos) so we need to query
1819                 # page by page until there are no video ids - it means we got
1820                 # all of them.
1821
1822                 video_ids = []
1823                 pagenum = 0
1824
1825                 while True:
1826                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1827                         self.report_download_page(username, start_index)
1828
1829                         request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1830
1831                         try:
1832                                 page = compat_urllib_request.urlopen(request).read()
1833                         except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
1834                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1835                                 return
1836
1837                         # Extract video identifiers
1838                         ids_in_page = []
1839
1840                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1841                                 if mobj.group(1) not in ids_in_page:
1842                                         ids_in_page.append(mobj.group(1))
1843
1844                         video_ids.extend(ids_in_page)
1845
1846                         # A little optimization - if current page is not
1847                         # "full", ie. does not contain PAGE_SIZE video ids then
1848                         # we can assume that this page is the last one - there
1849                         # are no more ids on further pages - no need to query
1850                         # again.
1851
1852                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1853                                 break
1854
1855                         pagenum += 1
1856
1857                 all_ids_count = len(video_ids)
1858                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1859                 playlistend = self._downloader.params.get('playlistend', -1)
1860
1861                 if playlistend == -1:
1862                         video_ids = video_ids[playliststart:]
1863                 else:
1864                         video_ids = video_ids[playliststart:playlistend]
1865
1866                 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1867                                 (username, all_ids_count, len(video_ids)))
1868
1869                 for video_id in video_ids:
1870                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1871
1872
1873 class BlipTVUserIE(InfoExtractor):
1874         """Information Extractor for blip.tv users."""
1875
1876         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1877         _PAGE_SIZE = 12
1878         IE_NAME = u'blip.tv:user'
1879
1880         def __init__(self, downloader=None):
1881                 InfoExtractor.__init__(self, downloader)
1882
1883         def report_download_page(self, username, pagenum):
1884                 """Report attempt to download user page."""
1885                 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1886                                 (self.IE_NAME, username, pagenum))
1887
1888         def _real_extract(self, url):
1889                 # Extract username
1890                 mobj = re.match(self._VALID_URL, url)
1891                 if mobj is None:
1892                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1893                         return
1894
1895                 username = mobj.group(1)
1896
1897                 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1898
1899                 request = compat_urllib_request.Request(url)
1900
1901                 try:
1902                         page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1903                         mobj = re.search(r'data-users-id="([^"]+)"', page)
1904                         page_base = page_base % mobj.group(1)
1905                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
1906                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1907                         return
1908
1909
1910                 # Download video ids using BlipTV Ajax calls. Result size per
1911                 # query is limited (currently to 12 videos) so we need to query
1912                 # page by page until there are no video ids - it means we got
1913                 # all of them.
1914
1915                 video_ids = []
1916                 pagenum = 1
1917
1918                 while True:
1919                         self.report_download_page(username, pagenum)
1920
1921                         request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )
1922
1923                         try:
1924                                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1925                         except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
1926                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1927                                 return
1928
1929                         # Extract video identifiers
1930                         ids_in_page = []
1931
1932                         for mobj in re.finditer(r'href="/([^"]+)"', page):
1933                                 if mobj.group(1) not in ids_in_page:
1934                                         ids_in_page.append(unescapeHTML(mobj.group(1)))
1935
1936                         video_ids.extend(ids_in_page)
1937
1938                         # A little optimization - if current page is not
1939                         # "full", ie. does not contain PAGE_SIZE video ids then
1940                         # we can assume that this page is the last one - there
1941                         # are no more ids on further pages - no need to query
1942                         # again.
1943
1944                         if len(ids_in_page) < self._PAGE_SIZE:
1945                                 break
1946
1947                         pagenum += 1
1948
1949                 all_ids_count = len(video_ids)
1950                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1951                 playlistend = self._downloader.params.get('playlistend', -1)
1952
1953                 if playlistend == -1:
1954                         video_ids = video_ids[playliststart:]
1955                 else:
1956                         video_ids = video_ids[playliststart:playlistend]
1957
1958                 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1959                                 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1960
1961                 for video_id in video_ids:
1962                         self._downloader.download([u'http://blip.tv/'+video_id])
1963
1964
1965 class DepositFilesIE(InfoExtractor):
1966         """Information extractor for depositfiles.com"""
1967
1968         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1969         IE_NAME = u'DepositFiles'
1970
1971         def __init__(self, downloader=None):
1972                 InfoExtractor.__init__(self, downloader)
1973
1974         def report_download_webpage(self, file_id):
1975                 """Report webpage download."""
1976                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1977
1978         def report_extraction(self, file_id):
1979                 """Report information extraction."""
1980                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1981
1982         def _real_extract(self, url):
1983                 file_id = url.split('/')[-1]
1984                 # Rebuild url in english locale
1985                 url = 'http://depositfiles.com/en/files/' + file_id
1986
1987                 # Retrieve file webpage with 'Free download' button pressed
1988                 free_download_indication = { 'gateway_result' : '1' }
1989                 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1990                 try:
1991                         self.report_download_webpage(file_id)
1992                         webpage = compat_urllib_request.urlopen(request).read()
1993                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
1994                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1995                         return
1996
1997                 # Search for the real file URL
1998                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1999                 if (mobj is None) or (mobj.group(1) is None):
2000                         # Try to figure out reason of the error.
2001                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2002                         if (mobj is not None) and (mobj.group(1) is not None):
2003                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2004                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2005                         else:
2006                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2007                         return
2008
2009                 file_url = mobj.group(1)
2010                 file_extension = os.path.splitext(file_url)[1][1:]
2011
2012                 # Search for file title
2013                 mobj = re.search(r'<b title="(.*?)">', webpage)
2014                 if mobj is None:
2015                         self._downloader.trouble(u'ERROR: unable to extract title')
2016                         return
2017                 file_title = mobj.group(1).decode('utf-8')
2018
2019                 return [{
2020                         'id':           file_id.decode('utf-8'),
2021                         'url':          file_url.decode('utf-8'),
2022                         'uploader':     None,
2023                         'upload_date':  None,
2024                         'title':        file_title,
2025                         'ext':          file_extension.decode('utf-8'),
2026                 }]
2027
2028
2029 class FacebookIE(InfoExtractor):
2030         """Information Extractor for Facebook"""
2031
2032         _WORKING = False
2033         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2034         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2035         _NETRC_MACHINE = 'facebook'
2036         _available_formats = ['video', 'highqual', 'lowqual']
2037         _video_extensions = {
2038                 'video': 'mp4',
2039                 'highqual': 'mp4',
2040                 'lowqual': 'mp4',
2041         }
2042         IE_NAME = u'facebook'
2043
2044         def __init__(self, downloader=None):
2045                 InfoExtractor.__init__(self, downloader)
2046
2047         def _reporter(self, message):
2048                 """Add header and report message."""
2049                 self._downloader.to_screen(u'[facebook] %s' % message)
2050
2051         def report_login(self):
2052                 """Report attempt to log in."""
2053                 self._reporter(u'Logging in')
2054
2055         def report_video_webpage_download(self, video_id):
2056                 """Report attempt to download video webpage."""
2057                 self._reporter(u'%s: Downloading video webpage' % video_id)
2058
2059         def report_information_extraction(self, video_id):
2060                 """Report attempt to extract video information."""
2061                 self._reporter(u'%s: Extracting video information' % video_id)
2062
2063         def _parse_page(self, video_webpage):
2064                 """Extract video information from page"""
2065                 # General data
2066                 data = {'title': r'\("video_title", "(.*?)"\)',
2067                         'description': r'<div class="datawrap">(.*?)</div>',
2068                         'owner': r'\("video_owner_name", "(.*?)"\)',
2069                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2070                         }
2071                 video_info = {}
2072                 for piece in data.keys():
2073                         mobj = re.search(data[piece], video_webpage)
2074                         if mobj is not None:
2075                                 video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2076
2077                 # Video urls
2078                 video_urls = {}
2079                 for fmt in self._available_formats:
2080                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2081                         if mobj is not None:
2082                                 # URL is in a Javascript segment inside an escaped Unicode format within
2083                                 # the generally utf-8 page
2084                                 video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2085                 video_info['video_urls'] = video_urls
2086
2087                 return video_info
2088
2089         def _real_initialize(self):
2090                 if self._downloader is None:
2091                         return
2092
2093                 useremail = None
2094                 password = None
2095                 downloader_params = self._downloader.params
2096
2097                 # Attempt to use provided username and password or .netrc data
2098                 if downloader_params.get('username', None) is not None:
2099                         useremail = downloader_params['username']
2100                         password = downloader_params['password']
2101                 elif downloader_params.get('usenetrc', False):
2102                         try:
2103                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2104                                 if info is not None:
2105                                         useremail = info[0]
2106                                         password = info[2]
2107                                 else:
2108                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2109                         except (IOError, netrc.NetrcParseError) as err:
2110                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2111                                 return
2112
2113                 if useremail is None:
2114                         return
2115
2116                 # Log in
2117                 login_form = {
2118                         'email': useremail,
2119                         'pass': password,
2120                         'login': 'Log+In'
2121                         }
2122                 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2123                 try:
2124                         self.report_login()
2125                         login_results = compat_urllib_request.urlopen(request).read()
2126                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2127                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2128                                 return
2129                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
2130                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2131                         return
2132
2133         def _real_extract(self, url):
2134                 mobj = re.match(self._VALID_URL, url)
2135                 if mobj is None:
2136                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2137                         return
2138                 video_id = mobj.group('ID')
2139
2140                 # Get video webpage
2141                 self.report_video_webpage_download(video_id)
2142                 request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2143                 try:
2144                         page = compat_urllib_request.urlopen(request)
2145                         video_webpage = page.read()
2146                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
2147                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2148                         return
2149
2150                 # Start extracting information
2151                 self.report_information_extraction(video_id)
2152
2153                 # Extract information
2154                 video_info = self._parse_page(video_webpage)
2155
2156                 # uploader
2157                 if 'owner' not in video_info:
2158                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2159                         return
2160                 video_uploader = video_info['owner']
2161
2162                 # title
2163                 if 'title' not in video_info:
2164                         self._downloader.trouble(u'ERROR: unable to extract video title')
2165                         return
2166                 video_title = video_info['title']
2167                 video_title = video_title.decode('utf-8')
2168
2169                 # thumbnail image
2170                 if 'thumbnail' not in video_info:
2171                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2172                         video_thumbnail = ''
2173                 else:
2174                         video_thumbnail = video_info['thumbnail']
2175
2176                 # upload date
2177                 upload_date = None
2178                 if 'upload_date' in video_info:
2179                         upload_time = video_info['upload_date']
2180                         timetuple = email.utils.parsedate_tz(upload_time)
2181                         if timetuple is not None:
2182                                 try:
2183                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2184                                 except:
2185                                         pass
2186
2187                 # description
2188                 video_description = video_info.get('description', 'No description available.')
2189
2190                 url_map = video_info['video_urls']
2191                 if len(url_map.keys()) > 0:
2192                         # Decide which formats to download
2193                         req_format = self._downloader.params.get('format', None)
2194                         format_limit = self._downloader.params.get('format_limit', None)
2195
2196                         if format_limit is not None and format_limit in self._available_formats:
2197                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2198                         else:
2199                                 format_list = self._available_formats
2200                         existing_formats = [x for x in format_list if x in url_map]
2201                         if len(existing_formats) == 0:
2202                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2203                                 return
2204                         if req_format is None:
2205                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2206                         elif req_format == 'worst':
2207                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2208                         elif req_format == '-1':
2209                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2210                         else:
2211                                 # Specific format
2212                                 if req_format not in url_map:
2213                                         self._downloader.trouble(u'ERROR: requested format not available')
2214                                         return
2215                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2216
2217                 results = []
2218                 for format_param, video_real_url in video_url_list:
2219                         # Extension
2220                         video_extension = self._video_extensions.get(format_param, 'mp4')
2221
2222                         results.append({
2223                                 'id':           video_id.decode('utf-8'),
2224                                 'url':          video_real_url.decode('utf-8'),
2225                                 'uploader':     video_uploader.decode('utf-8'),
2226                                 'upload_date':  upload_date,
2227                                 'title':        video_title,
2228                                 'ext':          video_extension.decode('utf-8'),
2229                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2230                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2231                                 'description':  video_description.decode('utf-8'),
2232                         })
2233                 return results
2234
2235 class BlipTVIE(InfoExtractor):
2236         """Information extractor for blip.tv"""
2237
2238         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2239         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2240         IE_NAME = u'blip.tv'
2241
2242         def report_extraction(self, file_id):
2243                 """Report information extraction."""
2244                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2245
2246         def report_direct_download(self, title):
2247                 """Report information extraction."""
2248                 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2249
2250         def _real_extract(self, url):
2251                 mobj = re.match(self._VALID_URL, url)
2252                 if mobj is None:
2253                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2254                         return
2255
2256                 if '?' in url:
2257                         cchar = '&'
2258                 else:
2259                         cchar = '?'
2260                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2261                 request = compat_urllib_request.Request(json_url.encode('utf-8'))
2262                 self.report_extraction(mobj.group(1))
2263                 info = None
2264                 try:
2265                         urlh = compat_urllib_request.urlopen(request)
2266                         if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2267                                 basename = url.split('/')[-1]
2268                                 title,ext = os.path.splitext(basename)
2269                                 title = title.decode('UTF-8')
2270                                 ext = ext.replace('.', '')
2271                                 self.report_direct_download(title)
2272                                 info = {
2273                                         'id': title,
2274                                         'url': url,
2275                                         'uploader': None,
2276                                         'upload_date': None,
2277                                         'title': title,
2278                                         'ext': ext,
2279                                         'urlhandle': urlh
2280                                 }
2281                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
2282                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2283                         return
2284                 if info is None: # Regular URL
2285                         try:
2286                                 json_code = urlh.read()
2287                         except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
2288                                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2289                                 return
2290
2291                         try:
2292                                 json_data = json.loads(json_code)
2293                                 if 'Post' in json_data:
2294                                         data = json_data['Post']
2295                                 else:
2296                                         data = json_data
2297
2298                                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2299                                 video_url = data['media']['url']
2300                                 umobj = re.match(self._URL_EXT, video_url)
2301                                 if umobj is None:
2302                                         raise ValueError('Can not determine filename extension')
2303                                 ext = umobj.group(1)
2304
2305                                 info = {
2306                                         'id': data['item_id'],
2307                                         'url': video_url,
2308                                         'uploader': data['display_name'],
2309                                         'upload_date': upload_date,
2310                                         'title': data['title'],
2311                                         'ext': ext,
2312                                         'format': data['media']['mimeType'],
2313                                         'thumbnail': data['thumbnailUrl'],
2314                                         'description': data['description'],
2315                                         'player_url': data['embedUrl']
2316                                 }
2317                         except (ValueError,KeyError) as err:
2318                                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2319                                 return
2320
2321                 std_headers['User-Agent'] = 'iTunes/10.6.1'
2322                 return [info]
2323
2324
2325 class MyVideoIE(InfoExtractor):
2326         """Information Extractor for myvideo.de."""
2327
2328         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2329         IE_NAME = u'myvideo'
2330
2331         def __init__(self, downloader=None):
2332                 InfoExtractor.__init__(self, downloader)
2333
2334         def report_download_webpage(self, video_id):
2335                 """Report webpage download."""
2336                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2337
2338         def report_extraction(self, video_id):
2339                 """Report information extraction."""
2340                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2341
2342         def _real_extract(self,url):
2343                 mobj = re.match(self._VALID_URL, url)
2344                 if mobj is None:
2345                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2346                         return
2347
2348                 video_id = mobj.group(1)
2349
2350                 # Get video webpage
2351                 request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
2352                 try:
2353                         self.report_download_webpage(video_id)
2354                         webpage = compat_urllib_request.urlopen(request).read()
2355                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
2356                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
2357                         return
2358
2359                 self.report_extraction(video_id)
2360                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2361                                  webpage)
2362                 if mobj is None:
2363                         self._downloader.trouble(u'ERROR: unable to extract media URL')
2364                         return
2365                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2366
2367                 mobj = re.search('<title>([^<]+)</title>', webpage)
2368                 if mobj is None:
2369                         self._downloader.trouble(u'ERROR: unable to extract title')
2370                         return
2371
2372                 video_title = mobj.group(1)
2373
2374                 return [{
2375                         'id':           video_id,
2376                         'url':          video_url,
2377                         'uploader':     None,
2378                         'upload_date':  None,
2379                         'title':        video_title,
2380                         'ext':          u'flv',
2381                 }]
2382
2383 class ComedyCentralIE(InfoExtractor):
2384         """Information extractor for The Daily Show and Colbert Report """
2385
2386         _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2387         IE_NAME = u'comedycentral'
2388
2389         _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2390
2391         _video_extensions = {
2392                 '3500': 'mp4',
2393                 '2200': 'mp4',
2394                 '1700': 'mp4',
2395                 '1200': 'mp4',
2396                 '750': 'mp4',
2397                 '400': 'mp4',
2398         }
2399         _video_dimensions = {
2400                 '3500': '1280x720',
2401                 '2200': '960x540',
2402                 '1700': '768x432',
2403                 '1200': '640x360',
2404                 '750': '512x288',
2405                 '400': '384x216',
2406         }
2407
2408         def report_extraction(self, episode_id):
2409                 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2410
2411         def report_config_download(self, episode_id):
2412                 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2413
2414         def report_index_download(self, episode_id):
2415                 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2416
2417         def report_player_url(self, episode_id):
2418                 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2419
2420
2421         def _print_formats(self, formats):
2422                 print('Available formats:')
2423                 for x in formats:
2424                         print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2425
2426
2427         def _real_extract(self, url):
2428                 mobj = re.match(self._VALID_URL, url)
2429                 if mobj is None:
2430                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2431                         return
2432
2433                 if mobj.group('shortname'):
2434                         if mobj.group('shortname') in ('tds', 'thedailyshow'):
2435                                 url = u'http://www.thedailyshow.com/full-episodes/'
2436                         else:
2437                                 url = u'http://www.colbertnation.com/full-episodes/'
2438                         mobj = re.match(self._VALID_URL, url)
2439                         assert mobj is not None
2440
2441                 dlNewest = not mobj.group('episode')
2442                 if dlNewest:
2443                         epTitle = mobj.group('showname')
2444                 else:
2445                         epTitle = mobj.group('episode')
2446
2447                 req = compat_urllib_request.Request(url)
2448                 self.report_extraction(epTitle)
2449                 try:
2450                         htmlHandle = compat_urllib_request.urlopen(req)
2451                         html = htmlHandle.read()
2452                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
2453                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2454                         return
2455                 if dlNewest:
2456                         url = htmlHandle.geturl()
2457                         mobj = re.match(self._VALID_URL, url)
2458                         if mobj is None:
2459                                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2460                                 return
2461                         if mobj.group('episode') == '':
2462                                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2463                                 return
2464                         epTitle = mobj.group('episode')
2465
2466                 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2467
2468                 if len(mMovieParams) == 0:
2469                         # The Colbert Report embeds the information in a without
2470                         # a URL prefix; so extract the alternate reference
2471                         # and then add the URL prefix manually.
2472
2473                         altMovieParams = re.findall('data-mgid="([^"]*episode.*?:.*?)"', html)
2474                         if len(altMovieParams) == 0:
2475                                 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2476                                 return
2477                         else:
2478                                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2479
2480                 playerUrl_raw = mMovieParams[0][0]
2481                 self.report_player_url(epTitle)
2482                 try:
2483                         urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
2484                         playerUrl = urlHandle.geturl()
2485                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
2486                         self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
2487                         return
2488
2489                 uri = mMovieParams[0][1]
2490                 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2491                 self.report_index_download(epTitle)
2492                 try:
2493                         indexXml = compat_urllib_request.urlopen(indexUrl).read()
2494                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
2495                         self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
2496                         return
2497
2498                 results = []
2499
2500                 idoc = xml.etree.ElementTree.fromstring(indexXml)
2501                 itemEls = idoc.findall('.//item')
2502                 for itemEl in itemEls:
2503                         mediaId = itemEl.findall('./guid')[0].text
2504                         shortMediaId = mediaId.split(':')[-1]
2505                         showId = mediaId.split(':')[-2].replace('.com', '')
2506                         officialTitle = itemEl.findall('./title')[0].text
2507                         officialDate = itemEl.findall('./pubDate')[0].text
2508
2509                         configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2510                                                 compat_urllib_parse.urlencode({'uri': mediaId}))
2511                         configReq = compat_urllib_request.Request(configUrl)
2512                         self.report_config_download(epTitle)
2513                         try:
2514                                 configXml = compat_urllib_request.urlopen(configReq).read()
2515                         except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
2516                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2517                                 return
2518
2519                         cdoc = xml.etree.ElementTree.fromstring(configXml)
2520                         turls = []
2521                         for rendition in cdoc.findall('.//rendition'):
2522                                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2523                                 turls.append(finfo)
2524
2525                         if len(turls) == 0:
2526                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2527                                 continue
2528
2529                         if self._downloader.params.get('listformats', None):
2530                                 self._print_formats([i[0] for i in turls])
2531                                 return
2532
2533                         # For now, just pick the highest bitrate
2534                         format,video_url = turls[-1]
2535
2536                         # Get the format arg from the arg stream
2537                         req_format = self._downloader.params.get('format', None)
2538
2539                         # Select format if we can find one
2540                         for f,v in turls:
2541                                 if f == req_format:
2542                                         format, video_url = f, v
2543                                         break
2544
2545                         # Patch to download from alternative CDN, which does not
2546                         # break on current RTMPDump builds
2547                         broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
2548                         better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
2549
2550                         if video_url.startswith(broken_cdn):
2551                                 video_url = video_url.replace(broken_cdn, better_cdn)
2552
2553                         effTitle = showId + u'-' + epTitle
2554                         info = {
2555                                 'id': shortMediaId,
2556                                 'url': video_url,
2557                                 'uploader': showId,
2558                                 'upload_date': officialDate,
2559                                 'title': effTitle,
2560                                 'ext': 'mp4',
2561                                 'format': format,
2562                                 'thumbnail': None,
2563                                 'description': officialTitle,
2564                                 'player_url': None #playerUrl
2565                         }
2566
2567                         results.append(info)
2568
2569                 return results
2570
2571
2572 class EscapistIE(InfoExtractor):
2573         """Information extractor for The Escapist """
2574
2575         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2576         IE_NAME = u'escapist'
2577
2578         def report_extraction(self, showName):
2579                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2580
2581         def report_config_download(self, showName):
2582                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2583
2584         def _real_extract(self, url):
2585                 mobj = re.match(self._VALID_URL, url)
2586                 if mobj is None:
2587                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2588                         return
2589                 showName = mobj.group('showname')
2590                 videoId = mobj.group('episode')
2591
2592                 self.report_extraction(showName)
2593                 try:
2594                         webPage = compat_urllib_request.urlopen(url)
2595                         webPageBytes = webPage.read()
2596                         m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2597                         webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2598                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
2599                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
2600                         return
2601
2602                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2603                 description = unescapeHTML(descMatch.group(1))
2604                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2605                 imgUrl = unescapeHTML(imgMatch.group(1))
2606                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2607                 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2608                 configUrlMatch = re.search('config=(.*)$', playerUrl)
2609                 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2610
2611                 self.report_config_download(showName)
2612                 try:
2613                         configJSON = compat_urllib_request.urlopen(configUrl).read()
2614                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
2615                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2616                         return
2617
2618                 # Technically, it's JavaScript, not JSON
2619                 configJSON = configJSON.replace("'", '"')
2620
2621                 try:
2622                         config = json.loads(configJSON)
2623                 except (ValueError,) as err:
2624                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
2625                         return
2626
2627                 playlist = config['playlist']
2628                 videoUrl = playlist[1]['url']
2629
2630                 info = {
2631                         'id': videoId,
2632                         'url': videoUrl,
2633                         'uploader': showName,
2634                         'upload_date': None,
2635                         'title': showName,
2636                         'ext': 'flv',
2637                         'thumbnail': imgUrl,
2638                         'description': description,
2639                         'player_url': playerUrl,
2640                 }
2641
2642                 return [info]
2643
2644
2645 class CollegeHumorIE(InfoExtractor):
2646         """Information extractor for collegehumor.com"""
2647
2648         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2649         IE_NAME = u'collegehumor'
2650
2651         def report_webpage(self, video_id):
2652                 """Report information extraction."""
2653                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2654
2655         def report_extraction(self, video_id):
2656                 """Report information extraction."""
2657                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2658
2659         def _real_extract(self, url):
2660                 mobj = re.match(self._VALID_URL, url)
2661                 if mobj is None:
2662                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2663                         return
2664                 video_id = mobj.group('videoid')
2665
2666                 self.report_webpage(video_id)
2667                 request = compat_urllib_request.Request(url)
2668                 try:
2669                         webpage = compat_urllib_request.urlopen(request).read()
2670                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
2671                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2672                         return
2673
2674                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2675                 if m is None:
2676                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2677                         return
2678                 internal_video_id = m.group('internalvideoid')
2679
2680                 info = {
2681                         'id': video_id,
2682                         'internal_id': internal_video_id,
2683                         'uploader': None,
2684                         'upload_date': None,
2685                 }
2686
2687                 self.report_extraction(video_id)
2688                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2689                 try:
2690                         metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2691                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
2692                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2693                         return
2694
2695                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2696                 try:
2697                         videoNode = mdoc.findall('./video')[0]
2698                         info['description'] = videoNode.findall('./description')[0].text
2699                         info['title'] = videoNode.findall('./caption')[0].text
2700                         info['url'] = videoNode.findall('./file')[0].text
2701                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2702                         info['ext'] = info['url'].rpartition('.')[2]
2703                 except IndexError:
2704                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2705                         return
2706
2707                 return [info]
2708
2709
2710 class XVideosIE(InfoExtractor):
2711         """Information extractor for xvideos.com"""
2712
2713         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2714         IE_NAME = u'xvideos'
2715
2716         def report_webpage(self, video_id):
2717                 """Report information extraction."""
2718                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2719
2720         def report_extraction(self, video_id):
2721                 """Report information extraction."""
2722                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2723
2724         def _real_extract(self, url):
2725                 mobj = re.match(self._VALID_URL, url)
2726                 if mobj is None:
2727                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2728                         return
2729                 video_id = mobj.group(1).decode('utf-8')
2730
2731                 self.report_webpage(video_id)
2732
2733                 request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
2734                 try:
2735                         webpage = compat_urllib_request.urlopen(request).read()
2736                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
2737                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2738                         return
2739
2740                 self.report_extraction(video_id)
2741
2742
2743                 # Extract video URL
2744                 mobj = re.search(r'flv_url=(.+?)&', webpage)
2745                 if mobj is None:
2746                         self._downloader.trouble(u'ERROR: unable to extract video url')
2747                         return
2748                 video_url = compat_urllib_parse.unquote(mobj.group(1).decode('utf-8'))
2749
2750
2751                 # Extract title
2752                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2753                 if mobj is None:
2754                         self._downloader.trouble(u'ERROR: unable to extract video title')
2755                         return
2756                 video_title = mobj.group(1).decode('utf-8')
2757
2758
2759                 # Extract video thumbnail
2760                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2761                 if mobj is None:
2762                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2763                         return
2764                 video_thumbnail = mobj.group(0).decode('utf-8')
2765
2766                 info = {
2767                         'id': video_id,
2768                         'url': video_url,
2769                         'uploader': None,
2770                         'upload_date': None,
2771                         'title': video_title,
2772                         'ext': 'flv',
2773                         'thumbnail': video_thumbnail,
2774                         'description': None,
2775                 }
2776
2777                 return [info]
2778
2779
2780 class SoundcloudIE(InfoExtractor):
2781         """Information extractor for soundcloud.com
2782            To access the media, the uid of the song and a stream token
2783            must be extracted from the page source and the script must make
2784            a request to media.soundcloud.com/crossdomain.xml. Then
2785            the media can be grabbed by requesting from an url composed
2786            of the stream token and uid
2787          """
2788
2789         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2790         IE_NAME = u'soundcloud'
2791
2792         def __init__(self, downloader=None):
2793                 InfoExtractor.__init__(self, downloader)
2794
2795         def report_webpage(self, video_id):
2796                 """Report information extraction."""
2797                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2798
2799         def report_extraction(self, video_id):
2800                 """Report information extraction."""
2801                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2802
2803         def _real_extract(self, url):
2804                 mobj = re.match(self._VALID_URL, url)
2805                 if mobj is None:
2806                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2807                         return
2808
2809                 # extract uploader (which is in the url)
2810                 uploader = mobj.group(1).decode('utf-8')
2811                 # extract simple title (uploader + slug of song title)
2812                 slug_title =  mobj.group(2).decode('utf-8')
2813                 simple_title = uploader + u'-' + slug_title
2814
2815                 self.report_webpage('%s/%s' % (uploader, slug_title))
2816
2817                 request = compat_urllib_request.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2818                 try:
2819                         webpage = compat_urllib_request.urlopen(request).read()
2820                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
2821                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2822                         return
2823
2824                 self.report_extraction('%s/%s' % (uploader, slug_title))
2825
2826                 # extract uid and stream token that soundcloud hands out for access
2827                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2828                 if mobj:
2829                         video_id = mobj.group(1)
2830                         stream_token = mobj.group(2)
2831
2832                 # extract unsimplified title
2833                 mobj = re.search('"title":"(.*?)",', webpage)
2834                 if mobj:
2835                         title = mobj.group(1).decode('utf-8')
2836                 else:
2837                         title = simple_title
2838
2839                 # construct media url (with uid/token)
2840                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2841                 mediaURL = mediaURL % (video_id, stream_token)
2842
2843                 # description
2844                 description = u'No description available'
2845                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2846                 if mobj:
2847                         description = mobj.group(1)
2848
2849                 # upload date
2850                 upload_date = None
2851                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2852                 if mobj:
2853                         try:
2854                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2855                         except Exception, e:
2856                                 self._downloader.to_stderr(compat_str(e))
2857
2858                 # for soundcloud, a request to a cross domain is required for cookies
2859                 request = compat_urllib_request.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2860
2861                 return [{
2862                         'id':           video_id.decode('utf-8'),
2863                         'url':          mediaURL,
2864                         'uploader':     uploader.decode('utf-8'),
2865                         'upload_date':  upload_date,
2866                         'title':        title,
2867                         'ext':          u'mp3',
2868                         'description': description.decode('utf-8')
2869                 }]
2870
2871
2872 class InfoQIE(InfoExtractor):
2873         """Information extractor for infoq.com"""
2874
2875         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2876         IE_NAME = u'infoq'
2877
2878         def report_webpage(self, video_id):
2879                 """Report information extraction."""
2880                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2881
2882         def report_extraction(self, video_id):
2883                 """Report information extraction."""
2884                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2885
2886         def _real_extract(self, url):
2887                 mobj = re.match(self._VALID_URL, url)
2888                 if mobj is None:
2889                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2890                         return
2891
2892                 self.report_webpage(url)
2893
2894                 request = compat_urllib_request.Request(url)
2895                 try:
2896                         webpage = compat_urllib_request.urlopen(request).read()
2897                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
2898                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2899                         return
2900
2901                 self.report_extraction(url)
2902
2903
2904                 # Extract video URL
2905                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2906                 if mobj is None:
2907                         self._downloader.trouble(u'ERROR: unable to extract video url')
2908                         return
2909                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(mobj.group(1).decode('base64'))
2910
2911
2912                 # Extract title
2913                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2914                 if mobj is None:
2915                         self._downloader.trouble(u'ERROR: unable to extract video title')
2916                         return
2917                 video_title = mobj.group(1).decode('utf-8')
2918
2919                 # Extract description
2920                 video_description = u'No description available.'
2921                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2922                 if mobj is not None:
2923                         video_description = mobj.group(1).decode('utf-8')
2924
2925                 video_filename = video_url.split('/')[-1]
2926                 video_id, extension = video_filename.split('.')
2927
2928                 info = {
2929                         'id': video_id,
2930                         'url': video_url,
2931                         'uploader': None,
2932                         'upload_date': None,
2933                         'title': video_title,
2934                         'ext': extension, # Extension is always(?) mp4, but seems to be flv
2935                         'thumbnail': None,
2936                         'description': video_description,
2937                 }
2938
2939                 return [info]
2940
2941 class MixcloudIE(InfoExtractor):
2942         """Information extractor for www.mixcloud.com"""
2943         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2944         IE_NAME = u'mixcloud'
2945
2946         def __init__(self, downloader=None):
2947                 InfoExtractor.__init__(self, downloader)
2948
2949         def report_download_json(self, file_id):
2950                 """Report JSON download."""
2951                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2952
2953         def report_extraction(self, file_id):
2954                 """Report information extraction."""
2955                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2956
2957         def get_urls(self, jsonData, fmt, bitrate='best'):
2958                 """Get urls from 'audio_formats' section in json"""
2959                 file_url = None
2960                 try:
2961                         bitrate_list = jsonData[fmt]
2962                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2963                                 bitrate = max(bitrate_list) # select highest
2964
2965                         url_list = jsonData[fmt][bitrate]
2966                 except TypeError: # we have no bitrate info.
2967                         url_list = jsonData[fmt]
2968                 return url_list
2969
2970         def check_urls(self, url_list):
2971                 """Returns 1st active url from list"""
2972                 for url in url_list:
2973                         try:
2974                                 compat_urllib_request.urlopen(url)
2975                                 return url
2976                         except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
2977                                 url = None
2978
2979                 return None
2980
2981         def _print_formats(self, formats):
2982                 print('Available formats:')
2983                 for fmt in formats.keys():
2984                         for b in formats[fmt]:
2985                                 try:
2986                                         ext = formats[fmt][b][0]
2987                                         print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2988                                 except TypeError: # we have no bitrate info
2989                                         ext = formats[fmt][0]
2990                                         print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2991                                         break
2992
2993         def _real_extract(self, url):
2994                 mobj = re.match(self._VALID_URL, url)
2995                 if mobj is None:
2996                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2997                         return
2998                 # extract uploader & filename from url
2999                 uploader = mobj.group(1).decode('utf-8')
3000                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3001
3002                 # construct API request
3003                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3004                 # retrieve .json file with links to files
3005                 request = compat_urllib_request.Request(file_url)
3006                 try:
3007                         self.report_download_json(file_url)
3008                         jsonData = compat_urllib_request.urlopen(request).read()
3009                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
3010                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
3011                         return
3012
3013                 # parse JSON
3014                 json_data = json.loads(jsonData)
3015                 player_url = json_data['player_swf_url']
3016                 formats = dict(json_data['audio_formats'])
3017
3018                 req_format = self._downloader.params.get('format', None)
3019                 bitrate = None
3020
3021                 if self._downloader.params.get('listformats', None):
3022                         self._print_formats(formats)
3023                         return
3024
3025                 if req_format is None or req_format == 'best':
3026                         for format_param in formats.keys():
3027                                 url_list = self.get_urls(formats, format_param)
3028                                 # check urls
3029                                 file_url = self.check_urls(url_list)
3030                                 if file_url is not None:
3031                                         break # got it!
3032                 else:
3033                         if req_format not in formats.keys():
3034                                 self._downloader.trouble(u'ERROR: format is not available')
3035                                 return
3036
3037                         url_list = self.get_urls(formats, req_format)
3038                         file_url = self.check_urls(url_list)
3039                         format_param = req_format
3040
3041                 return [{
3042                         'id': file_id.decode('utf-8'),
3043                         'url': file_url.decode('utf-8'),
3044                         'uploader':     uploader.decode('utf-8'),
3045                         'upload_date': None,
3046                         'title': json_data['name'],
3047                         'ext': file_url.split('.')[-1].decode('utf-8'),
3048                         'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3049                         'thumbnail': json_data['thumbnail_url'],
3050                         'description': json_data['description'],
3051                         'player_url': player_url.decode('utf-8'),
3052                 }]
3053
3054 class StanfordOpenClassroomIE(InfoExtractor):
3055         """Information extractor for Stanford's Open ClassRoom"""
3056
3057         _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3058         IE_NAME = u'stanfordoc'
3059
3060         def report_download_webpage(self, objid):
3061                 """Report information extraction."""
3062                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3063
3064         def report_extraction(self, video_id):
3065                 """Report information extraction."""
3066                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3067
3068         def _real_extract(self, url):
3069                 mobj = re.match(self._VALID_URL, url)
3070                 if mobj is None:
3071                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3072                         return
3073
3074                 if mobj.group('course') and mobj.group('video'): # A specific video
3075                         course = mobj.group('course')
3076                         video = mobj.group('video')
3077                         info = {
3078                                 'id': course + '_' + video,
3079                                 'uploader': None,
3080                                 'upload_date': None,
3081                         }
3082
3083                         self.report_extraction(info['id'])
3084                         baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3085                         xmlUrl = baseUrl + video + '.xml'
3086                         try:
3087                                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3088                         except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
3089                                 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
3090                                 return
3091                         mdoc = xml.etree.ElementTree.fromstring(metaXml)
3092                         try:
3093                                 info['title'] = mdoc.findall('./title')[0].text
3094                                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3095                         except IndexError:
3096                                 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3097                                 return
3098                         info['ext'] = info['url'].rpartition('.')[2]
3099                         return [info]
3100                 elif mobj.group('course'): # A course page
3101                         course = mobj.group('course')
3102                         info = {
3103                                 'id': course,
3104                                 'type': 'playlist',
3105                                 'uploader': None,
3106                                 'upload_date': None,
3107                         }
3108
3109                         self.report_download_webpage(info['id'])
3110                         try:
3111                                 coursepage = compat_urllib_request.urlopen(url).read()
3112                         except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
3113                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3114                                 return
3115
3116                         m = re.search('<h1>([^<]+)</h1>', coursepage)
3117                         if m:
3118                                 info['title'] = unescapeHTML(m.group(1))
3119                         else:
3120                                 info['title'] = info['id']
3121
3122                         m = re.search('<description>([^<]+)</description>', coursepage)
3123                         if m:
3124                                 info['description'] = unescapeHTML(m.group(1))
3125
3126                         links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3127                         info['list'] = [
3128                                 {
3129                                         'type': 'reference',
3130                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3131                                 }
3132                                         for vpage in links]
3133                         results = []
3134                         for entry in info['list']:
3135                                 assert entry['type'] == 'reference'
3136                                 results += self.extract(entry['url'])
3137                         return results
3138
3139                 else: # Root page
3140                         info = {
3141                                 'id': 'Stanford OpenClassroom',
3142                                 'type': 'playlist',
3143                                 'uploader': None,
3144                                 'upload_date': None,
3145                         }
3146
3147                         self.report_download_webpage(info['id'])
3148                         rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3149                         try:
3150                                 rootpage = compat_urllib_request.urlopen(rootURL).read()
3151                         except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
3152                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3153                                 return
3154
3155                         info['title'] = info['id']
3156
3157                         links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3158                         info['list'] = [
3159                                 {
3160                                         'type': 'reference',
3161                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3162                                 }
3163                                         for cpage in links]
3164
3165                         results = []
3166                         for entry in info['list']:
3167                                 assert entry['type'] == 'reference'
3168                                 results += self.extract(entry['url'])
3169                         return results
3170
3171 class MTVIE(InfoExtractor):
3172         """Information extractor for MTV.com"""
3173
3174         _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3175         IE_NAME = u'mtv'
3176
3177         def report_webpage(self, video_id):
3178                 """Report information extraction."""
3179                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3180
3181         def report_extraction(self, video_id):
3182                 """Report information extraction."""
3183                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3184
3185         def _real_extract(self, url):
3186                 mobj = re.match(self._VALID_URL, url)
3187                 if mobj is None:
3188                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3189                         return
3190                 if not mobj.group('proto'):
3191                         url = 'http://' + url
3192                 video_id = mobj.group('videoid')
3193                 self.report_webpage(video_id)
3194
3195                 request = compat_urllib_request.Request(url)
3196                 try:
3197                         webpage = compat_urllib_request.urlopen(request).read()
3198                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
3199                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
3200                         return
3201
3202                 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3203                 if mobj is None:
3204                         self._downloader.trouble(u'ERROR: unable to extract song name')
3205                         return
3206                 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3207                 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3208                 if mobj is None:
3209                         self._downloader.trouble(u'ERROR: unable to extract performer')
3210                         return
3211                 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3212                 video_title = performer + ' - ' + song_name
3213
3214                 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3215                 if mobj is None:
3216                         self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3217                         return
3218                 mtvn_uri = mobj.group(1)
3219
3220                 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3221                 if mobj is None:
3222                         self._downloader.trouble(u'ERROR: unable to extract content id')
3223                         return
3224                 content_id = mobj.group(1)
3225
3226                 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3227                 self.report_extraction(video_id)
3228                 request = compat_urllib_request.Request(videogen_url)
3229                 try:
3230                         metadataXml = compat_urllib_request.urlopen(request).read()
3231                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
3232                         self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3233                         return
3234
3235                 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3236                 renditions = mdoc.findall('.//rendition')
3237
3238                 # For now, always pick the highest quality.
3239                 rendition = renditions[-1]
3240
3241                 try:
3242                         _,_,ext = rendition.attrib['type'].partition('/')
3243                         format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3244                         video_url = rendition.find('./src').text
3245                 except KeyError:
3246                         self._downloader.trouble('Invalid rendition field.')
3247                         return
3248
3249                 info = {
3250                         'id': video_id,
3251                         'url': video_url,
3252                         'uploader': performer,
3253                         'upload_date': None,
3254                         'title': video_title,
3255                         'ext': ext,
3256                         'format': format,
3257                 }
3258
3259                 return [info]
3260
3261
3262 class YoukuIE(InfoExtractor):
3263
3264         _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3265         IE_NAME = u'Youku'
3266
3267         def __init__(self, downloader=None):
3268                 InfoExtractor.__init__(self, downloader)
3269
3270         def report_download_webpage(self, file_id):
3271                 """Report webpage download."""
3272                 self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)
3273
3274         def report_extraction(self, file_id):
3275                 """Report information extraction."""
3276                 self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)
3277
3278         def _gen_sid(self):
3279                 nowTime = int(time.time() * 1000)
3280                 random1 = random.randint(1000,1998)
3281                 random2 = random.randint(1000,9999)
3282
3283                 return "%d%d%d" %(nowTime,random1,random2)
3284
3285         def _get_file_ID_mix_string(self, seed):
3286                 mixed = []
3287                 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3288                 seed = float(seed)
3289                 for i in range(len(source)):
3290                         seed  =  (seed * 211 + 30031 ) % 65536
3291                         index  =  math.floor(seed / 65536 * len(source) )
3292                         mixed.append(source[int(index)])
3293                         source.remove(source[int(index)])
3294                 #return ''.join(mixed)
3295                 return mixed
3296
3297         def _get_file_id(self, fileId, seed):
3298                 mixed = self._get_file_ID_mix_string(seed)
3299                 ids = fileId.split('*')
3300                 realId = []
3301                 for ch in ids:
3302                         if ch:
3303                                 realId.append(mixed[int(ch)])
3304                 return ''.join(realId)
3305
3306         def _real_extract(self, url):
3307                 mobj = re.match(self._VALID_URL, url)
3308                 if mobj is None:
3309                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3310                         return
3311                 video_id = mobj.group('ID')
3312
3313                 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3314
3315                 request = compat_urllib_request.Request(info_url, None, std_headers)
3316                 try:
3317                         self.report_download_webpage(video_id)
3318                         jsondata = compat_urllib_request.urlopen(request).read()
3319                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
3320                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3321                         return
3322
3323                 self.report_extraction(video_id)
3324                 try:
3325                         config = json.loads(jsondata)
3326
3327                         video_title =  config['data'][0]['title']
3328                         seed = config['data'][0]['seed']
3329
3330                         format = self._downloader.params.get('format', None)
3331                         supported_format = config['data'][0]['streamfileids'].keys()
3332
3333                         if format is None or format == 'best':
3334                                 if 'hd2' in supported_format:
3335                                         format = 'hd2'
3336                                 else:
3337                                         format = 'flv'
3338                                 ext = u'flv'
3339                         elif format == 'worst':
3340                                 format = 'mp4'
3341                                 ext = u'mp4'
3342                         else:
3343                                 format = 'flv'
3344                                 ext = u'flv'
3345
3346
3347                         fileid = config['data'][0]['streamfileids'][format]
3348                         seg_number = len(config['data'][0]['segs'][format])
3349
3350                         keys=[]
3351                         for i in xrange(seg_number):
3352                                 keys.append(config['data'][0]['segs'][format][i]['k'])
3353
3354                         #TODO check error
3355                         #youku only could be viewed from mainland china
3356                 except:
3357                         self._downloader.trouble(u'ERROR: unable to extract info section')
3358                         return
3359
3360                 files_info=[]
3361                 sid = self._gen_sid()
3362                 fileid = self._get_file_id(fileid, seed)
3363
3364                 #column 8,9 of fileid represent the segment number
3365                 #fileid[7:9] should be changed
3366                 for index, key in enumerate(keys):
3367
3368                         temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3369                         download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3370
3371                         info = {
3372                                 'id': '%s_part%02d' % (video_id, index),
3373                                 'url': download_url,
3374                                 'uploader': None,
3375                                 'upload_date': None,
3376                                 'title': video_title,
3377                                 'ext': ext,
3378                         }
3379                         files_info.append(info)
3380
3381                 return files_info
3382
3383
3384 class XNXXIE(InfoExtractor):
3385         """Information extractor for xnxx.com"""
3386
3387         _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
3388         IE_NAME = u'xnxx'
3389         VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3390         VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3391         VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3392
3393         def report_webpage(self, video_id):
3394                 """Report information extraction"""
3395                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3396
3397         def report_extraction(self, video_id):
3398                 """Report information extraction"""
3399                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3400
3401         def _real_extract(self, url):
3402                 mobj = re.match(self._VALID_URL, url)
3403                 if mobj is None:
3404                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3405                         return
3406                 video_id = mobj.group(1).decode('utf-8')
3407
3408                 self.report_webpage(video_id)
3409
3410                 # Get webpage content
3411                 try:
3412                         webpage = compat_urllib_request.urlopen(url).read()
3413                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
3414                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3415                         return
3416
3417                 result = re.search(self.VIDEO_URL_RE, webpage)
3418                 if result is None:
3419                         self._downloader.trouble(u'ERROR: unable to extract video url')
3420                         return
3421                 video_url = compat_urllib_parse.unquote(result.group(1).decode('utf-8'))
3422
3423                 result = re.search(self.VIDEO_TITLE_RE, webpage)
3424                 if result is None:
3425                         self._downloader.trouble(u'ERROR: unable to extract video title')
3426                         return
3427                 video_title = result.group(1).decode('utf-8')
3428
3429                 result = re.search(self.VIDEO_THUMB_RE, webpage)
3430                 if result is None:
3431                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3432                         return
3433                 video_thumbnail = result.group(1).decode('utf-8')
3434
3435                 return [{
3436                         'id': video_id,
3437                         'url': video_url,
3438                         'uploader': None,
3439                         'upload_date': None,
3440                         'title': video_title,
3441                         'ext': 'flv',
3442                         'thumbnail': video_thumbnail,
3443                         'description': None,
3444                 }]
3445
3446
3447 class GooglePlusIE(InfoExtractor):
3448         """Information extractor for plus.google.com."""
3449
3450         _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
3451         IE_NAME = u'plus.google'
3452
3453         def __init__(self, downloader=None):
3454                 InfoExtractor.__init__(self, downloader)
3455
3456         def report_extract_entry(self, url):
3457                 """Report downloading extry"""
3458                 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))
3459
3460         def report_date(self, upload_date):
3461                 """Report downloading extry"""
3462                 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3463
3464         def report_uploader(self, uploader):
3465                 """Report downloading extry"""
3466                 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))
3467
3468         def report_title(self, video_title):
3469                 """Report downloading extry"""
3470                 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))
3471
3472         def report_extract_vid_page(self, video_page):
3473                 """Report information extraction."""
3474                 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
3475
3476         def _real_extract(self, url):
3477                 # Extract id from URL
3478                 mobj = re.match(self._VALID_URL, url)
3479                 if mobj is None:
3480                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3481                         return
3482
3483                 post_url = mobj.group(0)
3484                 video_id = mobj.group(2)
3485
3486                 video_extension = 'flv'
3487
3488                 # Step 1, Retrieve post webpage to extract further information
3489                 self.report_extract_entry(post_url)
3490                 request = compat_urllib_request.Request(post_url)
3491                 try:
3492                         webpage = compat_urllib_request.urlopen(request).read()
3493                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
3494                         self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3495                         return
3496
3497                 # Extract update date
3498                 upload_date = None
3499                 pattern = 'title="Timestamp">(.*?)</a>'
3500                 mobj = re.search(pattern, webpage)
3501                 if mobj:
3502                         upload_date = mobj.group(1)
3503                         # Convert timestring to a format suitable for filename
3504                         upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3505                         upload_date = upload_date.strftime('%Y%m%d')
3506                 self.report_date(upload_date)
3507
3508                 # Extract uploader
3509                 uploader = None
3510                 pattern = r'rel\="author".*?>(.*?)</a>'
3511                 mobj = re.search(pattern, webpage)
3512                 if mobj:
3513                         uploader = mobj.group(1)
3514                 self.report_uploader(uploader)
3515
3516                 # Extract title
3517                 # Get the first line for title
3518                 video_title = u'NA'
3519                 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3520                 mobj = re.search(pattern, webpage)
3521                 if mobj:
3522                         video_title = mobj.group(1)
3523                 self.report_title(video_title)
3524
3525                 # Step 2, Stimulate clicking the image box to launch video
3526                 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3527                 mobj = re.search(pattern, webpage)
3528                 if mobj is None:
3529                         self._downloader.trouble(u'ERROR: unable to extract video page URL')
3530
3531                 video_page = mobj.group(1)
3532                 request = compat_urllib_request.Request(video_page)
3533                 try:
3534                         webpage = compat_urllib_request.urlopen(request).read()
3535                 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
3536                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3537                         return
3538                 self.report_extract_vid_page(video_page)
3539
3540
3541                 # Extract video links on video page
3542                 """Extract video links of all sizes"""
3543                 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3544                 mobj = re.findall(pattern, webpage)
3545                 if len(mobj) == 0:
3546                         self._downloader.trouble(u'ERROR: unable to extract video links')
3547
3548                 # Sort in resolution
3549                 links = sorted(mobj)
3550
3551                 # Choose the lowest of the sort, i.e. highest resolution
3552                 video_url = links[-1]
3553                 # Only get the url. The resolution part in the tuple has no use anymore
3554                 video_url = video_url[-1]
3555                 # Treat escaped \u0026 style hex
3556                 video_url = unicode(video_url, "unicode_escape")
3557
3558
3559                 return [{
3560                         'id':           video_id.decode('utf-8'),
3561                         'url':          video_url,
3562                         'uploader':     uploader.decode('utf-8'),
3563                         'upload_date':  upload_date.decode('utf-8'),
3564                         'title':        video_title.decode('utf-8'),
3565                         'ext':          video_extension.decode('utf-8'),
3566                 }]