2 # -*- coding: utf-8 -*-
15 import xml.etree.ElementTree
18 from urlparse import parse_qs
21 import cStringIO as StringIO
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include

    uploader: Nickname of the video uploader.
    ext: Video filename extension.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail: Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
    """
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # Register the FileDownloader that will consume extracted info dicts.
        self.set_downloader(downloader)
69 def suitable(self, url):
70 """Receives a URL and returns True if suitable for this IE."""
71 return re.match(self._VALID_URL, url) is not None
74 """Initializes an instance (authentication, etc)."""
76 self._real_initialize()
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # Template method: the real work happens in the subclass override.
        return self._real_extract(url)
84 def set_downloader(self, downloader):
85 """Sets the downloader for this IE."""
86 self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # Intentionally a no-op hook in the base class.
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # Intentionally a no-op hook in the base class.
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): the lines below are the body of the verbose _VALID_URL
    # pattern; it is matched with re.VERBOSE (see suitable() below).
    (?:https?://)? # http(s):// (optional)
    (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
    tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
    (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
    (?: # the various things that can precede the ID:
    (?:(?:v|embed|e)/) # v/ or embed/ or e/
    |(?: # or the v= param in all its forms
    (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
    (?:\?|\#!?) # the params delimiter ? or # or #!
    (?:.+&)? # any other preceding param (like /?s=tuff&v=xxxx)
    )? # optional -> youtube.com/xxxx is OK
    )? # all until now is optional -> you can pass the naked ID
    ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
    (?(1).+)? # if we found the ID, everything can follow
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'  # machine key used for .netrc credential lookup
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag code -> container extension (default elsewhere is 'flv')
    _video_extensions = {
    '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag code -> human-readable resolution, used by _print_formats()
    _video_dimensions = {
157 def suitable(self, url):
158 """Receives a URL and returns True if suitable for this IE."""
159 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
161 def report_lang(self):
162 """Report attempt to set language."""
163 self._downloader.to_screen(u'[youtube] Setting language')
165 def report_login(self):
166 """Report attempt to log in."""
167 self._downloader.to_screen(u'[youtube] Logging in')
169 def report_age_confirmation(self):
170 """Report attempt to confirm age."""
171 self._downloader.to_screen(u'[youtube] Confirming age')
173 def report_video_webpage_download(self, video_id):
174 """Report attempt to download video webpage."""
175 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
177 def report_video_info_webpage_download(self, video_id):
178 """Report attempt to download video info webpage."""
179 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
181 def report_video_subtitles_download(self, video_id):
182 """Report attempt to download video info webpage."""
183 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
185 def report_information_extraction(self, video_id):
186 """Report attempt to extract video information."""
187 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
189 def report_unavailable_format(self, video_id, format):
190 """Report extracted video URL."""
191 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
193 def report_rtmp_download(self):
194 """Indicate the download will use the RTMP protocol."""
195 self._downloader.to_screen(u'[youtube] RTMP download detected')
    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's closed-caption XML into SRT subtitle text."""
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default cue length (seconds) when no dur attribute
            # NOTE(review): start must be a float by this point — confirm it is
            # converted from the regex string before this addition.
            end = start + float(dur)
            # Render HH:MM:SS,mmm timestamps as required by SRT.
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            # Each SRT cue: index line, time-range line, caption, blank line.
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
    def _print_formats(self, formats):
        """Print each available itag with its extension and dimensions."""
        print 'Available formats:'
        # Printed once per format code x; unknown codes fall back to flv/???.
        print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
    def _real_initialize(self):
        """Set English language, log in (explicit or .netrc credentials) and
        confirm age so that restricted videos can be retrieved."""
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # Credentials stored under the 'youtube' machine entry in ~/.netrc
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # Force the English interface so later metadata regexes match.
        request = urllib2.Request(self._LANG_URL)
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
            login_results = urllib2.urlopen(request).read()
            # If the login form is still present, authentication failed.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

        # Confirm age by POSTing the verification form.
            'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
    def _real_extract(self, url):
        """Extract uploader, title, stream URLs and (optionally) subtitles for
        a YouTube video and hand the resulting dict(s) to the downloader."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Get video webpage; has_verified=1 skips the age-gate interstitial.
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Unescape the \/ sequences embedded in the page's JSON.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several 'el' contexts until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # Uploader nickname
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # Title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')

        # Thumbnail (missing thumbnail is only a warning)
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else:   # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # Upload date: normalise separators, then try the known date formats.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # Description
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Closed captions: failures raise Trouble and are downgraded to
        # warnings by the handler below.
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            self.report_video_subtitles_download(video_id)
            request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                srt_list = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
            srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
            # Map language code -> track name.
            srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
            if not srt_lang_list:
                raise Trouble(u'WARNING: video has no closed captions')
            if self._downloader.params.get('subtitleslang', False):
                srt_lang = self._downloader.params.get('subtitleslang')
            elif 'en' in srt_lang_list:
                # Fall back to an arbitrary available language.
                srt_lang = srt_lang_list.keys()[0]
            if not srt_lang in srt_lang_list:
                raise Trouble(u'WARNING: no closed captions found in the specified language')
            request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                srt_xml = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                raise Trouble(u'WARNING: unable to download video subtitles')
            video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
        except Trouble as trouble:
            # Subtitle problems are warnings, never fatal.
            self._downloader.trouble(trouble[0])

        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            # NOTE(review): 'sig' is not checked by the filter above — confirm
            # every stream entry carries it, or this raises KeyError.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # One info dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            # Fields consumed by FileDownloader.process_info().
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Fetched by _real_initialize() before the family filter is disabled.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'
495 def __init__(self, downloader=None):
496 InfoExtractor.__init__(self, downloader)
498 def report_disclaimer(self):
499 """Report disclaimer retrieval."""
500 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
502 def report_age_confirmation(self):
503 """Report attempt to confirm age."""
504 self._downloader.to_screen(u'[metacafe] Confirming age')
506 def report_download_webpage(self, video_id):
507 """Report webpage download."""
508 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
510 def report_extraction(self, video_id):
511 """Report information extraction."""
512 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
    def _real_initialize(self):
        """Fetch the disclaimer page, then POST the family-filter/age form so
        that filtered videos become reachable."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

        # Confirm age: submit the over-18 form.
            'submit': "Continue - I'm over 18",
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
    def _real_extract(self, url):
        """Extract media URL, title and uploader from a Metacafe watch page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate embedded YouTube videos to the YouTube extractor.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # Fallback: the media URL lives inside the flashvars parameter.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Fields consumed by FileDownloader.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # (?i) because Dailymotion serves many country TLDs (.fr, .de, ...).
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
622 def __init__(self, downloader=None):
623 InfoExtractor.__init__(self, downloader)
625 def report_download_webpage(self, video_id):
626 """Report webpage download."""
627 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
629 def report_extraction(self, video_id):
630 """Report information extraction."""
631 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        """Extract the best-quality stream URL, title, uploader and date for a
        Dailymotion video."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Strip trailing title/query parts from the id segment.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # Disable the family filter so age-restricted pages are served.
        request.add_header('Cookie', 'family_filter=off')
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = urllib.unquote(mobj.group(1))

        # Probe qualities from best to worst; keep the first one present.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

        video_uploader = u'NA'
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            video_uploader = mobj.group(1)

        video_upload_date = u'NA'
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            # Page shows DD-MM-YYYY; reorder to YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # Fields consumed by FileDownloader.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    # Accepts the many national Google Video domains.
    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'
716 def __init__(self, downloader=None):
717 InfoExtractor.__init__(self, downloader)
719 def report_download_webpage(self, video_id):
720 """Report webpage download."""
721 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
723 def report_extraction(self, video_id):
724 """Report information extraction."""
725 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        """Extract stream URL, title and description from a Google Video page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = urllib.unquote(mobj.group(1))
            # Turn the literal backslash escapes into their real characters.
            mediaURL = mediaURL.replace('\\x3d', '\x3d')
            mediaURL = mediaURL.replace('\\x26', '\x26')

        mobj = re.search(r'<title>(.*)</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else:   # we need something to pass to process_info

        # Fields consumed by FileDownloader.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # Only direct .flv 'current=' links are supported by this pattern.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'
811 def __init__(self, downloader=None):
812 InfoExtractor.__init__(self, downloader)
814 def report_download_webpage(self, video_id):
815 """Report webpage download."""
816 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
818 def report_extraction(self, video_id):
819 """Report information extraction."""
820 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        """Extract the flv URL, title and uploader from a Photobucket page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))

        # Title and uploader come from the same <title> pattern.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # Fields consumed by FileDownloader.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    # (non-/watch/ URLs are rewritten in _real_extract before extraction)
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'
881 def __init__(self, downloader=None):
882 InfoExtractor.__init__(self, downloader)
884 def report_download_webpage(self, video_id):
885 """Report webpage download."""
886 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
888 def report_extraction(self, video_id):
889 """Report information extraction."""
890 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
    def _real_extract(self, url, new_video=True):
        """Extract stream URL and metadata from a Yahoo! Video page; URLs not
        in /watch/ form are first rewritten and processed recursively."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = urllib2.Request(url)
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            # Recurse once on the canonical URL; new_video=False marks the retry.
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        # NOTE(review): group(1) is the 'people|profile' path alternative, not
        # the display name captured by group(2) — confirm which is intended.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # Fields consumed by FileDownloader.
            'id': video_id.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
            # NOTE(review): 'thumbnail' appears twice in this dict; this later,
            # undecoded entry overwrites the decoded one above.
            'thumbnail': video_thumbnail,
1016 class VimeoIE(InfoExtractor):
1017 """Information extractor for vimeo.com."""
1019 # _VALID_URL matches Vimeo URLs
1020 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1023 def __init__(self, downloader=None):
1024 InfoExtractor.__init__(self, downloader)
1026 def report_download_webpage(self, video_id):
1027 """Report webpage download."""
1028 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1030 def report_extraction(self, video_id):
1031 """Report information extraction."""
1032 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1034 def _real_extract(self, url, new_video=True):
1035 # Extract ID from URL
1036 mobj = re.match(self._VALID_URL, url)
1038 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1041 video_id = mobj.group(1)
1043 # Retrieve video webpage to extract further information
1044 request = urllib2.Request(url, None, std_headers)
1046 self.report_download_webpage(video_id)
1047 webpage = urllib2.urlopen(request).read()
1048 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1049 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1052 # Now we begin extracting as much information as we can from what we
1053 # retrieved. First we extract the information common to all extractors,
1054 # and latter we extract those that are Vimeo specific.
1055 self.report_extraction(video_id)
1057 # Extract the config JSON
1058 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1060 config = json.loads(config)
1062 self._downloader.trouble(u'ERROR: unable to extract info section')
1066 video_title = config["video"]["title"]
1069 video_uploader = config["video"]["owner"]["name"]
1071 # Extract video thumbnail
1072 video_thumbnail = config["video"]["thumbnail"]
1074 # Extract video description
1075 video_description = get_element_by_id("description", webpage.decode('utf8'))
1076 if video_description: video_description = clean_html(video_description)
1077 else: video_description = ''
1079 # Extract upload date
1080 video_upload_date = u'NA'
1081 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1082 if mobj is not None:
1083 video_upload_date = mobj.group(1)
1085 # Vimeo specific: extract request signature and timestamp
1086 sig = config['request']['signature']
1087 timestamp = config['request']['timestamp']
1089 # Vimeo specific: extract video codec and quality information
1090 # TODO bind to format param
1091 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1092 for codec in codecs:
1093 if codec[0] in config["video"]["files"]:
1094 video_codec = codec[0]
1095 video_extension = codec[1]
1096 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1097 else: quality = 'sd'
1100 self._downloader.trouble(u'ERROR: no known codec found')
1103 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1104 %(video_id, sig, timestamp, quality, video_codec.upper())
1109 'uploader': video_uploader,
1110 'upload_date': video_upload_date,
1111 'title': video_title,
1112 'ext': video_extension,
1113 'thumbnail': video_thumbnail,
1114 'description': video_description,
1119 class GenericIE(InfoExtractor):
1120 """Generic last-resort information extractor."""
1123 IE_NAME = u'generic'
1125 def __init__(self, downloader=None):
1126 InfoExtractor.__init__(self, downloader)
1128 def report_download_webpage(self, video_id):
1129 """Report webpage download."""
1130 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1131 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1133 def report_extraction(self, video_id):
1134 """Report information extraction."""
1135 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1137 def report_following_redirect(self, new_url):
1138 """Report information extraction."""
1139 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1141 def _test_redirect(self, url):
1142 """Check if it is a redirect, like url shorteners, in case restart chain."""
1143 class HeadRequest(urllib2.Request):
1144 def get_method(self):
1147 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1149 Subclass the HTTPRedirectHandler to make it use our
1150 HeadRequest also on the redirected URL
1152 def redirect_request(self, req, fp, code, msg, headers, newurl):
1153 if code in (301, 302, 303, 307):
1154 newurl = newurl.replace(' ', '%20')
1155 newheaders = dict((k,v) for k,v in req.headers.items()
1156 if k.lower() not in ("content-length", "content-type"))
1157 return HeadRequest(newurl,
1159 origin_req_host=req.get_origin_req_host(),
1162 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1164 class HTTPMethodFallback(urllib2.BaseHandler):
1166 Fallback to GET if HEAD is not allowed (405 HTTP error)
1168 def http_error_405(self, req, fp, code, msg, headers):
1172 newheaders = dict((k,v) for k,v in req.headers.items()
1173 if k.lower() not in ("content-length", "content-type"))
1174 return self.parent.open(urllib2.Request(req.get_full_url(),
1176 origin_req_host=req.get_origin_req_host(),
1180 opener = urllib2.OpenerDirector()
1181 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1182 HTTPMethodFallback, HEADRedirectHandler,
1183 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1184 opener.add_handler(handler())
1186 response = opener.open(HeadRequest(url))
1187 new_url = response.geturl()
1189 if url == new_url: return False
1191 self.report_following_redirect(new_url)
1192 self._downloader.download([new_url])
1195 def _real_extract(self, url):
1196 if self._test_redirect(url): return
1198 video_id = url.split('/')[-1]
1199 request = urllib2.Request(url)
1201 self.report_download_webpage(video_id)
1202 webpage = urllib2.urlopen(request).read()
1203 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1204 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1206 except ValueError, err:
1207 # since this is the last-resort InfoExtractor, if
1208 # this error is thrown, it'll be thrown here
1209 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1212 self.report_extraction(video_id)
1213 # Start with something easy: JW Player in SWFObject
1214 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1216 # Broaden the search a little bit
1217 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1219 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1222 # It's possible that one of the regexes
1223 # matched, but returned an empty group:
1224 if mobj.group(1) is None:
1225 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1228 video_url = urllib.unquote(mobj.group(1))
1229 video_id = os.path.basename(video_url)
1231 # here's a fun little line of code for you:
1232 video_extension = os.path.splitext(video_id)[1][1:]
1233 video_id = os.path.splitext(video_id)[0]
1235 # it's tempting to parse this further, but you would
1236 # have to take into account all the variations like
1237 # Video Title - Site Name
1238 # Site Name | Video Title
1239 # Video Title - Tagline | Site Name
1240 # and so on and so forth; it's just not practical
1241 mobj = re.search(r'<title>(.*)</title>', webpage)
1243 self._downloader.trouble(u'ERROR: unable to extract title')
1245 video_title = mobj.group(1).decode('utf-8')
1247 # video uploader is domain name
1248 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1250 self._downloader.trouble(u'ERROR: unable to extract title')
1252 video_uploader = mobj.group(1).decode('utf-8')
1255 'id': video_id.decode('utf-8'),
1256 'url': video_url.decode('utf-8'),
1257 'uploader': video_uploader,
1258 'upload_date': u'NA',
1259 'title': video_title,
1260 'ext': video_extension.decode('utf-8'),
1266 class YoutubeSearchIE(InfoExtractor):
1267 """Information Extractor for YouTube search queries."""
1268 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1269 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1270 _max_youtube_results = 1000
1271 IE_NAME = u'youtube:search'
1273 def __init__(self, downloader=None):
1274 InfoExtractor.__init__(self, downloader)
1276 def report_download_page(self, query, pagenum):
1277 """Report attempt to download search page with given number."""
1278 query = query.decode(preferredencoding())
1279 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1281 def _real_extract(self, query):
1282 mobj = re.match(self._VALID_URL, query)
1284 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1287 prefix, query = query.split(':')
1289 query = query.encode('utf-8')
1291 self._download_n_results(query, 1)
1293 elif prefix == 'all':
1294 self._download_n_results(query, self._max_youtube_results)
1300 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1302 elif n > self._max_youtube_results:
1303 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1304 n = self._max_youtube_results
1305 self._download_n_results(query, n)
1307 except ValueError: # parsing prefix as integer fails
1308 self._download_n_results(query, 1)
1311 def _download_n_results(self, query, n):
1312 """Downloads a specified number of results for a query"""
1318 while (50 * pagenum) < limit:
1319 self.report_download_page(query, pagenum+1)
1320 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1321 request = urllib2.Request(result_url)
1323 data = urllib2.urlopen(request).read()
1324 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1325 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1327 api_response = json.loads(data)['data']
1329 new_ids = list(video['id'] for video in api_response['items'])
1330 video_ids += new_ids
1332 limit = min(n, api_response['totalItems'])
1335 if len(video_ids) > n:
1336 video_ids = video_ids[:n]
1337 for id in video_ids:
1338 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1342 class GoogleSearchIE(InfoExtractor):
1343 """Information Extractor for Google Video search queries."""
1344 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1345 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1346 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1347 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1348 _max_google_results = 1000
1349 IE_NAME = u'video.google:search'
1351 def __init__(self, downloader=None):
1352 InfoExtractor.__init__(self, downloader)
1354 def report_download_page(self, query, pagenum):
1355 """Report attempt to download playlist page with given number."""
1356 query = query.decode(preferredencoding())
1357 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1359 def _real_extract(self, query):
1360 mobj = re.match(self._VALID_URL, query)
1362 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1365 prefix, query = query.split(':')
1367 query = query.encode('utf-8')
1369 self._download_n_results(query, 1)
1371 elif prefix == 'all':
1372 self._download_n_results(query, self._max_google_results)
1378 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1380 elif n > self._max_google_results:
1381 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1382 n = self._max_google_results
1383 self._download_n_results(query, n)
1385 except ValueError: # parsing prefix as integer fails
1386 self._download_n_results(query, 1)
1389 def _download_n_results(self, query, n):
1390 """Downloads a specified number of results for a query"""
1396 self.report_download_page(query, pagenum)
1397 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1398 request = urllib2.Request(result_url)
1400 page = urllib2.urlopen(request).read()
1401 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1402 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1405 # Extract video identifiers
1406 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1407 video_id = mobj.group(1)
1408 if video_id not in video_ids:
1409 video_ids.append(video_id)
1410 if len(video_ids) == n:
1411 # Specified n videos reached
1412 for id in video_ids:
1413 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1416 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1417 for id in video_ids:
1418 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1421 pagenum = pagenum + 1
1424 class YahooSearchIE(InfoExtractor):
1425 """Information Extractor for Yahoo! Video search queries."""
1426 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1427 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1428 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1429 _MORE_PAGES_INDICATOR = r'\s*Next'
1430 _max_yahoo_results = 1000
1431 IE_NAME = u'video.yahoo:search'
1433 def __init__(self, downloader=None):
1434 InfoExtractor.__init__(self, downloader)
1436 def report_download_page(self, query, pagenum):
1437 """Report attempt to download playlist page with given number."""
1438 query = query.decode(preferredencoding())
1439 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1441 def _real_extract(self, query):
1442 mobj = re.match(self._VALID_URL, query)
1444 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1447 prefix, query = query.split(':')
1449 query = query.encode('utf-8')
1451 self._download_n_results(query, 1)
1453 elif prefix == 'all':
1454 self._download_n_results(query, self._max_yahoo_results)
1460 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1462 elif n > self._max_yahoo_results:
1463 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1464 n = self._max_yahoo_results
1465 self._download_n_results(query, n)
1467 except ValueError: # parsing prefix as integer fails
1468 self._download_n_results(query, 1)
1471 def _download_n_results(self, query, n):
1472 """Downloads a specified number of results for a query"""
1475 already_seen = set()
1479 self.report_download_page(query, pagenum)
1480 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1481 request = urllib2.Request(result_url)
1483 page = urllib2.urlopen(request).read()
1484 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1485 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1488 # Extract video identifiers
1489 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1490 video_id = mobj.group(1)
1491 if video_id not in already_seen:
1492 video_ids.append(video_id)
1493 already_seen.add(video_id)
1494 if len(video_ids) == n:
1495 # Specified n videos reached
1496 for id in video_ids:
1497 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1500 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1501 for id in video_ids:
1502 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1505 pagenum = pagenum + 1
1508 class YoutubePlaylistIE(InfoExtractor):
1509 """Information Extractor for YouTube playlists."""
1511 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1512 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1513 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
1514 _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1515 IE_NAME = u'youtube:playlist'
1517 def __init__(self, downloader=None):
1518 InfoExtractor.__init__(self, downloader)
1520 def report_download_page(self, playlist_id, pagenum):
1521 """Report attempt to download playlist page with given number."""
1522 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1524 def _real_extract(self, url):
1525 # Extract playlist id
1526 mobj = re.match(self._VALID_URL, url)
1528 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1532 if mobj.group(3) is not None:
1533 self._downloader.download([mobj.group(3)])
1536 # Download playlist pages
1537 # prefix is 'p' as default for playlists but there are other types that need extra care
1538 playlist_prefix = mobj.group(1)
1539 if playlist_prefix == 'a':
1540 playlist_access = 'artist'
1542 playlist_prefix = 'p'
1543 playlist_access = 'view_play_list'
1544 playlist_id = mobj.group(2)
1549 self.report_download_page(playlist_id, pagenum)
1550 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1551 request = urllib2.Request(url)
1553 page = urllib2.urlopen(request).read()
1554 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1555 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1558 # Extract video identifiers
1560 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1561 if mobj.group(1) not in ids_in_page:
1562 ids_in_page.append(mobj.group(1))
1563 video_ids.extend(ids_in_page)
1565 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1567 pagenum = pagenum + 1
1569 playliststart = self._downloader.params.get('playliststart', 1) - 1
1570 playlistend = self._downloader.params.get('playlistend', -1)
1571 if playlistend == -1:
1572 video_ids = video_ids[playliststart:]
1574 video_ids = video_ids[playliststart:playlistend]
1576 for id in video_ids:
1577 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1581 class YoutubeChannelIE(InfoExtractor):
1582 """Information Extractor for YouTube channels."""
1584 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1585 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1586 _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
1587 IE_NAME = u'youtube:channel'
1589 def report_download_page(self, channel_id, pagenum):
1590 """Report attempt to download channel page with given number."""
1591 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1593 def _real_extract(self, url):
1594 # Extract channel id
1595 mobj = re.match(self._VALID_URL, url)
1597 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1600 # Download channel pages
1601 channel_id = mobj.group(1)
1606 self.report_download_page(channel_id, pagenum)
1607 url = self._TEMPLATE_URL % (channel_id, pagenum)
1608 request = urllib2.Request(url)
1610 page = urllib2.urlopen(request).read()
1611 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1612 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1615 # Extract video identifiers
1617 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1618 if mobj.group(1) not in ids_in_page:
1619 ids_in_page.append(mobj.group(1))
1620 video_ids.extend(ids_in_page)
1622 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1624 pagenum = pagenum + 1
1626 for id in video_ids:
1627 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1631 class YoutubeUserIE(InfoExtractor):
1632 """Information Extractor for YouTube users."""
1634 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1635 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1636 _GDATA_PAGE_SIZE = 50
1637 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1638 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1639 IE_NAME = u'youtube:user'
1641 def __init__(self, downloader=None):
1642 InfoExtractor.__init__(self, downloader)
1644 def report_download_page(self, username, start_index):
1645 """Report attempt to download user page."""
1646 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1647 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1649 def _real_extract(self, url):
1651 mobj = re.match(self._VALID_URL, url)
1653 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1656 username = mobj.group(1)
1658 # Download video ids using YouTube Data API. Result size per
1659 # query is limited (currently to 50 videos) so we need to query
1660 # page by page until there are no video ids - it means we got
1667 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1668 self.report_download_page(username, start_index)
1670 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1673 page = urllib2.urlopen(request).read()
1674 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1675 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1678 # Extract video identifiers
1681 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1682 if mobj.group(1) not in ids_in_page:
1683 ids_in_page.append(mobj.group(1))
1685 video_ids.extend(ids_in_page)
1687 # A little optimization - if current page is not
1688 # "full", ie. does not contain PAGE_SIZE video ids then
1689 # we can assume that this page is the last one - there
1690 # are no more ids on further pages - no need to query
1693 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1698 all_ids_count = len(video_ids)
1699 playliststart = self._downloader.params.get('playliststart', 1) - 1
1700 playlistend = self._downloader.params.get('playlistend', -1)
1702 if playlistend == -1:
1703 video_ids = video_ids[playliststart:]
1705 video_ids = video_ids[playliststart:playlistend]
1707 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1708 (username, all_ids_count, len(video_ids)))
1710 for video_id in video_ids:
1711 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1714 class BlipTVUserIE(InfoExtractor):
1715 """Information Extractor for blip.tv users."""
1717 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1719 IE_NAME = u'blip.tv:user'
1721 def __init__(self, downloader=None):
1722 InfoExtractor.__init__(self, downloader)
1724 def report_download_page(self, username, pagenum):
1725 """Report attempt to download user page."""
1726 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1727 (self.IE_NAME, username, pagenum))
1729 def _real_extract(self, url):
1731 mobj = re.match(self._VALID_URL, url)
1733 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1736 username = mobj.group(1)
1738 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1740 request = urllib2.Request(url)
1743 page = urllib2.urlopen(request).read().decode('utf-8')
1744 mobj = re.search(r'data-users-id="([^"]+)"', page)
1745 page_base = page_base % mobj.group(1)
1746 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1747 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1751 # Download video ids using BlipTV Ajax calls. Result size per
1752 # query is limited (currently to 12 videos) so we need to query
1753 # page by page until there are no video ids - it means we got
1760 self.report_download_page(username, pagenum)
1762 request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1765 page = urllib2.urlopen(request).read().decode('utf-8')
1766 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1767 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1770 # Extract video identifiers
1773 for mobj in re.finditer(r'href="/([^"]+)"', page):
1774 if mobj.group(1) not in ids_in_page:
1775 ids_in_page.append(unescapeHTML(mobj.group(1)))
1777 video_ids.extend(ids_in_page)
1779 # A little optimization - if current page is not
1780 # "full", ie. does not contain PAGE_SIZE video ids then
1781 # we can assume that this page is the last one - there
1782 # are no more ids on further pages - no need to query
1785 if len(ids_in_page) < self._PAGE_SIZE:
1790 all_ids_count = len(video_ids)
1791 playliststart = self._downloader.params.get('playliststart', 1) - 1
1792 playlistend = self._downloader.params.get('playlistend', -1)
1794 if playlistend == -1:
1795 video_ids = video_ids[playliststart:]
1797 video_ids = video_ids[playliststart:playlistend]
1799 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1800 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1802 for video_id in video_ids:
1803 self._downloader.download([u'http://blip.tv/'+video_id])
1806 class DepositFilesIE(InfoExtractor):
1807 """Information extractor for depositfiles.com"""
1809 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1810 IE_NAME = u'DepositFiles'
1812 def __init__(self, downloader=None):
1813 InfoExtractor.__init__(self, downloader)
1815 def report_download_webpage(self, file_id):
1816 """Report webpage download."""
1817 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1819 def report_extraction(self, file_id):
1820 """Report information extraction."""
1821 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1823 def _real_extract(self, url):
1824 file_id = url.split('/')[-1]
1825 # Rebuild url in english locale
1826 url = 'http://depositfiles.com/en/files/' + file_id
1828 # Retrieve file webpage with 'Free download' button pressed
1829 free_download_indication = { 'gateway_result' : '1' }
1830 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1832 self.report_download_webpage(file_id)
1833 webpage = urllib2.urlopen(request).read()
1834 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1835 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1838 # Search for the real file URL
1839 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1840 if (mobj is None) or (mobj.group(1) is None):
1841 # Try to figure out reason of the error.
1842 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1843 if (mobj is not None) and (mobj.group(1) is not None):
1844 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1845 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1847 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1850 file_url = mobj.group(1)
1851 file_extension = os.path.splitext(file_url)[1][1:]
1853 # Search for file title
1854 mobj = re.search(r'<b title="(.*?)">', webpage)
1856 self._downloader.trouble(u'ERROR: unable to extract title')
1858 file_title = mobj.group(1).decode('utf-8')
1861 'id': file_id.decode('utf-8'),
1862 'url': file_url.decode('utf-8'),
1864 'upload_date': u'NA',
1865 'title': file_title,
1866 'ext': file_extension.decode('utf-8'),
1872 class FacebookIE(InfoExtractor):
1873 """Information Extractor for Facebook"""
1875 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1876 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1877 _NETRC_MACHINE = 'facebook'
1878 _available_formats = ['video', 'highqual', 'lowqual']
1879 _video_extensions = {
1884 IE_NAME = u'facebook'
1886 def __init__(self, downloader=None):
1887 InfoExtractor.__init__(self, downloader)
1889 def _reporter(self, message):
1890 """Add header and report message."""
1891 self._downloader.to_screen(u'[facebook] %s' % message)
1893 def report_login(self):
1894 """Report attempt to log in."""
1895 self._reporter(u'Logging in')
1897 def report_video_webpage_download(self, video_id):
1898 """Report attempt to download video webpage."""
1899 self._reporter(u'%s: Downloading video webpage' % video_id)
1901 def report_information_extraction(self, video_id):
1902 """Report attempt to extract video information."""
1903 self._reporter(u'%s: Extracting video information' % video_id)
1905 def _parse_page(self, video_webpage):
1906 """Extract video information from page"""
1908 data = {'title': r'\("video_title", "(.*?)"\)',
1909 'description': r'<div class="datawrap">(.*?)</div>',
1910 'owner': r'\("video_owner_name", "(.*?)"\)',
1911 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1914 for piece in data.keys():
1915 mobj = re.search(data[piece], video_webpage)
1916 if mobj is not None:
1917 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1921 for fmt in self._available_formats:
1922 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1923 if mobj is not None:
1924 # URL is in a Javascript segment inside an escaped Unicode format within
1925 # the generally utf-8 page
1926 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1927 video_info['video_urls'] = video_urls
1931 def _real_initialize(self):
1932 if self._downloader is None:
1937 downloader_params = self._downloader.params
1939 # Attempt to use provided username and password or .netrc data
1940 if downloader_params.get('username', None) is not None:
1941 useremail = downloader_params['username']
1942 password = downloader_params['password']
1943 elif downloader_params.get('usenetrc', False):
1945 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1946 if info is not None:
1950 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1951 except (IOError, netrc.NetrcParseError), err:
1952 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1955 if useremail is None:
1964 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1967 login_results = urllib2.urlopen(request).read()
1968 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1969 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1971 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1972 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1975 def _real_extract(self, url):
1976 mobj = re.match(self._VALID_URL, url)
1978 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1980 video_id = mobj.group('ID')
1983 self.report_video_webpage_download(video_id)
1984 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
1986 page = urllib2.urlopen(request)
1987 video_webpage = page.read()
1988 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1989 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1992 # Start extracting information
1993 self.report_information_extraction(video_id)
1995 # Extract information
1996 video_info = self._parse_page(video_webpage)
1999 if 'owner' not in video_info:
2000 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2002 video_uploader = video_info['owner']
2005 if 'title' not in video_info:
2006 self._downloader.trouble(u'ERROR: unable to extract video title')
2008 video_title = video_info['title']
2009 video_title = video_title.decode('utf-8')
2012 if 'thumbnail' not in video_info:
2013 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2014 video_thumbnail = ''
2016 video_thumbnail = video_info['thumbnail']
2020 if 'upload_date' in video_info:
2021 upload_time = video_info['upload_date']
2022 timetuple = email.utils.parsedate_tz(upload_time)
2023 if timetuple is not None:
2025 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2030 video_description = video_info.get('description', 'No description available.')
2032 url_map = video_info['video_urls']
2033 if len(url_map.keys()) > 0:
2034 # Decide which formats to download
2035 req_format = self._downloader.params.get('format', None)
2036 format_limit = self._downloader.params.get('format_limit', None)
2038 if format_limit is not None and format_limit in self._available_formats:
2039 format_list = self._available_formats[self._available_formats.index(format_limit):]
2041 format_list = self._available_formats
2042 existing_formats = [x for x in format_list if x in url_map]
2043 if len(existing_formats) == 0:
2044 self._downloader.trouble(u'ERROR: no known formats available for video')
2046 if req_format is None:
2047 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2048 elif req_format == 'worst':
2049 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2050 elif req_format == '-1':
2051 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2054 if req_format not in url_map:
2055 self._downloader.trouble(u'ERROR: requested format not available')
2057 video_url_list = [(req_format, url_map[req_format])] # Specific format
2060 for format_param, video_real_url in video_url_list:
2062 video_extension = self._video_extensions.get(format_param, 'mp4')
2065 'id': video_id.decode('utf-8'),
2066 'url': video_real_url.decode('utf-8'),
2067 'uploader': video_uploader.decode('utf-8'),
2068 'upload_date': upload_date,
2069 'title': video_title,
2070 'ext': video_extension.decode('utf-8'),
2071 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2072 'thumbnail': video_thumbnail.decode('utf-8'),
2073 'description': video_description.decode('utf-8'),
2078 class BlipTVIE(InfoExtractor):
2079 """Information extractor for blip.tv"""
2081 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2082 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2083 IE_NAME = u'blip.tv'
2085 def report_extraction(self, file_id):
2086 """Report information extraction."""
2087 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2089 def report_direct_download(self, title):
2090 """Report information extraction."""
2091 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2093 def _real_extract(self, url):
2094 mobj = re.match(self._VALID_URL, url)
2096 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2103 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2104 request = urllib2.Request(json_url.encode('utf-8'))
2105 self.report_extraction(mobj.group(1))
2108 urlh = urllib2.urlopen(request)
2109 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2110 basename = url.split('/')[-1]
2111 title,ext = os.path.splitext(basename)
2112 title = title.decode('UTF-8')
2113 ext = ext.replace('.', '')
2114 self.report_direct_download(title)
2122 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2123 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2125 if info is None: # Regular URL
2127 json_code = urlh.read()
2128 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2129 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2133 json_data = json.loads(json_code)
2134 if 'Post' in json_data:
2135 data = json_data['Post']
2139 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2140 video_url = data['media']['url']
2141 umobj = re.match(self._URL_EXT, video_url)
2143 raise ValueError('Can not determine filename extension')
2144 ext = umobj.group(1)
2147 'id': data['item_id'],
2149 'uploader': data['display_name'],
2150 'upload_date': upload_date,
2151 'title': data['title'],
2153 'format': data['media']['mimeType'],
2154 'thumbnail': data['thumbnailUrl'],
2155 'description': data['description'],
2156 'player_url': data['embedUrl']
2158 except (ValueError,KeyError), err:
2159 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2162 std_headers['User-Agent'] = 'iTunes/10.6.1'
2166 class MyVideoIE(InfoExtractor):
2167 """Information Extractor for myvideo.de."""
2169 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2170 IE_NAME = u'myvideo'
2172 def __init__(self, downloader=None):
2173 InfoExtractor.__init__(self, downloader)
2175 def report_download_webpage(self, video_id):
2176 """Report webpage download."""
2177 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2179 def report_extraction(self, video_id):
2180 """Report information extraction."""
2181 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2183 def _real_extract(self,url):
2184 mobj = re.match(self._VALID_URL, url)
2186 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2189 video_id = mobj.group(1)
2192 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2194 self.report_download_webpage(video_id)
2195 webpage = urllib2.urlopen(request).read()
2196 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2197 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2200 self.report_extraction(video_id)
2201 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2204 self._downloader.trouble(u'ERROR: unable to extract media URL')
2206 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2208 mobj = re.search('<title>([^<]+)</title>', webpage)
2210 self._downloader.trouble(u'ERROR: unable to extract title')
2213 video_title = mobj.group(1)
2219 'upload_date': u'NA',
2220 'title': video_title,
2226 class ComedyCentralIE(InfoExtractor):
2227 """Information extractor for The Daily Show and Colbert Report """
2229 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2230 IE_NAME = u'comedycentral'
2232 def report_extraction(self, episode_id):
2233 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2235 def report_config_download(self, episode_id):
2236 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2238 def report_index_download(self, episode_id):
2239 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2241 def report_player_url(self, episode_id):
2242 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2244 def _real_extract(self, url):
2245 mobj = re.match(self._VALID_URL, url)
2247 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2250 if mobj.group('shortname'):
2251 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2252 url = u'http://www.thedailyshow.com/full-episodes/'
2254 url = u'http://www.colbertnation.com/full-episodes/'
2255 mobj = re.match(self._VALID_URL, url)
2256 assert mobj is not None
2258 dlNewest = not mobj.group('episode')
2260 epTitle = mobj.group('showname')
2262 epTitle = mobj.group('episode')
2264 req = urllib2.Request(url)
2265 self.report_extraction(epTitle)
2267 htmlHandle = urllib2.urlopen(req)
2268 html = htmlHandle.read()
2269 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2270 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2273 url = htmlHandle.geturl()
2274 mobj = re.match(self._VALID_URL, url)
2276 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2278 if mobj.group('episode') == '':
2279 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2281 epTitle = mobj.group('episode')
2283 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2284 if len(mMovieParams) == 0:
2285 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2288 playerUrl_raw = mMovieParams[0][0]
2289 self.report_player_url(epTitle)
2291 urlHandle = urllib2.urlopen(playerUrl_raw)
2292 playerUrl = urlHandle.geturl()
2293 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2294 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
2297 uri = mMovieParams[0][1]
2298 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2299 self.report_index_download(epTitle)
2301 indexXml = urllib2.urlopen(indexUrl).read()
2302 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2303 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2308 idoc = xml.etree.ElementTree.fromstring(indexXml)
2309 itemEls = idoc.findall('.//item')
2310 for itemEl in itemEls:
2311 mediaId = itemEl.findall('./guid')[0].text
2312 shortMediaId = mediaId.split(':')[-1]
2313 showId = mediaId.split(':')[-2].replace('.com', '')
2314 officialTitle = itemEl.findall('./title')[0].text
2315 officialDate = itemEl.findall('./pubDate')[0].text
2317 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2318 urllib.urlencode({'uri': mediaId}))
2319 configReq = urllib2.Request(configUrl)
2320 self.report_config_download(epTitle)
2322 configXml = urllib2.urlopen(configReq).read()
2323 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2324 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2327 cdoc = xml.etree.ElementTree.fromstring(configXml)
2329 for rendition in cdoc.findall('.//rendition'):
2330 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2334 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2337 # For now, just pick the highest bitrate
2338 format,video_url = turls[-1]
2340 effTitle = showId + u'-' + epTitle
2345 'upload_date': officialDate,
2350 'description': officialTitle,
2351 'player_url': playerUrl
2354 results.append(info)
2359 class EscapistIE(InfoExtractor):
2360 """Information extractor for The Escapist """
2362 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2363 IE_NAME = u'escapist'
2365 def report_extraction(self, showName):
2366 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2368 def report_config_download(self, showName):
2369 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2371 def _real_extract(self, url):
2372 mobj = re.match(self._VALID_URL, url)
2374 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2376 showName = mobj.group('showname')
2377 videoId = mobj.group('episode')
2379 self.report_extraction(showName)
2381 webPage = urllib2.urlopen(url)
2382 webPageBytes = webPage.read()
2383 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2384 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2385 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2386 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2389 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2390 description = unescapeHTML(descMatch.group(1))
2391 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2392 imgUrl = unescapeHTML(imgMatch.group(1))
2393 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2394 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2395 configUrlMatch = re.search('config=(.*)$', playerUrl)
2396 configUrl = urllib2.unquote(configUrlMatch.group(1))
2398 self.report_config_download(showName)
2400 configJSON = urllib2.urlopen(configUrl).read()
2401 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2402 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2405 # Technically, it's JavaScript, not JSON
2406 configJSON = configJSON.replace("'", '"')
2409 config = json.loads(configJSON)
2410 except (ValueError,), err:
2411 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2414 playlist = config['playlist']
2415 videoUrl = playlist[1]['url']
2420 'uploader': showName,
2421 'upload_date': None,
2425 'thumbnail': imgUrl,
2426 'description': description,
2427 'player_url': playerUrl,
2433 class CollegeHumorIE(InfoExtractor):
2434 """Information extractor for collegehumor.com"""
2436 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2437 IE_NAME = u'collegehumor'
2439 def report_webpage(self, video_id):
2440 """Report information extraction."""
2441 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2443 def report_extraction(self, video_id):
2444 """Report information extraction."""
2445 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2447 def _real_extract(self, url):
2448 mobj = re.match(self._VALID_URL, url)
2450 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2452 video_id = mobj.group('videoid')
2454 self.report_webpage(video_id)
2455 request = urllib2.Request(url)
2457 webpage = urllib2.urlopen(request).read()
2458 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2459 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2462 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2464 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2466 internal_video_id = m.group('internalvideoid')
2470 'internal_id': internal_video_id,
2473 self.report_extraction(video_id)
2474 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2476 metaXml = urllib2.urlopen(xmlUrl).read()
2477 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2478 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2481 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2483 videoNode = mdoc.findall('./video')[0]
2484 info['description'] = videoNode.findall('./description')[0].text
2485 info['title'] = videoNode.findall('./caption')[0].text
2486 info['url'] = videoNode.findall('./file')[0].text
2487 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2488 info['ext'] = info['url'].rpartition('.')[2]
2489 info['format'] = info['ext']
2491 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2497 class XVideosIE(InfoExtractor):
2498 """Information extractor for xvideos.com"""
2500 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2501 IE_NAME = u'xvideos'
2503 def report_webpage(self, video_id):
2504 """Report information extraction."""
2505 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2507 def report_extraction(self, video_id):
2508 """Report information extraction."""
2509 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2511 def _real_extract(self, url):
2512 mobj = re.match(self._VALID_URL, url)
2514 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2516 video_id = mobj.group(1).decode('utf-8')
2518 self.report_webpage(video_id)
2520 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2522 webpage = urllib2.urlopen(request).read()
2523 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2524 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2527 self.report_extraction(video_id)
2531 mobj = re.search(r'flv_url=(.+?)&', webpage)
2533 self._downloader.trouble(u'ERROR: unable to extract video url')
2535 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2539 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2541 self._downloader.trouble(u'ERROR: unable to extract video title')
2543 video_title = mobj.group(1).decode('utf-8')
2546 # Extract video thumbnail
2547 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2549 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2551 video_thumbnail = mobj.group(0).decode('utf-8')
2557 'upload_date': None,
2558 'title': video_title,
2561 'thumbnail': video_thumbnail,
2562 'description': None,
2569 class SoundcloudIE(InfoExtractor):
2570 """Information extractor for soundcloud.com
2571 To access the media, the uid of the song and a stream token
2572 must be extracted from the page source and the script must make
2573 a request to media.soundcloud.com/crossdomain.xml. Then
2574 the media can be grabbed by requesting from an url composed
2575 of the stream token and uid
2578 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2579 IE_NAME = u'soundcloud'
2581 def __init__(self, downloader=None):
2582 InfoExtractor.__init__(self, downloader)
2584 def report_webpage(self, video_id):
2585 """Report information extraction."""
2586 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2588 def report_extraction(self, video_id):
2589 """Report information extraction."""
2590 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2592 def _real_extract(self, url):
2593 mobj = re.match(self._VALID_URL, url)
2595 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2598 # extract uploader (which is in the url)
2599 uploader = mobj.group(1).decode('utf-8')
2600 # extract simple title (uploader + slug of song title)
2601 slug_title = mobj.group(2).decode('utf-8')
2602 simple_title = uploader + u'-' + slug_title
2604 self.report_webpage('%s/%s' % (uploader, slug_title))
2606 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2608 webpage = urllib2.urlopen(request).read()
2609 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2610 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2613 self.report_extraction('%s/%s' % (uploader, slug_title))
2615 # extract uid and stream token that soundcloud hands out for access
2616 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2618 video_id = mobj.group(1)
2619 stream_token = mobj.group(2)
2621 # extract unsimplified title
2622 mobj = re.search('"title":"(.*?)",', webpage)
2624 title = mobj.group(1).decode('utf-8')
2626 title = simple_title
2628 # construct media url (with uid/token)
2629 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2630 mediaURL = mediaURL % (video_id, stream_token)
2633 description = u'No description available'
2634 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2636 description = mobj.group(1)
2640 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2643 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2644 except Exception, e:
2645 self._downloader.to_stderr(str(e))
2647 # for soundcloud, a request to a cross domain is required for cookies
2648 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2651 'id': video_id.decode('utf-8'),
2653 'uploader': uploader.decode('utf-8'),
2654 'upload_date': upload_date,
2659 'description': description.decode('utf-8')
2663 class InfoQIE(InfoExtractor):
2664 """Information extractor for infoq.com"""
2666 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2669 def report_webpage(self, video_id):
2670 """Report information extraction."""
2671 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2673 def report_extraction(self, video_id):
2674 """Report information extraction."""
2675 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2677 def _real_extract(self, url):
2678 mobj = re.match(self._VALID_URL, url)
2680 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2683 self.report_webpage(url)
2685 request = urllib2.Request(url)
2687 webpage = urllib2.urlopen(request).read()
2688 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2689 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2692 self.report_extraction(url)
2696 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2698 self._downloader.trouble(u'ERROR: unable to extract video url')
2700 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2704 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2706 self._downloader.trouble(u'ERROR: unable to extract video title')
2708 video_title = mobj.group(1).decode('utf-8')
2710 # Extract description
2711 video_description = u'No description available.'
2712 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2713 if mobj is not None:
2714 video_description = mobj.group(1).decode('utf-8')
2716 video_filename = video_url.split('/')[-1]
2717 video_id, extension = video_filename.split('.')
2723 'upload_date': None,
2724 'title': video_title,
2726 'format': extension, # Extension is always(?) mp4, but seems to be flv
2728 'description': video_description,
2734 class MixcloudIE(InfoExtractor):
2735 """Information extractor for www.mixcloud.com"""
2736 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2737 IE_NAME = u'mixcloud'
2739 def __init__(self, downloader=None):
2740 InfoExtractor.__init__(self, downloader)
2742 def report_download_json(self, file_id):
2743 """Report JSON download."""
2744 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2746 def report_extraction(self, file_id):
2747 """Report information extraction."""
2748 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2750 def get_urls(self, jsonData, fmt, bitrate='best'):
2751 """Get urls from 'audio_formats' section in json"""
2754 bitrate_list = jsonData[fmt]
2755 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2756 bitrate = max(bitrate_list) # select highest
2758 url_list = jsonData[fmt][bitrate]
2759 except TypeError: # we have no bitrate info.
2760 url_list = jsonData[fmt]
2763 def check_urls(self, url_list):
2764 """Returns 1st active url from list"""
2765 for url in url_list:
2767 urllib2.urlopen(url)
2769 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2774 def _print_formats(self, formats):
2775 print 'Available formats:'
2776 for fmt in formats.keys():
2777 for b in formats[fmt]:
2779 ext = formats[fmt][b][0]
2780 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2781 except TypeError: # we have no bitrate info
2782 ext = formats[fmt][0]
2783 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2786 def _real_extract(self, url):
2787 mobj = re.match(self._VALID_URL, url)
2789 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2791 # extract uploader & filename from url
2792 uploader = mobj.group(1).decode('utf-8')
2793 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2795 # construct API request
2796 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2797 # retrieve .json file with links to files
2798 request = urllib2.Request(file_url)
2800 self.report_download_json(file_url)
2801 jsonData = urllib2.urlopen(request).read()
2802 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2803 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2807 json_data = json.loads(jsonData)
2808 player_url = json_data['player_swf_url']
2809 formats = dict(json_data['audio_formats'])
2811 req_format = self._downloader.params.get('format', None)
2814 if self._downloader.params.get('listformats', None):
2815 self._print_formats(formats)
2818 if req_format is None or req_format == 'best':
2819 for format_param in formats.keys():
2820 url_list = self.get_urls(formats, format_param)
2822 file_url = self.check_urls(url_list)
2823 if file_url is not None:
2826 if req_format not in formats.keys():
2827 self._downloader.trouble(u'ERROR: format is not available')
2830 url_list = self.get_urls(formats, req_format)
2831 file_url = self.check_urls(url_list)
2832 format_param = req_format
2835 'id': file_id.decode('utf-8'),
2836 'url': file_url.decode('utf-8'),
2837 'uploader': uploader.decode('utf-8'),
2838 'upload_date': u'NA',
2839 'title': json_data['name'],
2840 'ext': file_url.split('.')[-1].decode('utf-8'),
2841 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2842 'thumbnail': json_data['thumbnail_url'],
2843 'description': json_data['description'],
2844 'player_url': player_url.decode('utf-8'),
2847 class StanfordOpenClassroomIE(InfoExtractor):
2848 """Information extractor for Stanford's Open ClassRoom"""
2850 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2851 IE_NAME = u'stanfordoc'
2853 def report_download_webpage(self, objid):
2854 """Report information extraction."""
2855 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2857 def report_extraction(self, video_id):
2858 """Report information extraction."""
2859 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2861 def _real_extract(self, url):
2862 mobj = re.match(self._VALID_URL, url)
2864 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2867 if mobj.group('course') and mobj.group('video'): # A specific video
2868 course = mobj.group('course')
2869 video = mobj.group('video')
2871 'id': course + '_' + video,
2874 self.report_extraction(info['id'])
2875 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2876 xmlUrl = baseUrl + video + '.xml'
2878 metaXml = urllib2.urlopen(xmlUrl).read()
2879 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2880 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2882 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2884 info['title'] = mdoc.findall('./title')[0].text
2885 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2887 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2889 info['ext'] = info['url'].rpartition('.')[2]
2890 info['format'] = info['ext']
2892 elif mobj.group('course'): # A course page
2893 course = mobj.group('course')
2899 self.report_download_webpage(info['id'])
2901 coursepage = urllib2.urlopen(url).read()
2902 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2903 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2906 m = re.search('<h1>([^<]+)</h1>', coursepage)
2908 info['title'] = unescapeHTML(m.group(1))
2910 info['title'] = info['id']
2912 m = re.search('<description>([^<]+)</description>', coursepage)
2914 info['description'] = unescapeHTML(m.group(1))
2916 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2919 'type': 'reference',
2920 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2924 for entry in info['list']:
2925 assert entry['type'] == 'reference'
2926 results += self.extract(entry['url'])
2931 'id': 'Stanford OpenClassroom',
2935 self.report_download_webpage(info['id'])
2936 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2938 rootpage = urllib2.urlopen(rootURL).read()
2939 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2940 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2943 info['title'] = info['id']
2945 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2948 'type': 'reference',
2949 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2954 for entry in info['list']:
2955 assert entry['type'] == 'reference'
2956 results += self.extract(entry['url'])
2959 class MTVIE(InfoExtractor):
2960 """Information extractor for MTV.com"""
2962 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2965 def report_webpage(self, video_id):
2966 """Report information extraction."""
2967 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2969 def report_extraction(self, video_id):
2970 """Report information extraction."""
2971 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2973 def _real_extract(self, url):
2974 mobj = re.match(self._VALID_URL, url)
2976 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2978 if not mobj.group('proto'):
2979 url = 'http://' + url
2980 video_id = mobj.group('videoid')
2981 self.report_webpage(video_id)
2983 request = urllib2.Request(url)
2985 webpage = urllib2.urlopen(request).read()
2986 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2987 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2990 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2992 self._downloader.trouble(u'ERROR: unable to extract song name')
2994 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2995 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2997 self._downloader.trouble(u'ERROR: unable to extract performer')
2999 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3000 video_title = performer + ' - ' + song_name
3002 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3004 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3006 mtvn_uri = mobj.group(1)
3008 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3010 self._downloader.trouble(u'ERROR: unable to extract content id')
3012 content_id = mobj.group(1)
3014 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3015 self.report_extraction(video_id)
3016 request = urllib2.Request(videogen_url)
3018 metadataXml = urllib2.urlopen(request).read()
3019 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3020 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
3023 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3024 renditions = mdoc.findall('.//rendition')
3026 # For now, always pick the highest quality.
3027 rendition = renditions[-1]
3030 _,_,ext = rendition.attrib['type'].partition('/')
3031 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3032 video_url = rendition.find('./src').text
3034 self._downloader.trouble('Invalid rendition field.')
3040 'uploader': performer,
3041 'title': video_title,
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves a video as multiple .flv segments; _real_extract returns
    one info dictionary per segment ('<id>_part00', '<id>_part01', ...).
    NOTE(review): the source listing dropped several structural lines
    (try/except headers, returns, else-branches, list initialisers); they
    are restored here from the surrounding pattern — verify against
    upstream youtube-dl.
    """

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'Youku'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    def _gen_sid(self):
        """Build a pseudo-random session id: <millis><rand1><rand2>."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000, 1998)
        random2 = random.randint(1000, 9999)

        return "%d%d%d" % (nowTime, random1, random2)

    def _get_file_ID_mix_string(self, seed):
        """Shuffle the id alphabet with Youku's seeded generator.

        Returns the shuffled alphabet as a list of characters;
        _get_file_id() indexes into it to decode the real file id.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            # Linear-congruential step matching Youku's player code.
            seed = (seed * 211 + 30031) % 65536
            index = math.floor(seed / 65536 * len(source))
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode the '*'-separated fileId via the seeded alphabet."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Fetch the Youku playlist JSON and build per-segment info dicts."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = urllib2.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        self.report_extraction(video_id)

        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = config['data'][0]['streamfileids'].keys()

            # Map the user's requested quality onto a stream format key.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            seg_number = len(config['data'][0]['segs'][format])

            keys = []
            for i in xrange(seg_number):
                keys.append(config['data'][0]['segs'][format][i]['k'])

            # TODO check error
            # youku only could be viewed from mainland china
        except:
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # column 8,9 of fileid represent the segment number
        # fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
                'format': u'NA',
            }
            files_info.append(info)

        return files_info
3171 class XNXXIE(InfoExtractor):
3172 """Information extractor for xnxx.com"""
3174 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
3176 VIDEO_URL_RE = r'flv_url=(.*?)&'
3177 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3178 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3180 def report_webpage(self, video_id):
3181 """Report information extraction"""
3182 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3184 def report_extraction(self, video_id):
3185 """Report information extraction"""
3186 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3188 def _real_extract(self, url):
3189 mobj = re.match(self._VALID_URL, url)
3191 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3193 video_id = mobj.group(1).decode('utf-8')
3195 self.report_webpage(video_id)
3197 # Get webpage content
3199 webpage = urllib2.urlopen(url).read()
3200 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3201 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3204 result = re.search(self.VIDEO_URL_RE, webpage)
3206 self._downloader.trouble(u'ERROR: unable to extract video url')
3208 video_url = urllib.unquote(result.group(1).decode('utf-8'))
3210 result = re.search(self.VIDEO_TITLE_RE, webpage)
3212 self._downloader.trouble(u'ERROR: unable to extract video title')
3214 video_title = result.group(1).decode('utf-8')
3216 result = re.search(self.VIDEO_THUMB_RE, webpage)
3218 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3220 video_thumbnail = result.group(1).decode('utf-8')
3222 info = {'id': video_id,
3225 'upload_date': None,
3226 'title': video_title,
3229 'thumbnail': video_thumbnail,
3230 'description': None,
3236 class GooglePlusIE(InfoExtractor):
3237 """Information extractor for plus.google.com."""
3239 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
3240 IE_NAME = u'plus.google'
def __init__(self, downloader=None):
    """Create the extractor, wiring up the optional downloader."""
    InfoExtractor.__init__(self, downloader)
def report_extract_entry(self, url):
    """Announce that the post entry at *url* is being downloaded."""
    decoded = url.decode('utf-8')
    self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % decoded)
def report_date(self, upload_date):
    """Log the upload date found for the current entry."""
    line = u'[plus.google] Entry date: %s' % upload_date
    self._downloader.to_screen(line)
def report_uploader(self, uploader):
    """Log the uploader name for the current entry."""
    name = uploader.decode('utf-8')
    self._downloader.to_screen(u'[plus.google] Uploader: %s' % name)
def report_title(self, video_title):
    """Log the title found for the current entry."""
    title = video_title.decode('utf-8')
    self._downloader.to_screen(u'[plus.google] Title: %s' % title)
def report_extract_vid_page(self, video_page):
    """Log the video page being parsed for download links."""
    page = video_page.decode('utf-8')
    self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % page)
3265 def _real_extract(self, url):
3266 # Extract id from URL
3267 mobj = re.match(self._VALID_URL, url)
3269 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3272 post_url = mobj.group(0)
3273 video_id = mobj.group(2)
3275 video_extension = 'flv'
3277 # Step 1, Retrieve post webpage to extract further information
3278 self.report_extract_entry(post_url)
3279 request = urllib2.Request(post_url)
3281 webpage = urllib2.urlopen(request).read()
3282 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3283 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % str(err))
3286 # Extract update date
3288 pattern = 'title="Timestamp">(.*?)</a>'
3289 mobj = re.search(pattern, webpage)
3291 upload_date = mobj.group(1)
3292 # Convert timestring to a format suitable for filename
3293 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3294 upload_date = upload_date.strftime('%Y%m%d')
3295 self.report_date(upload_date)
3299 pattern = r'rel\="author".*?>(.*?)</a>'
3300 mobj = re.search(pattern, webpage)
3302 uploader = mobj.group(1)
3303 self.report_uploader(uploader)
3306 # Get the first line for title
3308 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3309 mobj = re.search(pattern, webpage)
3311 video_title = mobj.group(1)
3312 self.report_title(video_title)
3314 # Step 2, Stimulate clicking the image box to launch video
3315 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3316 mobj = re.search(pattern, webpage)
3318 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3320 video_page = mobj.group(1)
3321 request = urllib2.Request(video_page)
3323 webpage = urllib2.urlopen(request).read()
3324 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3325 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3327 self.report_extract_vid_page(video_page)
3330 # Extract video links on video page
3331 """Extract video links of all sizes"""
3332 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3333 mobj = re.findall(pattern, webpage)
3335 self._downloader.trouble(u'ERROR: unable to extract video links')
3337 # Sort in resolution
3338 links = sorted(mobj)
3340 # Choose the lowest of the sort, i.e. highest resolution
3341 video_url = links[-1]
3342 # Only get the url. The resolution part in the tuple has no use anymore
3343 video_url = video_url[-1]
3344 # Treat escaped \u0026 style hex
3345 video_url = unicode(video_url, "unicode_escape")
3349 'id': video_id.decode('utf-8'),
3351 'uploader': uploader.decode('utf-8'),
3352 'upload_date': upload_date.decode('utf-8'),
3353 'title': video_title.decode('utf-8'),
3354 'ext': video_extension.decode('utf-8'),