Correcting Travis CI configuration
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import datetime
5 import HTMLParser
6 import httplib
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import urllib
13 import urllib2
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 from urlparse import parse_qs
19
20 try:
21         import cStringIO as StringIO
22 except ImportError:
23         import StringIO
24
25 from utils import *
26
27
class InfoExtractor(object):
	"""Base class for all site-specific information extractors.

	An extractor turns a URL into a list of dictionaries describing the
	video(s) behind it.  Those dictionaries are handed to the
	FileDownloader, which may then download the video to the file system,
	among other possible outcomes.  Required keys:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	Optional keys, used only when their respective forced-printing
	functions are called (e.g. when youtube-dl backs a video search
	front end such as youtube2mp3):

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should define a _VALID_URL regexp and redefine the
	_real_initialize() and _real_extract() methods; they should probably
	also be added to the list of extractors.
	"""

	# Class-level defaults; instances shadow these in __init__/initialize().
	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Create an extractor, optionally attaching a downloader."""
		self.set_downloader(downloader)
		self._ready = False

	def set_downloader(self, downloader):
		"""Attach the FileDownloader this extractor reports to."""
		self._downloader = downloader

	def suitable(self, url):
		"""Return True if this extractor can handle the given URL."""
		return bool(re.match(self._VALID_URL, url))

	def initialize(self):
		"""Run one-time setup (authentication, etc) exactly once."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Initialize if needed, then return the extracted info dicts."""
		self.initialize()
		return self._real_extract(url)

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
95
96
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Verbose regexp: accepts watch/embed/short URLs as well as a naked
	# video ID, while rejecting playlist-style URLs.  Group 2 captures
	# the video ID; group 1 is used by the (?(1)...) conditional.
	_VALID_URL = r"""^
			 (
			     (?:https?://)?                                       # http(s):// (optional)
			     (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
			        tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
			     (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
			     (?:                                                  # the various things that can precede the ID:
			         (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
			         |(?:                                             # or the v= param in all its forms
			             (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
			             (?:\?|\#!?)                                  # the params delimiter ? or # or #!
			             (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
			             v=
			         )
			     )?                                                   # optional -> youtube.com/xxxx is OK
			 )?                                                       # all until now is optional -> you can pass the naked ID
			 ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
			 (?(1).+)?                                                # if we found the ID, everything can follow
			 $"""
	# Forces English UI so later scraping regexps (e.g. upload date) match.
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
	_available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
	# Maps itag -> container extension used when naming the output file;
	# anything not listed falls back to 'flv' at the call sites.
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'44': 'webm',
		'45': 'webm',
		'46': 'webm',
	}
	# Maps itag -> display size for --list-formats output.
	# NOTE(review): values appear to be height x width — confirm.
	_video_dimensions = {
		'5': '240x400',
		'6': '???',
		'13': '???',
		'17': '144x176',
		'18': '360x640',
		'22': '720x1280',
		'34': '360x640',
		'35': '480x854',
		'37': '1080x1920',
		'38': '3072x4096',
		'43': '360x640',
		'44': '480x854',
		'45': '720x1280',
		'46': '1080x1920',
	}
	IE_NAME = u'youtube'

	def suitable(self, url):
		"""Receives a URL and returns True if suitable for this IE."""
		# Overrides the base class so the _VALID_URL pattern (written with
		# embedded comments) is compiled with re.VERBOSE.
		return re.match(self._VALID_URL, url, re.VERBOSE) is not None

	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_screen(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_screen(u'[youtube] Logging in')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[youtube] Confirming age')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

	def report_video_subtitles_download(self, video_id):
		"""Report attempt to download video subtitles."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

	def report_unavailable_format(self, video_id, format):
		"""Report that the requested format is not available."""
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_screen(u'[youtube] RTMP download detected')

	def _closed_captions_xml_to_srt(self, xml_string):
		"""Convert YouTube timedtext XML into SubRip (.srt) text."""
		srt = ''
		texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
		# TODO parse xml instead of regex
		for n, (start, dur_tag, dur, caption) in enumerate(texts):
			if not dur: dur = '4'  # default duration when the tag has none
			start = float(start)
			end = start + float(dur)
			# Render SRT timestamps: HH:MM:SS,mmm
			start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
			end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
			caption = unescapeHTML(caption)
			caption = unescapeHTML(caption) # double cycle, intentional
			srt += str(n+1) + '\n'
			srt += start + ' --> ' + end + '\n'
			srt += caption + '\n\n'
		return srt

	def _print_formats(self, formats):
		"""Print itag, extension and dimensions for each format in *formats*."""
		print 'Available formats:'
		for x in formats:
			print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

	def _real_initialize(self):
		"""Set the UI language and optionally log in and confirm age.

		Credentials come from downloader params or ~/.netrc; every
		failure is reported as a warning/error and aborts initialization
		without raising.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':         '/',
				'action_login': 'Log In',
				'username':     username,
				'password':     password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present, credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':             '/',
				'action_confirm':       'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Fetch the watch page and video info and return a list with one
		info dict per selected format (or None after reporting trouble)."""
		# Extract original video URL from URL with redirection, like age verification, using next_url parameter
		mobj = re.search(self._NEXT_URL_RE, url)
		if mobj:
			url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url, re.VERBOSE)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
		try:
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Attempt to extract SWF player URL
		mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			# Undo JavaScript backslash-escaping (\/ -> /) in the matched URL.
			player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
		else:
			player_url = None

		# Get video info: try several 'el' variants until one yields a token.
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					% (video_id, el_type))
			request = urllib2.Request(video_info_url)
			try:
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
					break
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
		if 'token' not in video_info:
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
			else:
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
			return

		# Check for "rental" videos
		if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
			self._downloader.trouble(u'ERROR: "rental" videos not supported')
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')

		# thumbnail image
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# upload date: scraped from the watch page and normalized to YYYYMMDD.
		upload_date = u'NA'
		mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
		if mobj is not None:
			upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
			format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
			# NOTE(review): no break after a successful parse; once upload_date
			# has been reformatted, the remaining expressions fail and are
			# silently swallowed by the bare except below.
			for expression in format_expressions:
				try:
					upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
				except:
					pass

		# description
		video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
		if video_description: video_description = clean_html(video_description)
		else: video_description = ''

		# closed captions: internal Trouble exceptions are caught below and
		# reported as warnings so a subtitle failure never aborts the download.
		video_subtitles = None
		if self._downloader.params.get('writesubtitles', False):
			try:
				self.report_video_subtitles_download(video_id)
				request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
				try:
					srt_list = urllib2.urlopen(request).read()
				except (urllib2.URLError, httplib.HTTPException, socket.error), err:
					raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
				srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
				srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
				if not srt_lang_list:
					raise Trouble(u'WARNING: video has no closed captions')
				# Pick the requested language, else English, else the first one listed.
				if self._downloader.params.get('subtitleslang', False):
					srt_lang = self._downloader.params.get('subtitleslang')
				elif 'en' in srt_lang_list:
					srt_lang = 'en'
				else:
					srt_lang = srt_lang_list.keys()[0]
				if not srt_lang in srt_lang_list:
					raise Trouble(u'WARNING: no closed captions found in the specified language')
				request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
				try:
					srt_xml = urllib2.urlopen(request).read()
				except (urllib2.URLError, httplib.HTTPException, socket.error), err:
					raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
				if not srt_xml:
					raise Trouble(u'WARNING: unable to download video subtitles')
				video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
			except Trouble as trouble:
				self._downloader.trouble(trouble[0])

		# token
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		req_format = self._downloader.params.get('format', None)

		if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]
		elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
			url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
			url_data = [parse_qs(uds) for uds in url_data_strs]
			url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
			# NOTE(review): assumes every stream dict carries a 'sig' key; the
			# filter above only guarantees 'itag' and 'url', so a missing 'sig'
			# would raise KeyError here.
			url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

			format_limit = self._downloader.params.get('format_limit', None)
			available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
			if format_limit is not None and format_limit in available_formats:
				format_list = available_formats[available_formats.index(format_limit):]
			else:
				format_list = available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if self._downloader.params.get('listformats', None):
				self._print_formats(existing_formats)
				return
			if req_format is None or req_format == 'best':
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == 'worst':
				video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
			elif req_format in ('-1', 'all'):
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			else:
				# Specific formats. We pick the first in a slash-delimeted sequence.
				# For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
				req_formats = req_format.split('/')
				video_url_list = None
				for rf in req_formats:
					if rf in url_map:
						video_url_list = [(rf, url_map[rf])]
						break
				if video_url_list is None:
					self._downloader.trouble(u'ERROR: requested format not available')
					return
		else:
			self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
			return

		results = []
		for format_param, video_real_url in video_url_list:
			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			results.append({
				'id':           video_id.decode('utf-8'),
				'url':          video_real_url.decode('utf-8'),
				'uploader':     video_uploader.decode('utf-8'),
				'upload_date':  upload_date,
				'title':        video_title,
				'ext':          video_extension.decode('utf-8'),
				'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
				'thumbnail':    video_thumbnail.decode('utf-8'),
				'description':  video_description,
				'player_url':   player_url,
				'subtitles':    video_subtitles
			})
		return results
485
486
487 class MetacafeIE(InfoExtractor):
488         """Information Extractor for metacafe.com."""
489
490         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
491         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
492         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
493         IE_NAME = u'metacafe'
494
495         def __init__(self, downloader=None):
496                 InfoExtractor.__init__(self, downloader)
497
498         def report_disclaimer(self):
499                 """Report disclaimer retrieval."""
500                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
501
502         def report_age_confirmation(self):
503                 """Report attempt to confirm age."""
504                 self._downloader.to_screen(u'[metacafe] Confirming age')
505
506         def report_download_webpage(self, video_id):
507                 """Report webpage download."""
508                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
509
510         def report_extraction(self, video_id):
511                 """Report information extraction."""
512                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
513
	def _real_initialize(self):
		"""Fetch the family-filter disclaimer page and POST an age confirmation.

		Any network failure is reported via the downloader and aborts
		initialization without raising.
		"""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER)
		try:
			self.report_disclaimer()
			# Response body is read but discarded; the request is made for its
			# server-side effect — presumably disabling the family filter for
			# this session. TODO(review): confirm (no cookie handler is visible here).
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
			return

		# Confirm age
		disclaimer_form = {
			'filters': '0',
			'submit': "Continue - I'm over 18",
			}
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
		try:
			self.report_age_confirmation()
			# As above, the response body itself is unused.
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
536
	def _real_extract(self, url):
		"""Extract the media URL, title and uploader for a Metacafe clip.

		Returns a one-element list containing the info dictionary, or
		None after reporting the problem through the downloader.  Ids
		with a "yt-" prefix are YouTube mirrors and are handed back to
		the downloader so the YouTube extractor processes them instead.
		"""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Check if video comes from YouTube
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			# Restart the whole download with the real YouTube URL.
			self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
			return

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			# Old-style page: the media URL appears directly in the markup.
			mediaURL = urllib.unquote(mobj.group(1))
			# Last three characters serve as the extension (e.g. "flv");
			# assumes a three-letter extension -- TODO confirm.
			video_extension = mediaURL[-3:]

			# Extract gdaKey if available
			# gdaKey is presumably an access token that must be appended
			# to the media URL -- confirm against current site behaviour.
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
			if mobj is None:
				video_url = mediaURL
			else:
				gdaKey = mobj.group(1)
				video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
		else:
			# New-style page: media data is embedded in the flashvars value.
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			# Undo the JSON escaping of forward slashes.
			mediaURL = mobj.group(1).replace('\\/', '/')
			video_extension = mediaURL[-3:]
			video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		return [{
			'id':		video_id.decode('utf-8'),
			'url':		video_url.decode('utf-8'),
			'uploader':	video_uploader.decode('utf-8'),
			'upload_date':	u'NA',
			'title':	video_title,
			'ext':		video_extension.decode('utf-8'),
			'format':	u'NA',
			'player_url':	None,
		}]
614
615
class DailymotionIE(InfoExtractor):
	"""Information Extractor for Dailymotion"""

	# Matches e.g. http://www.dailymotion.com/video/<id>_<title-slug>
	_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
	IE_NAME = u'dailymotion'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		"""Extract the best-quality media URL plus metadata for a clip.

		Returns a one-element list containing the info dictionary, or
		None after reporting the error through the downloader.
		"""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Path component is "<id>_<title-slug>": keep only the id part
		# and drop any query string.
		video_id = mobj.group(1).split('_')[0].split('?')[0]

		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		# Disable the family filter so age-restricted videos are served.
		request.add_header('Cookie', 'family_filter=off')
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'\s*var flashvars = (.*)', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		flashvars = urllib.unquote(mobj.group(1))

		# Pick the highest quality available, in descending preference.
		for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
			if key in flashvars:
				max_quality = key
				self._downloader.to_screen(u'[dailymotion] Using %s' % key)
				break
		else:
			# for/else: no known quality key was present at all.
			self._downloader.trouble(u'ERROR: unable to extract video URL')
			return

		mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video URL')
			return

		# Undo the JSON escaping of forward slashes.
		video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')

		# TODO: support choosing qualities

		mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

		# Uploader and upload date are optional: fall back to u'NA'.
		video_uploader = u'NA'
		mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
		else:
			video_uploader = mobj.group(1)

		video_upload_date = u'NA'
		# The page shows DD-MM-YYYY; reassemble as YYYYMMDD.
		mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
		if mobj is not None:
			video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

		return [{
			'id':		video_id.decode('utf-8'),
			'url':		video_url.decode('utf-8'),
			'uploader':	video_uploader.decode('utf-8'),
			'upload_date':	video_upload_date,
			'title':	video_title,
			'ext':		video_extension.decode('utf-8'),
			'format':	u'NA',
			'player_url':	None,
		}]
708
709
710 class GoogleIE(InfoExtractor):
711         """Information extractor for video.google.com."""
712
713         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
714         IE_NAME = u'video.google'
715
716         def __init__(self, downloader=None):
717                 InfoExtractor.__init__(self, downloader)
718
719         def report_download_webpage(self, video_id):
720                 """Report webpage download."""
721                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
722
723         def report_extraction(self, video_id):
724                 """Report information extraction."""
725                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
726
727         def _real_extract(self, url):
728                 # Extract id from URL
729                 mobj = re.match(self._VALID_URL, url)
730                 if mobj is None:
731                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
732                         return
733
734                 video_id = mobj.group(1)
735
736                 video_extension = 'mp4'
737
738                 # Retrieve video webpage to extract further information
739                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
740                 try:
741                         self.report_download_webpage(video_id)
742                         webpage = urllib2.urlopen(request).read()
743                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
744                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
745                         return
746
747                 # Extract URL, uploader, and title from webpage
748                 self.report_extraction(video_id)
749                 mobj = re.search(r"download_url:'([^']+)'", webpage)
750                 if mobj is None:
751                         video_extension = 'flv'
752                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
753                 if mobj is None:
754                         self._downloader.trouble(u'ERROR: unable to extract media URL')
755                         return
756                 mediaURL = urllib.unquote(mobj.group(1))
757                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
758                 mediaURL = mediaURL.replace('\\x26', '\x26')
759
760                 video_url = mediaURL
761
762                 mobj = re.search(r'<title>(.*)</title>', webpage)
763                 if mobj is None:
764                         self._downloader.trouble(u'ERROR: unable to extract title')
765                         return
766                 video_title = mobj.group(1).decode('utf-8')
767
768                 # Extract video description
769                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
770                 if mobj is None:
771                         self._downloader.trouble(u'ERROR: unable to extract video description')
772                         return
773                 video_description = mobj.group(1).decode('utf-8')
774                 if not video_description:
775                         video_description = 'No description available.'
776
777                 # Extract video thumbnail
778                 if self._downloader.params.get('forcethumbnail', False):
779                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
780                         try:
781                                 webpage = urllib2.urlopen(request).read()
782                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
783                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
784                                 return
785                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
786                         if mobj is None:
787                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
788                                 return
789                         video_thumbnail = mobj.group(1)
790                 else:   # we need something to pass to process_info
791                         video_thumbnail = ''
792
793                 return [{
794                         'id':           video_id.decode('utf-8'),
795                         'url':          video_url.decode('utf-8'),
796                         'uploader':     u'NA',
797                         'upload_date':  u'NA',
798                         'title':        video_title,
799                         'ext':          video_extension.decode('utf-8'),
800                         'format':       u'NA',
801                         'player_url':   None,
802                 }]
803
804
805 class PhotobucketIE(InfoExtractor):
806         """Information extractor for photobucket.com."""
807
808         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
809         IE_NAME = u'photobucket'
810
811         def __init__(self, downloader=None):
812                 InfoExtractor.__init__(self, downloader)
813
814         def report_download_webpage(self, video_id):
815                 """Report webpage download."""
816                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
817
818         def report_extraction(self, video_id):
819                 """Report information extraction."""
820                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
821
822         def _real_extract(self, url):
823                 # Extract id from URL
824                 mobj = re.match(self._VALID_URL, url)
825                 if mobj is None:
826                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
827                         return
828
829                 video_id = mobj.group(1)
830
831                 video_extension = 'flv'
832
833                 # Retrieve video webpage to extract further information
834                 request = urllib2.Request(url)
835                 try:
836                         self.report_download_webpage(video_id)
837                         webpage = urllib2.urlopen(request).read()
838                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
839                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
840                         return
841
842                 # Extract URL, uploader, and title from webpage
843                 self.report_extraction(video_id)
844                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
845                 if mobj is None:
846                         self._downloader.trouble(u'ERROR: unable to extract media URL')
847                         return
848                 mediaURL = urllib.unquote(mobj.group(1))
849
850                 video_url = mediaURL
851
852                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
853                 if mobj is None:
854                         self._downloader.trouble(u'ERROR: unable to extract title')
855                         return
856                 video_title = mobj.group(1).decode('utf-8')
857
858                 video_uploader = mobj.group(2).decode('utf-8')
859
860                 return [{
861                         'id':           video_id.decode('utf-8'),
862                         'url':          video_url.decode('utf-8'),
863                         'uploader':     video_uploader,
864                         'upload_date':  u'NA',
865                         'title':        video_title,
866                         'ext':          video_extension.decode('utf-8'),
867                         'format':       u'NA',
868                         'player_url':   None,
869                 }]
870
871
872 class YahooIE(InfoExtractor):
873         """Information extractor for video.yahoo.com."""
874
875         # _VALID_URL matches all Yahoo! Video URLs
876         # _VPAGE_URL matches only the extractable '/watch/' URLs
877         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
878         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
879         IE_NAME = u'video.yahoo'
880
881         def __init__(self, downloader=None):
882                 InfoExtractor.__init__(self, downloader)
883
884         def report_download_webpage(self, video_id):
885                 """Report webpage download."""
886                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
887
888         def report_extraction(self, video_id):
889                 """Report information extraction."""
890                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
891
892         def _real_extract(self, url, new_video=True):
893                 # Extract ID from URL
894                 mobj = re.match(self._VALID_URL, url)
895                 if mobj is None:
896                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
897                         return
898
899                 video_id = mobj.group(2)
900                 video_extension = 'flv'
901
902                 # Rewrite valid but non-extractable URLs as
903                 # extractable English language /watch/ URLs
904                 if re.match(self._VPAGE_URL, url) is None:
905                         request = urllib2.Request(url)
906                         try:
907                                 webpage = urllib2.urlopen(request).read()
908                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
909                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
910                                 return
911
912                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
913                         if mobj is None:
914                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
915                                 return
916                         yahoo_id = mobj.group(1)
917
918                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
919                         if mobj is None:
920                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
921                                 return
922                         yahoo_vid = mobj.group(1)
923
924                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
925                         return self._real_extract(url, new_video=False)
926
927                 # Retrieve video webpage to extract further information
928                 request = urllib2.Request(url)
929                 try:
930                         self.report_download_webpage(video_id)
931                         webpage = urllib2.urlopen(request).read()
932                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
933                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
934                         return
935
936                 # Extract uploader and title from webpage
937                 self.report_extraction(video_id)
938                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
939                 if mobj is None:
940                         self._downloader.trouble(u'ERROR: unable to extract video title')
941                         return
942                 video_title = mobj.group(1).decode('utf-8')
943
944                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
945                 if mobj is None:
946                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
947                         return
948                 video_uploader = mobj.group(1).decode('utf-8')
949
950                 # Extract video thumbnail
951                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
952                 if mobj is None:
953                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
954                         return
955                 video_thumbnail = mobj.group(1).decode('utf-8')
956
957                 # Extract video description
958                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
959                 if mobj is None:
960                         self._downloader.trouble(u'ERROR: unable to extract video description')
961                         return
962                 video_description = mobj.group(1).decode('utf-8')
963                 if not video_description:
964                         video_description = 'No description available.'
965
966                 # Extract video height and width
967                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
968                 if mobj is None:
969                         self._downloader.trouble(u'ERROR: unable to extract video height')
970                         return
971                 yv_video_height = mobj.group(1)
972
973                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
974                 if mobj is None:
975                         self._downloader.trouble(u'ERROR: unable to extract video width')
976                         return
977                 yv_video_width = mobj.group(1)
978
979                 # Retrieve video playlist to extract media URL
980                 # I'm not completely sure what all these options are, but we
981                 # seem to need most of them, otherwise the server sends a 401.
982                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
983                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
984                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
985                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
986                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
987                 try:
988                         self.report_download_webpage(video_id)
989                         webpage = urllib2.urlopen(request).read()
990                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
991                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
992                         return
993
994                 # Extract media URL from playlist XML
995                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
996                 if mobj is None:
997                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
998                         return
999                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1000                 video_url = unescapeHTML(video_url)
1001
1002                 return [{
1003                         'id':           video_id.decode('utf-8'),
1004                         'url':          video_url,
1005                         'uploader':     video_uploader,
1006                         'upload_date':  u'NA',
1007                         'title':        video_title,
1008                         'ext':          video_extension.decode('utf-8'),
1009                         'thumbnail':    video_thumbnail.decode('utf-8'),
1010                         'description':  video_description,
1011                         'thumbnail':    video_thumbnail,
1012                         'player_url':   None,
1013                 }]
1014
1015
class VimeoIE(InfoExtractor):
	"""Information extractor for vimeo.com."""

	# _VALID_URL matches Vimeo URLs
	_VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
	IE_NAME = u'vimeo'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

	def _real_extract(self, url, new_video=True):
		"""Extract video information from a Vimeo page.

		The page embeds a JSON player config carrying title, owner,
		thumbnail, available codecs and the request signature needed to
		build the final play_redirect URL.  Returns a one-element list
		with the info dictionary, or None after reporting the error.
		new_video is accepted but never referenced in this method.
		"""
		# Extract ID from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Retrieve video webpage to extract further information
		# std_headers is provided by the `from utils import *` at the top
		# of this file.
		request = urllib2.Request(url, None, std_headers)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Now we begin extracting as much information as we can from what we
		# retrieved. First we extract the information common to all extractors,
		# and latter we extract those that are Vimeo specific.
		self.report_extraction(video_id)

		# Extract the config JSON
		# The page contains "... = {config:{...},assets:...": slice the
		# config object out textually before parsing it.
		config = webpage.split(' = {config:')[1].split(',assets:')[0]
		try:
			# NOTE(review): `json` is not imported at the top of this
			# file; presumably it reaches this scope via
			# `from utils import *` -- confirm.
			config = json.loads(config)
		except:
			self._downloader.trouble(u'ERROR: unable to extract info section')
			return

		# Extract title
		video_title = config["video"]["title"]

		# Extract uploader
		video_uploader = config["video"]["owner"]["name"]

		# Extract video thumbnail
		video_thumbnail = config["video"]["thumbnail"]

		# Extract video description
		video_description = get_element_by_id("description", webpage.decode('utf8'))
		if video_description: video_description = clean_html(video_description)
		else: video_description = ''

		# Extract upload date
		video_upload_date = u'NA'
		mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
		if mobj is not None:
			video_upload_date = mobj.group(1)

		# Vimeo specific: extract request signature and timestamp
		sig = config['request']['signature']
		timestamp = config['request']['timestamp']

		# Vimeo specific: extract video codec and quality information
		# TODO bind to format param
		# Preference order: h264/mp4 first, then vp8 and vp6 as flv.
		codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
		for codec in codecs:
			if codec[0] in config["video"]["files"]:
				video_codec = codec[0]
				video_extension = codec[1]
				if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
				else: quality = 'sd'
				break
		else:
			# for/else: none of the known codecs were offered.
			self._downloader.trouble(u'ERROR: no known codec found')
			return

		video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
					%(video_id, sig, timestamp, quality, video_codec.upper())

		return [{
			'id':		video_id,
			'url':		video_url,
			'uploader':	video_uploader,
			'upload_date':	video_upload_date,
			'title':	video_title,
			'ext':		video_extension,
			'thumbnail':	video_thumbnail,
			'description':	video_description,
			'player_url':	None,
		}]
1117
1118
1119 class GenericIE(InfoExtractor):
1120         """Generic last-resort information extractor."""
1121
1122         _VALID_URL = r'.*'
1123         IE_NAME = u'generic'
1124
1125         def __init__(self, downloader=None):
1126                 InfoExtractor.__init__(self, downloader)
1127
1128         def report_download_webpage(self, video_id):
1129                 """Report webpage download."""
1130                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1131                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1132
1133         def report_extraction(self, video_id):
1134                 """Report information extraction."""
1135                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1136
1137         def report_following_redirect(self, new_url):
1138                 """Report information extraction."""
1139                 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1140                 
	def _test_redirect(self, url):
		"""Check if it is a redirect, like url shorteners, in case restart chain."""
		class HeadRequest(urllib2.Request):
			# Issue HEAD instead of GET so only headers are transferred.
			def get_method(self):
				return "HEAD"

		class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
			"""
			Subclass the HTTPRedirectHandler to make it use our
			HeadRequest also on the redirected URL
			"""
			def redirect_request(self, req, fp, code, msg, headers, newurl):
				if code in (301, 302, 303, 307):
					# Escape spaces that some servers leave in Location.
					newurl = newurl.replace(' ', '%20')
					# Strip body-describing headers; a HEAD carries none.
					newheaders = dict((k,v) for k,v in req.headers.items()
							  if k.lower() not in ("content-length", "content-type"))
					return HeadRequest(newurl,
							   headers=newheaders,
							   origin_req_host=req.get_origin_req_host(),
							   unverifiable=True)
				else:
					raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)

		class HTTPMethodFallback(urllib2.BaseHandler):
			"""
			Fallback to GET if HEAD is not allowed (405 HTTP error)
			"""
			def http_error_405(self, req, fp, code, msg, headers):
				# Drain and close the failed response before retrying.
				fp.read()
				fp.close()

				newheaders = dict((k,v) for k,v in req.headers.items()
						  if k.lower() not in ("content-length", "content-type"))
				# Re-issue the same request as a plain GET.
				return self.parent.open(urllib2.Request(req.get_full_url(),
								 headers=newheaders,
								 origin_req_host=req.get_origin_req_host(),
								 unverifiable=True))

		# Build our opener
		# NOTE(review): a bare OpenerDirector is used (not build_opener),
		# so ONLY the handlers listed below are installed.
		opener = urllib2.OpenerDirector()
		for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
						HTTPMethodFallback, HEADRedirectHandler,
						urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
			opener.add_handler(handler())

		response = opener.open(HeadRequest(url))
		new_url = response.geturl()

		# Same final URL means no redirect happened.
		if url == new_url: return False

		self.report_following_redirect(new_url)
		# Restart the download chain with the redirect target.
		self._downloader.download([new_url])
		return True
1194
1195         def _real_extract(self, url):
1196                 if self._test_redirect(url): return
1197
1198                 video_id = url.split('/')[-1]
1199                 request = urllib2.Request(url)
1200                 try:
1201                         self.report_download_webpage(video_id)
1202                         webpage = urllib2.urlopen(request).read()
1203                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1204                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1205                         return
1206                 except ValueError, err:
1207                         # since this is the last-resort InfoExtractor, if
1208                         # this error is thrown, it'll be thrown here
1209                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1210                         return
1211
1212                 self.report_extraction(video_id)
1213                 # Start with something easy: JW Player in SWFObject
1214                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1215                 if mobj is None:
1216                         # Broaden the search a little bit
1217                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1218                 if mobj is None:
1219                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1220                         return
1221
1222                 # It's possible that one of the regexes
1223                 # matched, but returned an empty group:
1224                 if mobj.group(1) is None:
1225                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1226                         return
1227
1228                 video_url = urllib.unquote(mobj.group(1))
1229                 video_id = os.path.basename(video_url)
1230
1231                 # here's a fun little line of code for you:
1232                 video_extension = os.path.splitext(video_id)[1][1:]
1233                 video_id = os.path.splitext(video_id)[0]
1234
1235                 # it's tempting to parse this further, but you would
1236                 # have to take into account all the variations like
1237                 #   Video Title - Site Name
1238                 #   Site Name | Video Title
1239                 #   Video Title - Tagline | Site Name
1240                 # and so on and so forth; it's just not practical
1241                 mobj = re.search(r'<title>(.*)</title>', webpage)
1242                 if mobj is None:
1243                         self._downloader.trouble(u'ERROR: unable to extract title')
1244                         return
1245                 video_title = mobj.group(1).decode('utf-8')
1246
1247                 # video uploader is domain name
1248                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1249                 if mobj is None:
1250                         self._downloader.trouble(u'ERROR: unable to extract title')
1251                         return
1252                 video_uploader = mobj.group(1).decode('utf-8')
1253
1254                 return [{
1255                         'id':           video_id.decode('utf-8'),
1256                         'url':          video_url.decode('utf-8'),
1257                         'uploader':     video_uploader,
1258                         'upload_date':  u'NA',
1259                         'title':        video_title,
1260                         'ext':          video_extension.decode('utf-8'),
1261                         'format':       u'NA',
1262                         'player_url':   None,
1263                 }]
1264
1265
1266 class YoutubeSearchIE(InfoExtractor):
1267         """Information Extractor for YouTube search queries."""
1268         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1269         _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1270         _max_youtube_results = 1000
1271         IE_NAME = u'youtube:search'
1272
1273         def __init__(self, downloader=None):
1274                 InfoExtractor.__init__(self, downloader)
1275
1276         def report_download_page(self, query, pagenum):
1277                 """Report attempt to download search page with given number."""
1278                 query = query.decode(preferredencoding())
1279                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1280
1281         def _real_extract(self, query):
1282                 mobj = re.match(self._VALID_URL, query)
1283                 if mobj is None:
1284                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1285                         return
1286
1287                 prefix, query = query.split(':')
1288                 prefix = prefix[8:]
1289                 query = query.encode('utf-8')
1290                 if prefix == '':
1291                         self._download_n_results(query, 1)
1292                         return
1293                 elif prefix == 'all':
1294                         self._download_n_results(query, self._max_youtube_results)
1295                         return
1296                 else:
1297                         try:
1298                                 n = long(prefix)
1299                                 if n <= 0:
1300                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1301                                         return
1302                                 elif n > self._max_youtube_results:
1303                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1304                                         n = self._max_youtube_results
1305                                 self._download_n_results(query, n)
1306                                 return
1307                         except ValueError: # parsing prefix as integer fails
1308                                 self._download_n_results(query, 1)
1309                                 return
1310
1311         def _download_n_results(self, query, n):
1312                 """Downloads a specified number of results for a query"""
1313
1314                 video_ids = []
1315                 pagenum = 0
1316                 limit = n
1317
1318                 while (50 * pagenum) < limit:
1319                         self.report_download_page(query, pagenum+1)
1320                         result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1321                         request = urllib2.Request(result_url)
1322                         try:
1323                                 data = urllib2.urlopen(request).read()
1324                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1325                                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1326                                 return
1327                         api_response = json.loads(data)['data']
1328
1329                         new_ids = list(video['id'] for video in api_response['items'])
1330                         video_ids += new_ids
1331
1332                         limit = min(n, api_response['totalItems'])
1333                         pagenum += 1
1334
1335                 if len(video_ids) > n:
1336                         video_ids = video_ids[:n]
1337                 for id in video_ids:
1338                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1339                 return
1340
1341
1342 class GoogleSearchIE(InfoExtractor):
1343         """Information Extractor for Google Video search queries."""
1344         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1345         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1346         _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1347         _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1348         _max_google_results = 1000
1349         IE_NAME = u'video.google:search'
1350
1351         def __init__(self, downloader=None):
1352                 InfoExtractor.__init__(self, downloader)
1353
1354         def report_download_page(self, query, pagenum):
1355                 """Report attempt to download playlist page with given number."""
1356                 query = query.decode(preferredencoding())
1357                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1358
1359         def _real_extract(self, query):
1360                 mobj = re.match(self._VALID_URL, query)
1361                 if mobj is None:
1362                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1363                         return
1364
1365                 prefix, query = query.split(':')
1366                 prefix = prefix[8:]
1367                 query = query.encode('utf-8')
1368                 if prefix == '':
1369                         self._download_n_results(query, 1)
1370                         return
1371                 elif prefix == 'all':
1372                         self._download_n_results(query, self._max_google_results)
1373                         return
1374                 else:
1375                         try:
1376                                 n = long(prefix)
1377                                 if n <= 0:
1378                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1379                                         return
1380                                 elif n > self._max_google_results:
1381                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1382                                         n = self._max_google_results
1383                                 self._download_n_results(query, n)
1384                                 return
1385                         except ValueError: # parsing prefix as integer fails
1386                                 self._download_n_results(query, 1)
1387                                 return
1388
1389         def _download_n_results(self, query, n):
1390                 """Downloads a specified number of results for a query"""
1391
1392                 video_ids = []
1393                 pagenum = 0
1394
1395                 while True:
1396                         self.report_download_page(query, pagenum)
1397                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1398                         request = urllib2.Request(result_url)
1399                         try:
1400                                 page = urllib2.urlopen(request).read()
1401                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1402                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1403                                 return
1404
1405                         # Extract video identifiers
1406                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1407                                 video_id = mobj.group(1)
1408                                 if video_id not in video_ids:
1409                                         video_ids.append(video_id)
1410                                         if len(video_ids) == n:
1411                                                 # Specified n videos reached
1412                                                 for id in video_ids:
1413                                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1414                                                 return
1415
1416                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1417                                 for id in video_ids:
1418                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1419                                 return
1420
1421                         pagenum = pagenum + 1
1422
1423
1424 class YahooSearchIE(InfoExtractor):
1425         """Information Extractor for Yahoo! Video search queries."""
1426         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1427         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1428         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1429         _MORE_PAGES_INDICATOR = r'\s*Next'
1430         _max_yahoo_results = 1000
1431         IE_NAME = u'video.yahoo:search'
1432
1433         def __init__(self, downloader=None):
1434                 InfoExtractor.__init__(self, downloader)
1435
1436         def report_download_page(self, query, pagenum):
1437                 """Report attempt to download playlist page with given number."""
1438                 query = query.decode(preferredencoding())
1439                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1440
1441         def _real_extract(self, query):
1442                 mobj = re.match(self._VALID_URL, query)
1443                 if mobj is None:
1444                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1445                         return
1446
1447                 prefix, query = query.split(':')
1448                 prefix = prefix[8:]
1449                 query = query.encode('utf-8')
1450                 if prefix == '':
1451                         self._download_n_results(query, 1)
1452                         return
1453                 elif prefix == 'all':
1454                         self._download_n_results(query, self._max_yahoo_results)
1455                         return
1456                 else:
1457                         try:
1458                                 n = long(prefix)
1459                                 if n <= 0:
1460                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1461                                         return
1462                                 elif n > self._max_yahoo_results:
1463                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1464                                         n = self._max_yahoo_results
1465                                 self._download_n_results(query, n)
1466                                 return
1467                         except ValueError: # parsing prefix as integer fails
1468                                 self._download_n_results(query, 1)
1469                                 return
1470
1471         def _download_n_results(self, query, n):
1472                 """Downloads a specified number of results for a query"""
1473
1474                 video_ids = []
1475                 already_seen = set()
1476                 pagenum = 1
1477
1478                 while True:
1479                         self.report_download_page(query, pagenum)
1480                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1481                         request = urllib2.Request(result_url)
1482                         try:
1483                                 page = urllib2.urlopen(request).read()
1484                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1485                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1486                                 return
1487
1488                         # Extract video identifiers
1489                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1490                                 video_id = mobj.group(1)
1491                                 if video_id not in already_seen:
1492                                         video_ids.append(video_id)
1493                                         already_seen.add(video_id)
1494                                         if len(video_ids) == n:
1495                                                 # Specified n videos reached
1496                                                 for id in video_ids:
1497                                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1498                                                 return
1499
1500                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1501                                 for id in video_ids:
1502                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1503                                 return
1504
1505                         pagenum = pagenum + 1
1506
1507
class YoutubePlaylistIE(InfoExtractor):
	"""Information Extractor for YouTube playlists."""

	# group(1): playlist type prefix (p|a|list), group(2): playlist id,
	# group(3): a single video id for the /user/.../ID style URLs.
	_VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
	_TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
	_VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
	# presence of this marker in the page means there is a next page
	_MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
	IE_NAME = u'youtube:playlist'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_page(self, playlist_id, pagenum):
		"""Report attempt to download playlist page with given number."""
		self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

	def _real_extract(self, url):
		"""Collect all video ids in the playlist and queue each for download."""
		# Extract playlist id
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
			return

		# Single video case: hand the bare id back to the downloader,
		# which will re-dispatch it to the right extractor
		if mobj.group(3) is not None:
			self._downloader.download([mobj.group(3)])
			return

		# Download playlist pages
		# prefix is 'p' as default for playlists but there are other types that need extra care
		playlist_prefix = mobj.group(1)
		if playlist_prefix == 'a':
			playlist_access = 'artist'
		else:
			playlist_prefix = 'p'
			playlist_access = 'view_play_list'
		playlist_id = mobj.group(2)
		video_ids = []
		pagenum = 1

		while True:
			self.report_download_page(playlist_id, pagenum)
			url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
			request = urllib2.Request(url)
			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers (deduplicated per page)
			ids_in_page = []
			for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(mobj.group(1))
			video_ids.extend(ids_in_page)

			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				break
			pagenum = pagenum + 1

		# apply --playlist-start / --playlist-end (1-based, -1 = no end)
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)
		if playlistend == -1:
			video_ids = video_ids[playliststart:]
		else:
			video_ids = video_ids[playliststart:playlistend]

		for id in video_ids:
			self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
		return
1579
1580
class YoutubeUserIE(InfoExtractor):
	"""Information Extractor for YouTube users."""

	# Accepts both user-page URLs and the internal 'ytuser:NAME' scheme.
	_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
	_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
	# GData caps each response at this many entries, so we page through.
	_GDATA_PAGE_SIZE = 50
	_GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
	_VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
	IE_NAME = u'youtube:user'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_page(self, username, start_index):
		"""Report attempt to download user page."""
		self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
				(username, start_index, start_index + self._GDATA_PAGE_SIZE))

	def _real_extract(self, url):
		"""Collect every upload of the user and queue each video for download."""
		# Extract username
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
			return

		username = mobj.group(1)

		# Download video ids using YouTube Data API. Result size per
		# query is limited (currently to 50 videos) so we need to query
		# page by page until there are no video ids - it means we got
		# all of them.

		video_ids = []
		pagenum = 0

		while True:
			# GData start-index is 1-based
			start_index = pagenum * self._GDATA_PAGE_SIZE + 1
			self.report_download_page(username, start_index)

			request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers (deduplicated per page)
			ids_in_page = []

			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(mobj.group(1))

			video_ids.extend(ids_in_page)

			# A little optimization - if current page is not
			# "full", ie. does not contain PAGE_SIZE video ids then
			# we can assume that this page is the last one - there
			# are no more ids on further pages - no need to query
			# again.

			if len(ids_in_page) < self._GDATA_PAGE_SIZE:
				break

			pagenum += 1

		all_ids_count = len(video_ids)
		# apply --playlist-start / --playlist-end (1-based, -1 = no end)
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)

		if playlistend == -1:
			video_ids = video_ids[playliststart:]
		else:
			video_ids = video_ids[playliststart:playlistend]

		self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
				(username, all_ids_count, len(video_ids)))

		for video_id in video_ids:
			self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1662
1663
1664 class BlipTVUserIE(InfoExtractor):
1665         """Information Extractor for blip.tv users."""
1666
1667         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1668         _PAGE_SIZE = 12
1669         IE_NAME = u'blip.tv:user'
1670
1671         def __init__(self, downloader=None):
1672                 InfoExtractor.__init__(self, downloader)
1673
1674         def report_download_page(self, username, pagenum):
1675                 """Report attempt to download user page."""
1676                 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1677                                 (self.IE_NAME, username, pagenum))
1678
1679         def _real_extract(self, url):
1680                 # Extract username
1681                 mobj = re.match(self._VALID_URL, url)
1682                 if mobj is None:
1683                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1684                         return
1685
1686                 username = mobj.group(1)
1687
1688                 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1689
1690                 request = urllib2.Request(url)
1691
1692                 try:
1693                         page = urllib2.urlopen(request).read().decode('utf-8')
1694                         mobj = re.search(r'data-users-id="([^"]+)"', page)
1695                         page_base = page_base % mobj.group(1)
1696                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1697                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1698                         return
1699
1700
1701                 # Download video ids using BlipTV Ajax calls. Result size per
1702                 # query is limited (currently to 12 videos) so we need to query
1703                 # page by page until there are no video ids - it means we got
1704                 # all of them.
1705
1706                 video_ids = []
1707                 pagenum = 1
1708
1709                 while True:
1710                         self.report_download_page(username, pagenum)
1711
1712                         request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1713
1714                         try:
1715                                 page = urllib2.urlopen(request).read().decode('utf-8')
1716                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1717                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1718                                 return
1719
1720                         # Extract video identifiers
1721                         ids_in_page = []
1722
1723                         for mobj in re.finditer(r'href="/([^"]+)"', page):
1724                                 if mobj.group(1) not in ids_in_page:
1725                                         ids_in_page.append(unescapeHTML(mobj.group(1)))
1726
1727                         video_ids.extend(ids_in_page)
1728
1729                         # A little optimization - if current page is not
1730                         # "full", ie. does not contain PAGE_SIZE video ids then
1731                         # we can assume that this page is the last one - there
1732                         # are no more ids on further pages - no need to query
1733                         # again.
1734
1735                         if len(ids_in_page) < self._PAGE_SIZE:
1736                                 break
1737
1738                         pagenum += 1
1739
1740                 all_ids_count = len(video_ids)
1741                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1742                 playlistend = self._downloader.params.get('playlistend', -1)
1743
1744                 if playlistend == -1:
1745                         video_ids = video_ids[playliststart:]
1746                 else:
1747                         video_ids = video_ids[playliststart:playlistend]
1748
1749                 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1750                                 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1751
1752                 for video_id in video_ids:
1753                         self._downloader.download([u'http://blip.tv/'+video_id])
1754
1755
1756 class DepositFilesIE(InfoExtractor):
1757         """Information extractor for depositfiles.com"""
1758
1759         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1760         IE_NAME = u'DepositFiles'
1761
	def __init__(self, downloader=None):
		# Delegate downloader registration to the common base class.
		InfoExtractor.__init__(self, downloader)
1764
	def report_download_webpage(self, file_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1768
1769         def report_extraction(self, file_id):
1770                 """Report information extraction."""
1771                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1772
1773         def _real_extract(self, url):
1774                 file_id = url.split('/')[-1]
1775                 # Rebuild url in english locale
1776                 url = 'http://depositfiles.com/en/files/' + file_id
1777
1778                 # Retrieve file webpage with 'Free download' button pressed
1779                 free_download_indication = { 'gateway_result' : '1' }
1780                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1781                 try:
1782                         self.report_download_webpage(file_id)
1783                         webpage = urllib2.urlopen(request).read()
1784                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1785                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1786                         return
1787
1788                 # Search for the real file URL
1789                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1790                 if (mobj is None) or (mobj.group(1) is None):
1791                         # Try to figure out reason of the error.
1792                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1793                         if (mobj is not None) and (mobj.group(1) is not None):
1794                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1795                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1796                         else:
1797                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1798                         return
1799
1800                 file_url = mobj.group(1)
1801                 file_extension = os.path.splitext(file_url)[1][1:]
1802
1803                 # Search for file title
1804                 mobj = re.search(r'<b title="(.*?)">', webpage)
1805                 if mobj is None:
1806                         self._downloader.trouble(u'ERROR: unable to extract title')
1807                         return
1808                 file_title = mobj.group(1).decode('utf-8')
1809
1810                 return [{
1811                         'id':           file_id.decode('utf-8'),
1812                         'url':          file_url.decode('utf-8'),
1813                         'uploader':     u'NA',
1814                         'upload_date':  u'NA',
1815                         'title':        file_title,
1816                         'ext':          file_extension.decode('utf-8'),
1817                         'format':       u'NA',
1818                         'player_url':   None,
1819                 }]
1820
1821
1822 class FacebookIE(InfoExtractor):
1823         """Information Extractor for Facebook"""
1824
1825         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1826         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1827         _NETRC_MACHINE = 'facebook'
1828         _available_formats = ['video', 'highqual', 'lowqual']
1829         _video_extensions = {
1830                 'video': 'mp4',
1831                 'highqual': 'mp4',
1832                 'lowqual': 'mp4',
1833         }
1834         IE_NAME = u'facebook'
1835
1836         def __init__(self, downloader=None):
1837                 InfoExtractor.__init__(self, downloader)
1838
1839         def _reporter(self, message):
1840                 """Add header and report message."""
1841                 self._downloader.to_screen(u'[facebook] %s' % message)
1842
1843         def report_login(self):
1844                 """Report attempt to log in."""
1845                 self._reporter(u'Logging in')
1846
1847         def report_video_webpage_download(self, video_id):
1848                 """Report attempt to download video webpage."""
1849                 self._reporter(u'%s: Downloading video webpage' % video_id)
1850
1851         def report_information_extraction(self, video_id):
1852                 """Report attempt to extract video information."""
1853                 self._reporter(u'%s: Extracting video information' % video_id)
1854
1855         def _parse_page(self, video_webpage):
1856                 """Extract video information from page"""
1857                 # General data
1858                 data = {'title': r'\("video_title", "(.*?)"\)',
1859                         'description': r'<div class="datawrap">(.*?)</div>',
1860                         'owner': r'\("video_owner_name", "(.*?)"\)',
1861                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1862                         }
1863                 video_info = {}
1864                 for piece in data.keys():
1865                         mobj = re.search(data[piece], video_webpage)
1866                         if mobj is not None:
1867                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1868
1869                 # Video urls
1870                 video_urls = {}
1871                 for fmt in self._available_formats:
1872                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1873                         if mobj is not None:
1874                                 # URL is in a Javascript segment inside an escaped Unicode format within
1875                                 # the generally utf-8 page
1876                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1877                 video_info['video_urls'] = video_urls
1878
1879                 return video_info
1880
1881         def _real_initialize(self):
1882                 if self._downloader is None:
1883                         return
1884
1885                 useremail = None
1886                 password = None
1887                 downloader_params = self._downloader.params
1888
1889                 # Attempt to use provided username and password or .netrc data
1890                 if downloader_params.get('username', None) is not None:
1891                         useremail = downloader_params['username']
1892                         password = downloader_params['password']
1893                 elif downloader_params.get('usenetrc', False):
1894                         try:
1895                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1896                                 if info is not None:
1897                                         useremail = info[0]
1898                                         password = info[2]
1899                                 else:
1900                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1901                         except (IOError, netrc.NetrcParseError), err:
1902                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1903                                 return
1904
1905                 if useremail is None:
1906                         return
1907
1908                 # Log in
1909                 login_form = {
1910                         'email': useremail,
1911                         'pass': password,
1912                         'login': 'Log+In'
1913                         }
1914                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1915                 try:
1916                         self.report_login()
1917                         login_results = urllib2.urlopen(request).read()
1918                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1919                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1920                                 return
1921                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1922                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1923                         return
1924
1925         def _real_extract(self, url):
1926                 mobj = re.match(self._VALID_URL, url)
1927                 if mobj is None:
1928                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1929                         return
1930                 video_id = mobj.group('ID')
1931
1932                 # Get video webpage
1933                 self.report_video_webpage_download(video_id)
1934                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
1935                 try:
1936                         page = urllib2.urlopen(request)
1937                         video_webpage = page.read()
1938                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1939                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1940                         return
1941
1942                 # Start extracting information
1943                 self.report_information_extraction(video_id)
1944
1945                 # Extract information
1946                 video_info = self._parse_page(video_webpage)
1947
1948                 # uploader
1949                 if 'owner' not in video_info:
1950                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1951                         return
1952                 video_uploader = video_info['owner']
1953
1954                 # title
1955                 if 'title' not in video_info:
1956                         self._downloader.trouble(u'ERROR: unable to extract video title')
1957                         return
1958                 video_title = video_info['title']
1959                 video_title = video_title.decode('utf-8')
1960
1961                 # thumbnail image
1962                 if 'thumbnail' not in video_info:
1963                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1964                         video_thumbnail = ''
1965                 else:
1966                         video_thumbnail = video_info['thumbnail']
1967
1968                 # upload date
1969                 upload_date = u'NA'
1970                 if 'upload_date' in video_info:
1971                         upload_time = video_info['upload_date']
1972                         timetuple = email.utils.parsedate_tz(upload_time)
1973                         if timetuple is not None:
1974                                 try:
1975                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
1976                                 except:
1977                                         pass
1978
1979                 # description
1980                 video_description = video_info.get('description', 'No description available.')
1981
1982                 url_map = video_info['video_urls']
1983                 if len(url_map.keys()) > 0:
1984                         # Decide which formats to download
1985                         req_format = self._downloader.params.get('format', None)
1986                         format_limit = self._downloader.params.get('format_limit', None)
1987
1988                         if format_limit is not None and format_limit in self._available_formats:
1989                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1990                         else:
1991                                 format_list = self._available_formats
1992                         existing_formats = [x for x in format_list if x in url_map]
1993                         if len(existing_formats) == 0:
1994                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1995                                 return
1996                         if req_format is None:
1997                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1998                         elif req_format == 'worst':
1999                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2000                         elif req_format == '-1':
2001                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2002                         else:
2003                                 # Specific format
2004                                 if req_format not in url_map:
2005                                         self._downloader.trouble(u'ERROR: requested format not available')
2006                                         return
2007                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2008
2009                 results = []
2010                 for format_param, video_real_url in video_url_list:
2011                         # Extension
2012                         video_extension = self._video_extensions.get(format_param, 'mp4')
2013
2014                         results.append({
2015                                 'id':           video_id.decode('utf-8'),
2016                                 'url':          video_real_url.decode('utf-8'),
2017                                 'uploader':     video_uploader.decode('utf-8'),
2018                                 'upload_date':  upload_date,
2019                                 'title':        video_title,
2020                                 'ext':          video_extension.decode('utf-8'),
2021                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2022                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2023                                 'description':  video_description.decode('utf-8'),
2024                                 'player_url':   None,
2025                         })
2026                 return results
2027
class BlipTVIE(InfoExtractor):
	"""Information extractor for blip.tv"""

	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
	# Captures the filename extension from a media URL.
	_URL_EXT = r'^.*\.([a-z0-9]+)$'
	IE_NAME = u'blip.tv'

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

	def report_direct_download(self, title):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

	def _real_extract(self, url):
		"""Resolve a blip.tv URL via the site's JSON API, or treat it as a
		direct media download when the server answers with a video/* body."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Append the JSON-API parameters, reusing any existing query string.
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url.encode('utf-8'))
		self.report_extraction(mobj.group(1))
		info = None
		try:
			urlh = urllib2.urlopen(request)
			if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
				basename = url.split('/')[-1]
				title,ext = os.path.splitext(basename)
				title = title.decode('UTF-8')
				ext = ext.replace('.', '')
				self.report_direct_download(title)
				# Hand the open handle to the downloader so the body is not
				# fetched twice.
				info = {
					'id': title,
					'url': url,
					'title': title,
					'ext': ext,
					'urlhandle': urlh
				}
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
			return
		if info is None: # Regular URL
			try:
				# urlh was opened in the try block above; when Content-Type
				# is not video/*, its body is the JSON metadata.
				json_code = urlh.read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
				return

			try:
				json_data = json.loads(json_code)
				# The API wraps single results in a 'Post' object.
				if 'Post' in json_data:
					data = json_data['Post']
				else:
					data = json_data

				# Convert e.g. '03-21-12 08:00AM' into YYYYMMDD.
				upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
				video_url = data['media']['url']
				umobj = re.match(self._URL_EXT, video_url)
				if umobj is None:
					raise ValueError('Can not determine filename extension')
				ext = umobj.group(1)

				info = {
					'id': data['item_id'],
					'url': video_url,
					'uploader': data['display_name'],
					'upload_date': upload_date,
					'title': data['title'],
					'ext': ext,
					'format': data['media']['mimeType'],
					'thumbnail': data['thumbnailUrl'],
					'description': data['description'],
					'player_url': data['embedUrl']
				}
			except (ValueError,KeyError), err:
				self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
				return

		# NOTE(review): presumably blip.tv serves better streams to iTunes;
		# this mutates the shared std_headers for the subsequent download.
		std_headers['User-Agent'] = 'iTunes/10.6.1'
		return [info]
2114
2115
2116 class MyVideoIE(InfoExtractor):
2117         """Information Extractor for myvideo.de."""
2118
2119         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2120         IE_NAME = u'myvideo'
2121
2122         def __init__(self, downloader=None):
2123                 InfoExtractor.__init__(self, downloader)
2124         
2125         def report_download_webpage(self, video_id):
2126                 """Report webpage download."""
2127                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2128
2129         def report_extraction(self, video_id):
2130                 """Report information extraction."""
2131                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2132
2133         def _real_extract(self,url):
2134                 mobj = re.match(self._VALID_URL, url)
2135                 if mobj is None:
2136                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2137                         return
2138
2139                 video_id = mobj.group(1)
2140
2141                 # Get video webpage
2142                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2143                 try:
2144                         self.report_download_webpage(video_id)
2145                         webpage = urllib2.urlopen(request).read()
2146                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2147                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2148                         return
2149
2150                 self.report_extraction(video_id)
2151                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2152                                  webpage)
2153                 if mobj is None:
2154                         self._downloader.trouble(u'ERROR: unable to extract media URL')
2155                         return
2156                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2157
2158                 mobj = re.search('<title>([^<]+)</title>', webpage)
2159                 if mobj is None:
2160                         self._downloader.trouble(u'ERROR: unable to extract title')
2161                         return
2162
2163                 video_title = mobj.group(1)
2164
2165                 return [{
2166                         'id':           video_id,
2167                         'url':          video_url,
2168                         'uploader':     u'NA',
2169                         'upload_date':  u'NA',
2170                         'title':        video_title,
2171                         'ext':          u'flv',
2172                         'format':       u'NA',
2173                         'player_url':   None,
2174                 }]
2175
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Accepts either a ':tds'/':colbert'-style shortcut or a full-episodes URL.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report media configuration download."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report show index download."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report player URL determination."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _real_extract(self, url):
		"""Resolve an episode (or, for shortcuts, the newest episode) into a
		list of info dicts, one per video segment of the episode."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Expand shortcut names into the corresponding full-episodes URL and
		# re-match so the 'showname'/'episode' groups are populated.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = u'http://www.thedailyshow.com/full-episodes/'
			else:
				url = u'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# With no explicit episode the site redirects to the newest one.
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# Learn the concrete episode from the redirected URL.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# Find the embedded Flash player URL(s); group 2 is the media URI.
		mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			# Follow redirects to obtain the canonical player URL.
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		# Ask the MRSS feed for the episode's segment index.
		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		results = []

		# Each <item> in the index is one segment of the episode.
		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			# Fetch the per-segment configuration listing the renditions.
			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			# Collect every (bitrate, url) rendition for this segment.
			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			effTitle = showId + u'-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			results.append(info)

		return results
2307
2308
2309 class EscapistIE(InfoExtractor):
2310         """Information extractor for The Escapist """
2311
2312         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2313         IE_NAME = u'escapist'
2314
2315         def report_extraction(self, showName):
2316                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2317
2318         def report_config_download(self, showName):
2319                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2320
2321         def _real_extract(self, url):
2322                 mobj = re.match(self._VALID_URL, url)
2323                 if mobj is None:
2324                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2325                         return
2326                 showName = mobj.group('showname')
2327                 videoId = mobj.group('episode')
2328
2329                 self.report_extraction(showName)
2330                 try:
2331                         webPage = urllib2.urlopen(url)
2332                         webPageBytes = webPage.read()
2333                         m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2334                         webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2335                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2336                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2337                         return
2338
2339                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2340                 description = unescapeHTML(descMatch.group(1))
2341                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2342                 imgUrl = unescapeHTML(imgMatch.group(1))
2343                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2344                 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2345                 configUrlMatch = re.search('config=(.*)$', playerUrl)
2346                 configUrl = urllib2.unquote(configUrlMatch.group(1))
2347
2348                 self.report_config_download(showName)
2349                 try:
2350                         configJSON = urllib2.urlopen(configUrl).read()
2351                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2352                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2353                         return
2354
2355                 # Technically, it's JavaScript, not JSON
2356                 configJSON = configJSON.replace("'", '"')
2357
2358                 try:
2359                         config = json.loads(configJSON)
2360                 except (ValueError,), err:
2361                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2362                         return
2363
2364                 playlist = config['playlist']
2365                 videoUrl = playlist[1]['url']
2366
2367                 info = {
2368                         'id': videoId,
2369                         'url': videoUrl,
2370                         'uploader': showName,
2371                         'upload_date': None,
2372                         'title': showName,
2373                         'ext': 'flv',
2374                         'format': 'flv',
2375                         'thumbnail': imgUrl,
2376                         'description': description,
2377                         'player_url': playerUrl,
2378                 }
2379
2380                 return [info]
2381
2382
2383 class CollegeHumorIE(InfoExtractor):
2384         """Information extractor for collegehumor.com"""
2385
2386         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2387         IE_NAME = u'collegehumor'
2388
2389         def report_webpage(self, video_id):
2390                 """Report information extraction."""
2391                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2392
2393         def report_extraction(self, video_id):
2394                 """Report information extraction."""
2395                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2396
2397         def _real_extract(self, url):
2398                 mobj = re.match(self._VALID_URL, url)
2399                 if mobj is None:
2400                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2401                         return
2402                 video_id = mobj.group('videoid')
2403
2404                 self.report_webpage(video_id)
2405                 request = urllib2.Request(url)
2406                 try:
2407                         webpage = urllib2.urlopen(request).read()
2408                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2409                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2410                         return
2411
2412                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2413                 if m is None:
2414                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2415                         return
2416                 internal_video_id = m.group('internalvideoid')
2417
2418                 info = {
2419                         'id': video_id,
2420                         'internal_id': internal_video_id,
2421                 }
2422
2423                 self.report_extraction(video_id)
2424                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2425                 try:
2426                         metaXml = urllib2.urlopen(xmlUrl).read()
2427                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2428                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2429                         return
2430
2431                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2432                 try:
2433                         videoNode = mdoc.findall('./video')[0]
2434                         info['description'] = videoNode.findall('./description')[0].text
2435                         info['title'] = videoNode.findall('./caption')[0].text
2436                         info['url'] = videoNode.findall('./file')[0].text
2437                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2438                         info['ext'] = info['url'].rpartition('.')[2]
2439                         info['format'] = info['ext']
2440                 except IndexError:
2441                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2442                         return
2443
2444                 return [info]
2445
2446
2447 class XVideosIE(InfoExtractor):
2448         """Information extractor for xvideos.com"""
2449
2450         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2451         IE_NAME = u'xvideos'
2452
2453         def report_webpage(self, video_id):
2454                 """Report information extraction."""
2455                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2456
2457         def report_extraction(self, video_id):
2458                 """Report information extraction."""
2459                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2460
2461         def _real_extract(self, url):
2462                 mobj = re.match(self._VALID_URL, url)
2463                 if mobj is None:
2464                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2465                         return
2466                 video_id = mobj.group(1).decode('utf-8')
2467
2468                 self.report_webpage(video_id)
2469
2470                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2471                 try:
2472                         webpage = urllib2.urlopen(request).read()
2473                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2474                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2475                         return
2476
2477                 self.report_extraction(video_id)
2478
2479
2480                 # Extract video URL
2481                 mobj = re.search(r'flv_url=(.+?)&', webpage)
2482                 if mobj is None:
2483                         self._downloader.trouble(u'ERROR: unable to extract video url')
2484                         return
2485                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2486
2487
2488                 # Extract title
2489                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2490                 if mobj is None:
2491                         self._downloader.trouble(u'ERROR: unable to extract video title')
2492                         return
2493                 video_title = mobj.group(1).decode('utf-8')
2494
2495
2496                 # Extract video thumbnail
2497                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2498                 if mobj is None:
2499                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2500                         return
2501                 video_thumbnail = mobj.group(0).decode('utf-8')
2502
2503                 info = {
2504                         'id': video_id,
2505                         'url': video_url,
2506                         'uploader': None,
2507                         'upload_date': None,
2508                         'title': video_title,
2509                         'ext': 'flv',
2510                         'format': 'flv',
2511                         'thumbnail': video_thumbnail,
2512                         'description': None,
2513                         'player_url': None,
2514                 }
2515
2516                 return [info]
2517
2518
2519 class SoundcloudIE(InfoExtractor):
2520         """Information extractor for soundcloud.com
2521            To access the media, the uid of the song and a stream token
2522            must be extracted from the page source and the script must make
2523            a request to media.soundcloud.com/crossdomain.xml. Then
2524            the media can be grabbed by requesting from an url composed
2525            of the stream token and uid
2526          """
2527
2528         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2529         IE_NAME = u'soundcloud'
2530
2531         def __init__(self, downloader=None):
2532                 InfoExtractor.__init__(self, downloader)
2533
2534         def report_webpage(self, video_id):
2535                 """Report information extraction."""
2536                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2537
2538         def report_extraction(self, video_id):
2539                 """Report information extraction."""
2540                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2541
2542         def _real_extract(self, url):
2543                 mobj = re.match(self._VALID_URL, url)
2544                 if mobj is None:
2545                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2546                         return
2547
2548                 # extract uploader (which is in the url)
2549                 uploader = mobj.group(1).decode('utf-8')
2550                 # extract simple title (uploader + slug of song title)
2551                 slug_title =  mobj.group(2).decode('utf-8')
2552                 simple_title = uploader + u'-' + slug_title
2553
2554                 self.report_webpage('%s/%s' % (uploader, slug_title))
2555
2556                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2557                 try:
2558                         webpage = urllib2.urlopen(request).read()
2559                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2560                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2561                         return
2562
2563                 self.report_extraction('%s/%s' % (uploader, slug_title))
2564
2565                 # extract uid and stream token that soundcloud hands out for access
2566                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2567                 if mobj:
2568                         video_id = mobj.group(1)
2569                         stream_token = mobj.group(2)
2570
2571                 # extract unsimplified title
2572                 mobj = re.search('"title":"(.*?)",', webpage)
2573                 if mobj:
2574                         title = mobj.group(1).decode('utf-8')
2575                 else:
2576                         title = simple_title
2577
2578                 # construct media url (with uid/token)
2579                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2580                 mediaURL = mediaURL % (video_id, stream_token)
2581
2582                 # description
2583                 description = u'No description available'
2584                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2585                 if mobj:
2586                         description = mobj.group(1)
2587                 
2588                 # upload date
2589                 upload_date = None
2590                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2591                 if mobj:
2592                         try:
2593                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2594                         except Exception, e:
2595                                 self._downloader.to_stderr(str(e))
2596
2597                 # for soundcloud, a request to a cross domain is required for cookies
2598                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2599
2600                 return [{
2601                         'id':           video_id.decode('utf-8'),
2602                         'url':          mediaURL,
2603                         'uploader':     uploader.decode('utf-8'),
2604                         'upload_date':  upload_date,
2605                         'title':        title,
2606                         'ext':          u'mp3',
2607                         'format':       u'NA',
2608                         'player_url':   None,
2609                         'description': description.decode('utf-8')
2610                 }]
2611
2612
2613 class InfoQIE(InfoExtractor):
2614         """Information extractor for infoq.com"""
2615
2616         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2617         IE_NAME = u'infoq'
2618
2619         def report_webpage(self, video_id):
2620                 """Report information extraction."""
2621                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2622
2623         def report_extraction(self, video_id):
2624                 """Report information extraction."""
2625                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2626
2627         def _real_extract(self, url):
2628                 mobj = re.match(self._VALID_URL, url)
2629                 if mobj is None:
2630                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2631                         return
2632
2633                 self.report_webpage(url)
2634
2635                 request = urllib2.Request(url)
2636                 try:
2637                         webpage = urllib2.urlopen(request).read()
2638                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2639                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2640                         return
2641
2642                 self.report_extraction(url)
2643
2644
2645                 # Extract video URL
2646                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2647                 if mobj is None:
2648                         self._downloader.trouble(u'ERROR: unable to extract video url')
2649                         return
2650                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2651
2652
2653                 # Extract title
2654                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2655                 if mobj is None:
2656                         self._downloader.trouble(u'ERROR: unable to extract video title')
2657                         return
2658                 video_title = mobj.group(1).decode('utf-8')
2659
2660                 # Extract description
2661                 video_description = u'No description available.'
2662                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2663                 if mobj is not None:
2664                         video_description = mobj.group(1).decode('utf-8')
2665
2666                 video_filename = video_url.split('/')[-1]
2667                 video_id, extension = video_filename.split('.')
2668
2669                 info = {
2670                         'id': video_id,
2671                         'url': video_url,
2672                         'uploader': None,
2673                         'upload_date': None,
2674                         'title': video_title,
2675                         'ext': extension,
2676                         'format': extension, # Extension is always(?) mp4, but seems to be flv
2677                         'thumbnail': None,
2678                         'description': video_description,
2679                         'player_url': None,
2680                 }
2681
2682                 return [info]
2683
2684 class MixcloudIE(InfoExtractor):
2685         """Information extractor for www.mixcloud.com"""
2686         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2687         IE_NAME = u'mixcloud'
2688
2689         def __init__(self, downloader=None):
2690                 InfoExtractor.__init__(self, downloader)
2691
2692         def report_download_json(self, file_id):
2693                 """Report JSON download."""
2694                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2695
2696         def report_extraction(self, file_id):
2697                 """Report information extraction."""
2698                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2699
2700         def get_urls(self, jsonData, fmt, bitrate='best'):
2701                 """Get urls from 'audio_formats' section in json"""
2702                 file_url = None
2703                 try:
2704                         bitrate_list = jsonData[fmt]
2705                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2706                                 bitrate = max(bitrate_list) # select highest
2707
2708                         url_list = jsonData[fmt][bitrate]
2709                 except TypeError: # we have no bitrate info.
2710                         url_list = jsonData[fmt]
2711                 return url_list
2712
2713         def check_urls(self, url_list):
2714                 """Returns 1st active url from list"""
2715                 for url in url_list:
2716                         try:
2717                                 urllib2.urlopen(url)
2718                                 return url
2719                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2720                                 url = None
2721
2722                 return None
2723
2724         def _print_formats(self, formats):
2725                 print 'Available formats:'
2726                 for fmt in formats.keys():
2727                         for b in formats[fmt]:
2728                                 try:
2729                                         ext = formats[fmt][b][0]
2730                                         print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2731                                 except TypeError: # we have no bitrate info
2732                                         ext = formats[fmt][0]
2733                                         print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2734                                         break
2735
2736         def _real_extract(self, url):
2737                 mobj = re.match(self._VALID_URL, url)
2738                 if mobj is None:
2739                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2740                         return
2741                 # extract uploader & filename from url
2742                 uploader = mobj.group(1).decode('utf-8')
2743                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2744
2745                 # construct API request
2746                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2747                 # retrieve .json file with links to files
2748                 request = urllib2.Request(file_url)
2749                 try:
2750                         self.report_download_json(file_url)
2751                         jsonData = urllib2.urlopen(request).read()
2752                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2753                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2754                         return
2755
2756                 # parse JSON
2757                 json_data = json.loads(jsonData)
2758                 player_url = json_data['player_swf_url']
2759                 formats = dict(json_data['audio_formats'])
2760
2761                 req_format = self._downloader.params.get('format', None)
2762                 bitrate = None
2763
2764                 if self._downloader.params.get('listformats', None):
2765                         self._print_formats(formats)
2766                         return
2767
2768                 if req_format is None or req_format == 'best':
2769                         for format_param in formats.keys():
2770                                 url_list = self.get_urls(formats, format_param)
2771                                 # check urls
2772                                 file_url = self.check_urls(url_list)
2773                                 if file_url is not None:
2774                                         break # got it!
2775                 else:
2776                         if req_format not in formats.keys():
2777                                 self._downloader.trouble(u'ERROR: format is not available')
2778                                 return
2779
2780                         url_list = self.get_urls(formats, req_format)
2781                         file_url = self.check_urls(url_list)
2782                         format_param = req_format
2783
2784                 return [{
2785                         'id': file_id.decode('utf-8'),
2786                         'url': file_url.decode('utf-8'),
2787                         'uploader':     uploader.decode('utf-8'),
2788                         'upload_date': u'NA',
2789                         'title': json_data['name'],
2790                         'ext': file_url.split('.')[-1].decode('utf-8'),
2791                         'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2792                         'thumbnail': json_data['thumbnail_url'],
2793                         'description': json_data['description'],
2794                         'player_url': player_url.decode('utf-8'),
2795                 }]
2796
class StanfordOpenClassroomIE(InfoExtractor):
	"""Information extractor for Stanford's Open ClassRoom"""

	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
	IE_NAME = u'stanfordoc'

	def report_download_webpage(self, objid):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Dispatch on the URL shape: a specific video, a course page, or
		the site root.  Course and root pages are treated as playlists
		whose entries are re-extracted recursively via self.extract().
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('course') and mobj.group('video'): # A specific video
			course = mobj.group('course')
			video = mobj.group('video')
			info = {
				'id': course + '_' + video,
			}

			self.report_extraction(info['id'])
			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
			xmlUrl = baseUrl + video + '.xml'
			try:
				metaXml = urllib2.urlopen(xmlUrl).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
				return
			mdoc = xml.etree.ElementTree.fromstring(metaXml)
			try:
				info['title'] = mdoc.findall('./title')[0].text
				# videoFile in the XML is relative to the course's videos/ dir
				info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
			except IndexError:
				self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
				return
			info['ext'] = info['url'].rpartition('.')[2]
			info['format'] = info['ext']
			return [info]
		elif mobj.group('course'): # A course page
			course = mobj.group('course')
			info = {
				'id': course,
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			try:
				coursepage = urllib2.urlopen(url).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			# Course title is optional; fall back to the course id.
			m = re.search('<h1>([^<]+)</h1>', coursepage)
			if m:
				info['title'] = unescapeHTML(m.group(1))
			else:
				info['title'] = info['id']

			m = re.search('<description>([^<]+)</description>', coursepage)
			if m:
				info['description'] = unescapeHTML(m.group(1))

			# Collect each lecture's VideoPage link (deduplicated, in
			# page order) and recurse into the extractor for each one.
			links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
				}
					for vpage in links]
			results = []
			for entry in info['list']:
				assert entry['type'] == 'reference'
				results += self.extract(entry['url'])
			return results

		else: # Root page
			info = {
				'id': 'Stanford OpenClassroom',
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
			try:
				rootpage = urllib2.urlopen(rootURL).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			info['title'] = info['id']

			# Every CoursePage link on the home page becomes a playlist
			# entry, each of which recurses back through this extractor.
			links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
				}
					for cpage in links]

			results = []
			for entry in info['list']:
				assert entry['type'] == 'reference'
				results += self.extract(entry['url'])
			return results
2908
2909 class MTVIE(InfoExtractor):
2910         """Information extractor for MTV.com"""
2911
2912         _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2913         IE_NAME = u'mtv'
2914
2915         def report_webpage(self, video_id):
2916                 """Report information extraction."""
2917                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2918
2919         def report_extraction(self, video_id):
2920                 """Report information extraction."""
2921                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2922
2923         def _real_extract(self, url):
2924                 mobj = re.match(self._VALID_URL, url)
2925                 if mobj is None:
2926                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2927                         return
2928                 if not mobj.group('proto'):
2929                         url = 'http://' + url
2930                 video_id = mobj.group('videoid')
2931                 self.report_webpage(video_id)
2932
2933                 request = urllib2.Request(url)
2934                 try:
2935                         webpage = urllib2.urlopen(request).read()
2936                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2937                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2938                         return
2939
2940                 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2941                 if mobj is None:
2942                         self._downloader.trouble(u'ERROR: unable to extract song name')
2943                         return
2944                 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2945                 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2946                 if mobj is None:
2947                         self._downloader.trouble(u'ERROR: unable to extract performer')
2948                         return
2949                 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2950                 video_title = performer + ' - ' + song_name 
2951
2952                 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2953                 if mobj is None:
2954                         self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2955                         return
2956                 mtvn_uri = mobj.group(1)
2957
2958                 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2959                 if mobj is None:
2960                         self._downloader.trouble(u'ERROR: unable to extract content id')
2961                         return
2962                 content_id = mobj.group(1)
2963
2964                 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2965                 self.report_extraction(video_id)
2966                 request = urllib2.Request(videogen_url)
2967                 try:
2968                         metadataXml = urllib2.urlopen(request).read()
2969                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2970                         self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2971                         return
2972
2973                 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2974                 renditions = mdoc.findall('.//rendition')
2975
2976                 # For now, always pick the highest quality.
2977                 rendition = renditions[-1]
2978
2979                 try:
2980                         _,_,ext = rendition.attrib['type'].partition('/')
2981                         format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2982                         video_url = rendition.find('./src').text
2983                 except KeyError:
2984                         self._downloader.trouble('Invalid rendition field.')
2985                         return
2986
2987                 info = {
2988                         'id': video_id,
2989                         'url': video_url,
2990                         'uploader': performer,
2991                         'title': video_title,
2992                         'ext': ext,
2993                         'format': format,
2994                 }
2995
2996                 return [info]
2997
2998
class YoukuIE(InfoExtractor):
	"""Information extractor for v.youku.com.

	Youku serves a video as multiple segments; one info dictionary is
	returned per segment, with ids of the form '<video_id>_part<nn>'.
	"""

	_VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
	IE_NAME = u'Youku'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, file_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

	def _gen_sid(self):
		"""Generate the pseudo-random session id embedded in download URLs."""
		nowTime = int(time.time() * 1000)
		random1 = random.randint(1000,1998)
		random2 = random.randint(1000,9999)

		return "%d%d%d" %(nowTime,random1,random2)

	def _get_file_ID_mix_string(self, seed):
		"""Return the seed-shuffled alphabet used to decode file ids.

		Implements Youku's deterministic shuffle: the same seed always
		produces the same character ordering, so the server and client
		agree on the mapping.
		"""
		mixed = []
		source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
		seed = float(seed)
		for i in range(len(source)):
			# Linear-congruential step; picks and removes one character
			# from the shrinking source alphabet each iteration.
			seed  =  (seed * 211 + 30031 ) % 65536
			index  =  math.floor(seed / 65536 * len(source) )
			mixed.append(source[int(index)])
			source.remove(source[int(index)])
		#return ''.join(mixed)
		return mixed

	def _get_file_id(self, fileId, seed):
		"""Decode an obfuscated '*'-separated file id using the seed alphabet."""
		mixed = self._get_file_ID_mix_string(seed)
		ids = fileId.split('*')
		realId = []
		for ch in ids:
			if ch:
				realId.append(mixed[int(ch)])
		return ''.join(realId)

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group('ID')

		info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

		request = urllib2.Request(info_url, None, std_headers)
		try:
			self.report_download_webpage(video_id)
			jsondata = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		self.report_extraction(video_id)
		try:
			config = json.loads(jsondata)

			video_title =  config['data'][0]['title']
			seed = config['data'][0]['seed']

			format = self._downloader.params.get('format', None)
			supported_format = config['data'][0]['streamfileids'].keys()

			# Map the user-requested format onto Youku's stream names.
			if format is None or format == 'best':
				if 'hd2' in supported_format:
					format = 'hd2'
				else:
					format = 'flv'
				ext = u'flv'
			elif format == 'worst':
				format = 'mp4'
				ext = u'mp4'
			else:
				format = 'flv'
				ext = u'flv'


			fileid = config['data'][0]['streamfileids'][format]
			seg_number = len(config['data'][0]['segs'][format])

			# One key per segment; each key authorizes one segment download.
			keys=[]
			for i in xrange(seg_number):
				keys.append(config['data'][0]['segs'][format][i]['k'])

			#TODO check error
			#youku only could be viewed from mainland china
		except Exception:
			# Narrowed from a bare 'except:' so KeyboardInterrupt and
			# SystemExit are no longer swallowed; any malformed JSON or
			# missing field still yields the best-effort error below.
			self._downloader.trouble(u'ERROR: unable to extract info section')
			return

		files_info=[]
		sid = self._gen_sid()
		fileid = self._get_file_id(fileid, seed)

		#column 8,9 of fileid represent the segment number
		#fileid[7:9] should be changed
		for index, key in enumerate(keys):

			temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
			download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

			info = {
				'id': '%s_part%02d' % (video_id, index),
				'url': download_url,
				'uploader': None,
				'title': video_title,
				'ext': ext,
				'format': u'NA'
			}
			files_info.append(info)

		return files_info
3119
3120
class XNXXIE(InfoExtractor):
	"""Information extractor for xnxx.com"""

	_VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
	IE_NAME = u'xnxx'
	VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
	VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
	VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

	def report_webpage(self, video_id):
		"""Announce the webpage download."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

	def report_extraction(self, video_id):
		"""Announce information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(1).decode('utf-8')

		self.report_webpage(video_id)

		# Fetch the page the video is embedded in
		try:
			webpage = urllib2.urlopen(url).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
			return

		# Pull the three fields out of the page, bailing out with the
		# matching error message if any of them cannot be found.
		fields = {}
		searches = [
			('video_url', self.VIDEO_URL_RE, u'ERROR: unable to extract video url'),
			('video_title', self.VIDEO_TITLE_RE, u'ERROR: unable to extract video title'),
			('video_thumbnail', self.VIDEO_THUMB_RE, u'ERROR: unable to extract video thumbnail'),
		]
		for key, pattern, errmsg in searches:
			match = re.search(pattern, webpage)
			if match is None:
				self._downloader.trouble(errmsg)
				return
			fields[key] = match.group(1).decode('utf-8')

		# The flv URL is percent-encoded inside the page
		video_url = urllib.unquote(fields['video_url'])

		info = {'id': video_id,
				'url': video_url,
				'uploader': None,
				'upload_date': None,
				'title': fields['video_title'],
				'ext': 'flv',
				'format': 'flv',
				'thumbnail': fields['video_thumbnail'],
				'description': None,
				'player_url': None}

		return [info]
3184
3185
class GooglePlusIE(InfoExtractor):
	"""Information extractor for plus.google.com."""

	_VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
	IE_NAME = u'plus.google'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_extract_entry(self, url):
		"""Report downloading entry"""
		self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))

	def report_date(self, upload_date):
		"""Report entry date"""
		self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

	def report_uploader(self, uploader):
		"""Report entry uploader"""
		self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))

	def report_title(self, video_title):
		"""Report entry title"""
		self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))

	def report_extract_vid_page(self, video_page):
		"""Report information extraction."""
		self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))

	def _real_extract(self, url):
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		post_url = mobj.group(0)
		video_id = mobj.group(2)

		video_extension = 'flv'

		# Step 1, Retrieve post webpage to extract further information
		self.report_extract_entry(post_url)
		request = urllib2.Request(post_url)
		try:
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % str(err))
			return

		# Extract update date (optional; keep u'NA' if not found)
		upload_date = u'NA'
		pattern = 'title="Timestamp">(.*?)</a>'
		mobj = re.search(pattern, webpage)
		if mobj:
			upload_date = mobj.group(1)
			# Convert timestring to a format suitable for filename
			upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
			upload_date = upload_date.strftime('%Y%m%d')
		self.report_date(upload_date)

		# Extract uploader (optional)
		uploader = u'NA'
		pattern = r'rel\="author".*?>(.*?)</a>'
		mobj = re.search(pattern, webpage)
		if mobj:
			uploader = mobj.group(1)
		self.report_uploader(uploader)

		# Extract title (optional)
		# Get the first line for title
		video_title = u'NA'
		pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
		mobj = re.search(pattern, webpage)
		if mobj:
			video_title = mobj.group(1)
		self.report_title(video_title)

		# Step 2, Stimulate clicking the image box to launch video
		pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
		mobj = re.search(pattern, webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video page URL')
			# Bail out: previously execution fell through and crashed on
			# mobj.group(1) below with an AttributeError.
			return

		video_page = mobj.group(1)
		request = urllib2.Request(video_page)
		try:
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return
		self.report_extract_vid_page(video_page)


		# Extract video links on video page
		"""Extract video links of all sizes"""
		pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
		mobj = re.findall(pattern, webpage)
		if len(mobj) == 0:
			self._downloader.trouble(u'ERROR: unable to extract video links')
			# Bail out: previously execution fell through and links[-1]
			# below raised IndexError on the empty result list.
			return

		# Sort in resolution
		links = sorted(mobj)

		# Choose the lowest of the sort, i.e. highest resolution
		video_url = links[-1]
		# Only get the url. The resolution part in the tuple has no use anymore
		video_url = video_url[-1]
		# Treat escaped \u0026 style hex
		video_url = unicode(video_url, "unicode_escape")


		return [{
			'id':           video_id.decode('utf-8'),
			'url':          video_url,
			'uploader':     uploader.decode('utf-8'),
			'upload_date':  upload_date.decode('utf-8'),
			'title':        video_title.decode('utf-8'),
			'ext':          video_extension.decode('utf-8'),
			'format':       u'NA',
			'player_url':   None,
		}]