_ Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import datetime
   5 import HTMLParser
   6 import httplib
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import time
  12 import urllib
  13 import urllib2
  14 import email.utils
  15 import xml.etree.ElementTree
  16 import random
  17 import math
  18 from urlparse import parse_qs
  19
  20 try:
  21         import cStringIO as StringIO
  22 except ImportError:
  23         import StringIO
  24
  25 from utils import *
  26
  27
  28 class InfoExtractor(object):
  29         """Information Extractor class.
  30
  31         Information extractors are the classes that, given a URL, extract
  32         information from the video (or videos) the URL refers to. This
  33         information includes the real video URL, the video title and simplified
  34         title, author and others. The information is stored in a dictionary
  35         which is then passed to the FileDownloader. The FileDownloader
  36         processes this information possibly downloading the video to the file
  37         system, among other possible outcomes. The dictionaries must include
  38         the following fields:
  39
  40         id:             Video identifier.
  41         url:            Final video URL.
  42         uploader:       Nickname of the video uploader.
  43         title:          Literal title.
  44         ext:            Video filename extension.
  45         format:         Video format.
  46         player_url:     SWF Player URL (may be None).
  47
  48         The following fields are optional. Their primary purpose is to allow
  49         youtube-dl to serve as the backend for a video search function, such
  50         as the one in youtube2mp3.  They are only used when their respective
  51         forced printing functions are called:
  52
  53         thumbnail:      Full URL to a video thumbnail image.
  54         description:    One-line video description.
  55
  56         Subclasses of this one should re-define the _real_initialize() and
  57         _real_extract() methods and define a _VALID_URL regexp.
  58         Probably, they should also be added to the list of extractors.
  59         """
  60
  61         _ready = False
  62         _downloader = None
  63
  64         def __init__(self, downloader=None):
  65                 """Constructor. Receives an optional downloader."""
  66                 self._ready = False
  67                 self.set_downloader(downloader)
  68
  69         def suitable(self, url):
  70                 """Receives a URL and returns True if suitable for this IE."""
  71                 return re.match(self._VALID_URL, url) is not None
  72
  73         def initialize(self):
  74                 """Initializes an instance (authentication, etc)."""
  75                 if not self._ready:
  76                         self._real_initialize()
  77                         self._ready = True
  78
  79         def extract(self, url):
  80                 """Extracts URL information and returns it in list of dicts."""
  81                 self.initialize()
  82                 return self._real_extract(url)
  83
  84         def set_downloader(self, downloader):
  85                 """Sets the downloader for this IE."""
  86                 self._downloader = downloader
  87
  88         def _real_initialize(self):
  89                 """Real initialization process. Redefine in subclasses."""
  90                 pass
  91
  92         def _real_extract(self, url):
  93                 """Real extraction process. Redefine in subclasses."""
  94                 pass
  95
  96
  97 class YoutubeIE(InfoExtractor):
  98         """Information extractor for youtube.com."""
  99
 100         _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|tube\.majestyc\.net/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
 101         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 102         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
 103         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 104         _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 105         _NETRC_MACHINE = 'youtube'
 106         # Listed in order of quality
 107         _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 108         _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 109         _video_extensions = {
 110                 '13': '3gp',
 111                 '17': 'mp4',
 112                 '18': 'mp4',
 113                 '22': 'mp4',
 114                 '37': 'mp4',
 115                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 116                 '43': 'webm',
 117                 '44': 'webm',
 118                 '45': 'webm',
 119                 '46': 'webm',
 120         }
 121         _video_dimensions = {
 122                 '5': '240x400',
 123                 '6': '???',
 124                 '13': '???',
 125                 '17': '144x176',
 126                 '18': '360x640',
 127                 '22': '720x1280',
 128                 '34': '360x640',
 129                 '35': '480x854',
 130                 '37': '1080x1920',
 131                 '38': '3072x4096',
 132                 '43': '360x640',
 133                 '44': '480x854',
 134                 '45': '720x1280',
 135                 '46': '1080x1920',
 136         }
 137         IE_NAME = u'youtube'
 138
 139         def report_lang(self):
 140                 """Report attempt to set language."""
 141                 self._downloader.to_screen(u'[youtube] Setting language')
 142
 143         def report_login(self):
 144                 """Report attempt to log in."""
 145                 self._downloader.to_screen(u'[youtube] Logging in')
 146
 147         def report_age_confirmation(self):
 148                 """Report attempt to confirm age."""
 149                 self._downloader.to_screen(u'[youtube] Confirming age')
 150
 151         def report_video_webpage_download(self, video_id):
 152                 """Report attempt to download video webpage."""
 153                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
 154
 155         def report_video_info_webpage_download(self, video_id):
 156                 """Report attempt to download video info webpage."""
 157                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
 158
 159         def report_video_subtitles_download(self, video_id):
 160                 """Report attempt to download video info webpage."""
 161                 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
 162
 163         def report_information_extraction(self, video_id):
 164                 """Report attempt to extract video information."""
 165                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
 166
 167         def report_unavailable_format(self, video_id, format):
 168                 """Report extracted video URL."""
 169                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
 170
 171         def report_rtmp_download(self):
 172                 """Indicate the download will use the RTMP protocol."""
 173                 self._downloader.to_screen(u'[youtube] RTMP download detected')
 174
 175         def _closed_captions_xml_to_srt(self, xml_string):
 176                 srt = ''
 177                 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
 178                 # TODO parse xml instead of regex
 179                 for n, (start, dur_tag, dur, caption) in enumerate(texts):
 180                         if not dur: dur = '4'
 181                         start = float(start)
 182                         end = start + float(dur)
 183                         start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
 184                         end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
 185                         caption = unescapeHTML(caption)
 186                         caption = unescapeHTML(caption) # double cycle, intentional
 187                         srt += str(n+1) + '\n'
 188                         srt += start + ' --> ' + end + '\n'
 189                         srt += caption + '\n\n'
 190                 return srt
 191
 192         def _print_formats(self, formats):
 193                 print 'Available formats:'
 194                 for x in formats:
 195                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
 196
 197         def _real_initialize(self):
 198                 if self._downloader is None:
 199                         return
 200
 201                 username = None
 202                 password = None
 203                 downloader_params = self._downloader.params
 204
 205                 # Attempt to use provided username and password or .netrc data
 206                 if downloader_params.get('username', None) is not None:
 207                         username = downloader_params['username']
 208                         password = downloader_params['password']
 209                 elif downloader_params.get('usenetrc', False):
 210                         try:
 211                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 212                                 if info is not None:
 213                                         username = info[0]
 214                                         password = info[2]
 215                                 else:
 216                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 217                         except (IOError, netrc.NetrcParseError), err:
 218                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
 219                                 return
 220
 221                 # Set language
 222                 request = urllib2.Request(self._LANG_URL)
 223                 try:
 224                         self.report_lang()
 225                         urllib2.urlopen(request).read()
 226                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 227                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
 228                         return
 229
 230                 # No authentication to be performed
 231                 if username is None:
 232                         return
 233
 234                 # Log in
 235                 login_form = {
 236                                 'current_form': 'loginForm',
 237                                 'next':         '/',
 238                                 'action_login': 'Log In',
 239                                 'username':     username,
 240                                 'password':     password,
 241                                 }
 242                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
 243                 try:
 244                         self.report_login()
 245                         login_results = urllib2.urlopen(request).read()
 246                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 247                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 248                                 return
 249                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 250                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
 251                         return
 252
 253                 # Confirm age
 254                 age_form = {
 255                                 'next_url':             '/',
 256                                 'action_confirm':       'Confirm',
 257                                 }
 258                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
 259                 try:
 260                         self.report_age_confirmation()
 261                         age_results = urllib2.urlopen(request).read()
 262                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 263                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 264                         return
 265
 266         def _real_extract(self, url):
 267                 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 268                 mobj = re.search(self._NEXT_URL_RE, url)
 269                 if mobj:
 270                         url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')
 271
 272                 # Extract video id from URL
 273                 mobj = re.match(self._VALID_URL, url)
 274                 if mobj is None:
 275                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 276                         return
 277                 video_id = mobj.group(2)
 278
 279                 # Get video webpage
 280                 self.report_video_webpage_download(video_id)
 281                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
 282                 try:
 283                         video_webpage = urllib2.urlopen(request).read()
 284                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 285                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
 286                         return
 287
 288                 # Attempt to extract SWF player URL
 289                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 290                 if mobj is not None:
 291                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 292                 else:
 293                         player_url = None
 294
 295                 # Get video info
 296                 self.report_video_info_webpage_download(video_id)
 297                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 298                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 299                                         % (video_id, el_type))
 300                         request = urllib2.Request(video_info_url)
 301                         try:
 302                                 video_info_webpage = urllib2.urlopen(request).read()
 303                                 video_info = parse_qs(video_info_webpage)
 304                                 if 'token' in video_info:
 305                                         break
 306                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 307                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
 308                                 return
 309                 if 'token' not in video_info:
 310                         if 'reason' in video_info:
 311                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
 312                         else:
 313                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
 314                         return
 315
 316                 # Check for "rental" videos
 317                 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 318                         self._downloader.trouble(u'ERROR: "rental" videos not supported')
 319                         return
 320
 321                 # Start extracting information
 322                 self.report_information_extraction(video_id)
 323
 324                 # uploader
 325                 if 'author' not in video_info:
 326                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 327                         return
 328                 video_uploader = urllib.unquote_plus(video_info['author'][0])
 329
 330                 # title
 331                 if 'title' not in video_info:
 332                         self._downloader.trouble(u'ERROR: unable to extract video title')
 333                         return
 334                 video_title = urllib.unquote_plus(video_info['title'][0])
 335                 video_title = video_title.decode('utf-8')
 336
 337                 # thumbnail image
 338                 if 'thumbnail_url' not in video_info:
 339                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
 340                         video_thumbnail = ''
 341                 else:   # don't panic if we can't find it
 342                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
 343
 344                 # upload date
 345                 upload_date = u'NA'
 346                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 347                 if mobj is not None:
 348                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 349                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
 350                         for expression in format_expressions:
 351                                 try:
 352                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
 353                                 except:
 354                                         pass
 355
 356                 # description
 357                 video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
 358                 if video_description: video_description = clean_html(video_description)
 359                 else: video_description = ''
 360
 361                 # closed captions
 362                 video_subtitles = None
 363                 if self._downloader.params.get('writesubtitles', False):
 364                         try:
 365                                 self.report_video_subtitles_download(video_id)
 366                                 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 367                                 try:
 368                                         srt_list = urllib2.urlopen(request).read()
 369                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 370                                         raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
 371                                 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
 372                                 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
 373                                 if not srt_lang_list:
 374                                         raise Trouble(u'WARNING: video has no closed captions')
 375                                 if self._downloader.params.get('subtitleslang', False):
 376                                         srt_lang = self._downloader.params.get('subtitleslang')
 377                                 elif 'en' in srt_lang_list:
 378                                         srt_lang = 'en'
 379                                 else:
 380                                         srt_lang = srt_lang_list.keys()[0]
 381                                 if not srt_lang in srt_lang_list:
 382                                         raise Trouble(u'WARNING: no closed captions found in the specified language')
 383                                 request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
 384                                 try:
 385                                         srt_xml = urllib2.urlopen(request).read()
 386                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 387                                         raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
 388                                 if not srt_xml:
 389                                         raise Trouble(u'WARNING: unable to download video subtitles')
 390                                 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
 391                         except Trouble as trouble:
 392                                 self._downloader.trouble(trouble[0])
 393
 394                 # token
 395                 video_token = urllib.unquote_plus(video_info['token'][0])
 396
 397                 # Decide which formats to download
 398                 req_format = self._downloader.params.get('format', None)
 399
 400                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 401                         self.report_rtmp_download()
 402                         video_url_list = [(None, video_info['conn'][0])]
 403                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 404                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
 405                         url_data = [parse_qs(uds) for uds in url_data_strs]
 406                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
 407                         url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
 408
 409                         format_limit = self._downloader.params.get('format_limit', None)
 410                         available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 411                         if format_limit is not None and format_limit in available_formats:
 412                                 format_list = available_formats[available_formats.index(format_limit):]
 413                         else:
 414                                 format_list = available_formats
 415                         existing_formats = [x for x in format_list if x in url_map]
 416                         if len(existing_formats) == 0:
 417                                 self._downloader.trouble(u'ERROR: no known formats available for video')
 418                                 return
 419                         if self._downloader.params.get('listformats', None):
 420                                 self._print_formats(existing_formats)
 421                                 return
 422                         if req_format is None or req_format == 'best':
 423                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 424                         elif req_format == 'worst':
 425                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 426                         elif req_format in ('-1', 'all'):
 427                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 428                         else:
 429                                 # Specific formats. We pick the first in a slash-delimeted sequence.
 430                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 431                                 req_formats = req_format.split('/')
 432                                 video_url_list = None
 433                                 for rf in req_formats:
 434                                         if rf in url_map:
 435                                                 video_url_list = [(rf, url_map[rf])]
 436                                                 break
 437                                 if video_url_list is None:
 438                                         self._downloader.trouble(u'ERROR: requested format not available')
 439                                         return
 440                 else:
 441                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
 442                         return
 443
 444                 results = []
 445                 for format_param, video_real_url in video_url_list:
 446                         # Extension
 447                         video_extension = self._video_extensions.get(format_param, 'flv')
 448
 449                         results.append({
 450                                 'id':           video_id.decode('utf-8'),
 451                                 'url':          video_real_url.decode('utf-8'),
 452                                 'uploader':     video_uploader.decode('utf-8'),
 453                                 'upload_date':  upload_date,
 454                                 'title':        video_title,
 455                                 'ext':          video_extension.decode('utf-8'),
 456                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
 457                                 'thumbnail':    video_thumbnail.decode('utf-8'),
 458                                 'description':  video_description,
 459                                 'player_url':   player_url,
 460                                 'subtitles':    video_subtitles
 461                         })
 462                 return results
 463
 464
 465 class MetacafeIE(InfoExtractor):
 466         """Information Extractor for metacafe.com."""
 467
 468         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 469         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 470         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 471         IE_NAME = u'metacafe'
 472
 473         def __init__(self, downloader=None):
 474                 InfoExtractor.__init__(self, downloader)
 475
 476         def report_disclaimer(self):
 477                 """Report disclaimer retrieval."""
 478                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
 479
 480         def report_age_confirmation(self):
 481                 """Report attempt to confirm age."""
 482                 self._downloader.to_screen(u'[metacafe] Confirming age')
 483
 484         def report_download_webpage(self, video_id):
 485                 """Report webpage download."""
 486                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
 487
 488         def report_extraction(self, video_id):
 489                 """Report information extraction."""
 490                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
 491
 492         def _real_initialize(self):
 493                 # Retrieve disclaimer
 494                 request = urllib2.Request(self._DISCLAIMER)
 495                 try:
 496                         self.report_disclaimer()
 497                         disclaimer = urllib2.urlopen(request).read()
 498                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 499                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
 500                         return
 501
 502                 # Confirm age
 503                 disclaimer_form = {
 504                         'filters': '0',
 505                         'submit': "Continue - I'm over 18",
 506                         }
 507                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
 508                 try:
 509                         self.report_age_confirmation()
 510                         disclaimer = urllib2.urlopen(request).read()
 511                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 512                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 513                         return
 514
 515         def _real_extract(self, url):
 516                 # Extract id and simplified title from URL
 517                 mobj = re.match(self._VALID_URL, url)
 518                 if mobj is None:
 519                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 520                         return
 521
 522                 video_id = mobj.group(1)
 523
 524                 # Check if video comes from YouTube
 525                 mobj2 = re.match(r'^yt-(.*)$', video_id)
 526                 if mobj2 is not None:
 527                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
 528                         return
 529
 530                 # Retrieve video webpage to extract further information
 531                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
 532                 try:
 533                         self.report_download_webpage(video_id)
 534                         webpage = urllib2.urlopen(request).read()
 535                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 536                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
 537                         return
 538
 539                 # Extract URL, uploader and title from webpage
 540                 self.report_extraction(video_id)
 541                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 542                 if mobj is not None:
 543                         mediaURL = urllib.unquote(mobj.group(1))
 544                         video_extension = mediaURL[-3:]
 545
 546                         # Extract gdaKey if available
 547                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 548                         if mobj is None:
 549                                 video_url = mediaURL
 550                         else:
 551                                 gdaKey = mobj.group(1)
 552                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 553                 else:
 554                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 555                         if mobj is None:
 556                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 557                                 return
 558                         vardict = parse_qs(mobj.group(1))
 559                         if 'mediaData' not in vardict:
 560                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 561                                 return
 562                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
 563                         if mobj is None:
 564                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 565                                 return
 566                         mediaURL = mobj.group(1).replace('\\/', '/')
 567                         video_extension = mediaURL[-3:]
 568                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
 569
 570                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 571                 if mobj is None:
 572                         self._downloader.trouble(u'ERROR: unable to extract title')
 573                         return
 574                 video_title = mobj.group(1).decode('utf-8')
 575
 576                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
 577                 if mobj is None:
 578                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 579                         return
 580                 video_uploader = mobj.group(1)
 581
 582                 return [{
 583                         'id':           video_id.decode('utf-8'),
 584                         'url':          video_url.decode('utf-8'),
 585                         'uploader':     video_uploader.decode('utf-8'),
 586                         'upload_date':  u'NA',
 587                         'title':        video_title,
 588                         'ext':          video_extension.decode('utf-8'),
 589                         'format':       u'NA',
 590                         'player_url':   None,
 591                 }]
 592
 593
 594 class DailymotionIE(InfoExtractor):
 595         """Information Extractor for Dailymotion"""
 596
 597         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
 598         IE_NAME = u'dailymotion'
 599
 600         def __init__(self, downloader=None):
 601                 InfoExtractor.__init__(self, downloader)
 602
 603         def report_download_webpage(self, video_id):
 604                 """Report webpage download."""
 605                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
 606
 607         def report_extraction(self, video_id):
 608                 """Report information extraction."""
 609                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
 610
 611         def _real_extract(self, url):
 612                 # Extract id and simplified title from URL
 613                 mobj = re.match(self._VALID_URL, url)
 614                 if mobj is None:
 615                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 616                         return
 617
 618                 video_id = mobj.group(1)
 619
 620                 video_extension = 'mp4'
 621
 622                 # Retrieve video webpage to extract further information
 623                 request = urllib2.Request(url)
 624                 request.add_header('Cookie', 'family_filter=off')
 625                 try:
 626                         self.report_download_webpage(video_id)
 627                         webpage = urllib2.urlopen(request).read()
 628                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 629                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
 630                         return
 631
 632                 # Extract URL, uploader and title from webpage
 633                 self.report_extraction(video_id)
 634                 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 635                 if mobj is None:
 636                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 637                         return
 638                 flashvars = urllib.unquote(mobj.group(1))
 639                 if 'hqURL' in flashvars: max_quality = 'hqURL'
 640                 elif 'sdURL' in flashvars: max_quality = 'sdURL'
 641                 else: max_quality = 'ldURL'
 642                 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 643                 if mobj is None:
 644                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 645                         return
 646                 video_url = mobj.group(1).replace('\\/', '/')
 647
 648                 # TODO: support choosing qualities
 649
 650                 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 651                 if mobj is None:
 652                         self._downloader.trouble(u'ERROR: unable to extract title')
 653                         return
 654                 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
 655
 656                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
 657                 if mobj is None:
 658                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 659                         return
 660                 video_uploader = mobj.group(1)
 661
 662                 return [{
 663                         'id':           video_id.decode('utf-8'),
 664                         'url':          video_url.decode('utf-8'),
 665                         'uploader':     video_uploader.decode('utf-8'),
 666                         'upload_date':  u'NA',
 667                         'title':        video_title,
 668                         'ext':          video_extension.decode('utf-8'),
 669                         'format':       u'NA',
 670                         'player_url':   None,
 671                 }]
 672
 673
 674 class GoogleIE(InfoExtractor):
 675         """Information extractor for video.google.com."""
 676
 677         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
 678         IE_NAME = u'video.google'
 679
 680         def __init__(self, downloader=None):
 681                 InfoExtractor.__init__(self, downloader)
 682
 683         def report_download_webpage(self, video_id):
 684                 """Report webpage download."""
 685                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
 686
 687         def report_extraction(self, video_id):
 688                 """Report information extraction."""
 689                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
 690
 691         def _real_extract(self, url):
 692                 # Extract id from URL
 693                 mobj = re.match(self._VALID_URL, url)
 694                 if mobj is None:
 695                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 696                         return
 697
 698                 video_id = mobj.group(1)
 699
 700                 video_extension = 'mp4'
 701
 702                 # Retrieve video webpage to extract further information
 703                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
 704                 try:
 705                         self.report_download_webpage(video_id)
 706                         webpage = urllib2.urlopen(request).read()
 707                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 708                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 709                         return
 710
 711                 # Extract URL, uploader, and title from webpage
 712                 self.report_extraction(video_id)
 713                 mobj = re.search(r"download_url:'([^']+)'", webpage)
 714                 if mobj is None:
 715                         video_extension = 'flv'
 716                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
 717                 if mobj is None:
 718                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 719                         return
 720                 mediaURL = urllib.unquote(mobj.group(1))
 721                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
 722                 mediaURL = mediaURL.replace('\\x26', '\x26')
 723
 724                 video_url = mediaURL
 725
 726                 mobj = re.search(r'<title>(.*)</title>', webpage)
 727                 if mobj is None:
 728                         self._downloader.trouble(u'ERROR: unable to extract title')
 729                         return
 730                 video_title = mobj.group(1).decode('utf-8')
 731
 732                 # Extract video description
 733                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
 734                 if mobj is None:
 735                         self._downloader.trouble(u'ERROR: unable to extract video description')
 736                         return
 737                 video_description = mobj.group(1).decode('utf-8')
 738                 if not video_description:
 739                         video_description = 'No description available.'
 740
 741                 # Extract video thumbnail
 742                 if self._downloader.params.get('forcethumbnail', False):
 743                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
 744                         try:
 745                                 webpage = urllib2.urlopen(request).read()
 746                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 747                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 748                                 return
 749                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
 750                         if mobj is None:
 751                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 752                                 return
 753                         video_thumbnail = mobj.group(1)
 754                 else:   # we need something to pass to process_info
 755                         video_thumbnail = ''
 756
 757                 return [{
 758                         'id':           video_id.decode('utf-8'),
 759                         'url':          video_url.decode('utf-8'),
 760                         'uploader':     u'NA',
 761                         'upload_date':  u'NA',
 762                         'title':        video_title,
 763                         'ext':          video_extension.decode('utf-8'),
 764                         'format':       u'NA',
 765                         'player_url':   None,
 766                 }]
 767
 768
 769 class PhotobucketIE(InfoExtractor):
 770         """Information extractor for photobucket.com."""
 771
 772         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 773         IE_NAME = u'photobucket'
 774
 775         def __init__(self, downloader=None):
 776                 InfoExtractor.__init__(self, downloader)
 777
 778         def report_download_webpage(self, video_id):
 779                 """Report webpage download."""
 780                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
 781
 782         def report_extraction(self, video_id):
 783                 """Report information extraction."""
 784                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
 785
 786         def _real_extract(self, url):
 787                 # Extract id from URL
 788                 mobj = re.match(self._VALID_URL, url)
 789                 if mobj is None:
 790                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 791                         return
 792
 793                 video_id = mobj.group(1)
 794
 795                 video_extension = 'flv'
 796
 797                 # Retrieve video webpage to extract further information
 798                 request = urllib2.Request(url)
 799                 try:
 800                         self.report_download_webpage(video_id)
 801                         webpage = urllib2.urlopen(request).read()
 802                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 803                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 804                         return
 805
 806                 # Extract URL, uploader, and title from webpage
 807                 self.report_extraction(video_id)
 808                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
 809                 if mobj is None:
 810                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 811                         return
 812                 mediaURL = urllib.unquote(mobj.group(1))
 813
 814                 video_url = mediaURL
 815
 816                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
 817                 if mobj is None:
 818                         self._downloader.trouble(u'ERROR: unable to extract title')
 819                         return
 820                 video_title = mobj.group(1).decode('utf-8')
 821
 822                 video_uploader = mobj.group(2).decode('utf-8')
 823
 824                 return [{
 825                         'id':           video_id.decode('utf-8'),
 826                         'url':          video_url.decode('utf-8'),
 827                         'uploader':     video_uploader,
 828                         'upload_date':  u'NA',
 829                         'title':        video_title,
 830                         'ext':          video_extension.decode('utf-8'),
 831                         'format':       u'NA',
 832                         'player_url':   None,
 833                 }]
 834
 835
 836 class YahooIE(InfoExtractor):
 837         """Information extractor for video.yahoo.com."""
 838
 839         # _VALID_URL matches all Yahoo! Video URLs
 840         # _VPAGE_URL matches only the extractable '/watch/' URLs
 841         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
 842         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
 843         IE_NAME = u'video.yahoo'
 844
 845         def __init__(self, downloader=None):
 846                 InfoExtractor.__init__(self, downloader)
 847
 848         def report_download_webpage(self, video_id):
 849                 """Report webpage download."""
 850                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
 851
 852         def report_extraction(self, video_id):
 853                 """Report information extraction."""
 854                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
 855
 856         def _real_extract(self, url, new_video=True):
 857                 # Extract ID from URL
 858                 mobj = re.match(self._VALID_URL, url)
 859                 if mobj is None:
 860                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 861                         return
 862
 863                 video_id = mobj.group(2)
 864                 video_extension = 'flv'
 865
 866                 # Rewrite valid but non-extractable URLs as
 867                 # extractable English language /watch/ URLs
 868                 if re.match(self._VPAGE_URL, url) is None:
 869                         request = urllib2.Request(url)
 870                         try:
 871                                 webpage = urllib2.urlopen(request).read()
 872                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 873                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 874                                 return
 875
 876                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
 877                         if mobj is None:
 878                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
 879                                 return
 880                         yahoo_id = mobj.group(1)
 881
 882                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
 883                         if mobj is None:
 884                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
 885                                 return
 886                         yahoo_vid = mobj.group(1)
 887
 888                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
 889                         return self._real_extract(url, new_video=False)
 890
 891                 # Retrieve video webpage to extract further information
 892                 request = urllib2.Request(url)
 893                 try:
 894                         self.report_download_webpage(video_id)
 895                         webpage = urllib2.urlopen(request).read()
 896                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 897                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 898                         return
 899
 900                 # Extract uploader and title from webpage
 901                 self.report_extraction(video_id)
 902                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
 903                 if mobj is None:
 904                         self._downloader.trouble(u'ERROR: unable to extract video title')
 905                         return
 906                 video_title = mobj.group(1).decode('utf-8')
 907
 908                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
 909                 if mobj is None:
 910                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
 911                         return
 912                 video_uploader = mobj.group(1).decode('utf-8')
 913
 914                 # Extract video thumbnail
 915                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
 916                 if mobj is None:
 917                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 918                         return
 919                 video_thumbnail = mobj.group(1).decode('utf-8')
 920
 921                 # Extract video description
 922                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
 923                 if mobj is None:
 924                         self._downloader.trouble(u'ERROR: unable to extract video description')
 925                         return
 926                 video_description = mobj.group(1).decode('utf-8')
 927                 if not video_description:
 928                         video_description = 'No description available.'
 929
 930                 # Extract video height and width
 931                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
 932                 if mobj is None:
 933                         self._downloader.trouble(u'ERROR: unable to extract video height')
 934                         return
 935                 yv_video_height = mobj.group(1)
 936
 937                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
 938                 if mobj is None:
 939                         self._downloader.trouble(u'ERROR: unable to extract video width')
 940                         return
 941                 yv_video_width = mobj.group(1)
 942
 943                 # Retrieve video playlist to extract media URL
 944                 # I'm not completely sure what all these options are, but we
 945                 # seem to need most of them, otherwise the server sends a 401.
 946                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
 947                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
 948                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
 949                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
 950                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
 951                 try:
 952                         self.report_download_webpage(video_id)
 953                         webpage = urllib2.urlopen(request).read()
 954                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 955                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 956                         return
 957
 958                 # Extract media URL from playlist XML
 959                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
 960                 if mobj is None:
 961                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
 962                         return
 963                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
 964                 video_url = unescapeHTML(video_url)
 965
 966                 return [{
 967                         'id':           video_id.decode('utf-8'),
 968                         'url':          video_url,
 969                         'uploader':     video_uploader,
 970                         'upload_date':  u'NA',
 971                         'title':        video_title,
 972                         'ext':          video_extension.decode('utf-8'),
 973                         'thumbnail':    video_thumbnail.decode('utf-8'),
 974                         'description':  video_description,
 975                         'thumbnail':    video_thumbnail,
 976                         'player_url':   None,
 977                 }]
 978
 979
 980 class VimeoIE(InfoExtractor):
 981         """Information extractor for vimeo.com."""
 982
 983         # _VALID_URL matches Vimeo URLs
 984         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
 985         IE_NAME = u'vimeo'
 986
 987         def __init__(self, downloader=None):
 988                 InfoExtractor.__init__(self, downloader)
 989
 990         def report_download_webpage(self, video_id):
 991                 """Report webpage download."""
 992                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
 993
 994         def report_extraction(self, video_id):
 995                 """Report information extraction."""
 996                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
 997
 998         def _real_extract(self, url, new_video=True):
 999                 # Extract ID from URL
1000                 mobj = re.match(self._VALID_URL, url)
1001                 if mobj is None:
1002                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1003                         return
1004
1005                 video_id = mobj.group(1)
1006
1007                 # Retrieve video webpage to extract further information
1008                 request = urllib2.Request(url, None, std_headers)
1009                 try:
1010                         self.report_download_webpage(video_id)
1011                         webpage = urllib2.urlopen(request).read()
1012                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1013                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1014                         return
1015
1016                 # Now we begin extracting as much information as we can from what we
1017                 # retrieved. First we extract the information common to all extractors,
1018                 # and latter we extract those that are Vimeo specific.
1019                 self.report_extraction(video_id)
1020
1021                 # Extract the config JSON
1022                 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1023                 try:
1024                         config = json.loads(config)
1025                 except:
1026                         self._downloader.trouble(u'ERROR: unable to extract info section')
1027                         return
1028
1029                 # Extract title
1030                 video_title = config["video"]["title"]
1031
1032                 # Extract uploader
1033                 video_uploader = config["video"]["owner"]["name"]
1034
1035                 # Extract video thumbnail
1036                 video_thumbnail = config["video"]["thumbnail"]
1037
1038                 # Extract video description
1039                 video_description = get_element_by_id("description", webpage.decode('utf8'))
1040                 if video_description: video_description = clean_html(video_description)
1041                 else: video_description = ''
1042
1043                 # Extract upload date
1044                 video_upload_date = u'NA'
1045                 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1046                 if mobj is not None:
1047                         video_upload_date = mobj.group(1)
1048
1049                 # Vimeo specific: extract request signature and timestamp
1050                 sig = config['request']['signature']
1051                 timestamp = config['request']['timestamp']
1052
1053                 # Vimeo specific: extract video codec and quality information
1054                 # TODO bind to format param
1055                 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1056                 for codec in codecs:
1057                         if codec[0] in config["video"]["files"]:
1058                                 video_codec = codec[0]
1059                                 video_extension = codec[1]
1060                                 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1061                                 else: quality = 'sd'
1062                                 break
1063                 else:
1064                         self._downloader.trouble(u'ERROR: no known codec found')
1065                         return
1066
1067                 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1068                                         %(video_id, sig, timestamp, quality, video_codec.upper())
1069
1070                 return [{
1071                         'id':           video_id,
1072                         'url':          video_url,
1073                         'uploader':     video_uploader,
1074                         'upload_date':  video_upload_date,
1075                         'title':        video_title,
1076                         'ext':          video_extension,
1077                         'thumbnail':    video_thumbnail,
1078                         'description':  video_description,
1079                         'player_url':   None,
1080                 }]
1081
1082
1083 class GenericIE(InfoExtractor):
1084         """Generic last-resort information extractor."""
1085
1086         _VALID_URL = r'.*'
1087         IE_NAME = u'generic'
1088
1089         def __init__(self, downloader=None):
1090                 InfoExtractor.__init__(self, downloader)
1091
1092         def report_download_webpage(self, video_id):
1093                 """Report webpage download."""
1094                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1095                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1096
1097         def report_extraction(self, video_id):
1098                 """Report information extraction."""
1099                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1100
1101         def report_following_redirect(self, new_url):
1102                 """Report information extraction."""
1103                 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1104
1105         def _test_redirect(self, url):
1106                 """Check if it is a redirect, like url shorteners, in case restart chain."""
1107                 class HeadRequest(urllib2.Request):
1108                         def get_method(self):
1109                                 return "HEAD"
1110
1111                 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1112                         """
1113                         Subclass the HTTPRedirectHandler to make it use our
1114                         HeadRequest also on the redirected URL
1115                         """
1116                         def redirect_request(self, req, fp, code, msg, headers, newurl):
1117                                 if code in (301, 302, 303, 307):
1118                                         newurl = newurl.replace(' ', '%20')
1119                                         newheaders = dict((k,v) for k,v in req.headers.items()
1120                                                                           if k.lower() not in ("content-length", "content-type"))
1121                                         return HeadRequest(newurl,
1122                                                                            headers=newheaders,
1123                                                                            origin_req_host=req.get_origin_req_host(),
1124                                                                            unverifiable=True)
1125                                 else:
1126                                         raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1127
1128                 class HTTPMethodFallback(urllib2.BaseHandler):
1129                         """
1130                         Fallback to GET if HEAD is not allowed (405 HTTP error)
1131                         """
1132                         def http_error_405(self, req, fp, code, msg, headers):
1133                                 fp.read()
1134                                 fp.close()
1135
1136                                 newheaders = dict((k,v) for k,v in req.headers.items()
1137                                                                   if k.lower() not in ("content-length", "content-type"))
1138                                 return self.parent.open(urllib2.Request(req.get_full_url(),
1139                                                                                                  headers=newheaders,
1140                                                                                                  origin_req_host=req.get_origin_req_host(),
1141                                                                                                  unverifiable=True))
1142
1143                 # Build our opener
1144                 opener = urllib2.OpenerDirector()
1145                 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1146                                                 HTTPMethodFallback, HEADRedirectHandler,
1147                                                 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1148                         opener.add_handler(handler())
1149
1150                 response = opener.open(HeadRequest(url))
1151                 new_url = response.geturl()
1152
1153                 if url == new_url: return False
1154
1155                 self.report_following_redirect(new_url)
1156                 self._downloader.download([new_url])
1157                 return True
1158
1159         def _real_extract(self, url):
1160                 if self._test_redirect(url): return
1161
1162                 video_id = url.split('/')[-1]
1163                 request = urllib2.Request(url)
1164                 try:
1165                         self.report_download_webpage(video_id)
1166                         webpage = urllib2.urlopen(request).read()
1167                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1168                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1169                         return
1170                 except ValueError, err:
1171                         # since this is the last-resort InfoExtractor, if
1172                         # this error is thrown, it'll be thrown here
1173                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1174                         return
1175
1176                 self.report_extraction(video_id)
1177                 # Start with something easy: JW Player in SWFObject
1178                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1179                 if mobj is None:
1180                         # Broaden the search a little bit
1181                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1182                 if mobj is None:
1183                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1184                         return
1185
1186                 # It's possible that one of the regexes
1187                 # matched, but returned an empty group:
1188                 if mobj.group(1) is None:
1189                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1190                         return
1191
1192                 video_url = urllib.unquote(mobj.group(1))
1193                 video_id = os.path.basename(video_url)
1194
1195                 # here's a fun little line of code for you:
1196                 video_extension = os.path.splitext(video_id)[1][1:]
1197                 video_id = os.path.splitext(video_id)[0]
1198
1199                 # it's tempting to parse this further, but you would
1200                 # have to take into account all the variations like
1201                 #   Video Title - Site Name
1202                 #   Site Name | Video Title
1203                 #   Video Title - Tagline | Site Name
1204                 # and so on and so forth; it's just not practical
1205                 mobj = re.search(r'<title>(.*)</title>', webpage)
1206                 if mobj is None:
1207                         self._downloader.trouble(u'ERROR: unable to extract title')
1208                         return
1209                 video_title = mobj.group(1).decode('utf-8')
1210
1211                 # video uploader is domain name
1212                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1213                 if mobj is None:
1214                         self._downloader.trouble(u'ERROR: unable to extract title')
1215                         return
1216                 video_uploader = mobj.group(1).decode('utf-8')
1217
1218                 return [{
1219                         'id':           video_id.decode('utf-8'),
1220                         'url':          video_url.decode('utf-8'),
1221                         'uploader':     video_uploader,
1222                         'upload_date':  u'NA',
1223                         'title':        video_title,
1224                         'ext':          video_extension.decode('utf-8'),
1225                         'format':       u'NA',
1226                         'player_url':   None,
1227                 }]
1228
1229
1230 class YoutubeSearchIE(InfoExtractor):
1231         """Information Extractor for YouTube search queries."""
1232         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1233         _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1234         _max_youtube_results = 1000
1235         IE_NAME = u'youtube:search'
1236
1237         def __init__(self, downloader=None):
1238                 InfoExtractor.__init__(self, downloader)
1239
1240         def report_download_page(self, query, pagenum):
1241                 """Report attempt to download search page with given number."""
1242                 query = query.decode(preferredencoding())
1243                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1244
1245         def _real_extract(self, query):
1246                 mobj = re.match(self._VALID_URL, query)
1247                 if mobj is None:
1248                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1249                         return
1250
1251                 prefix, query = query.split(':')
1252                 prefix = prefix[8:]
1253                 query = query.encode('utf-8')
1254                 if prefix == '':
1255                         self._download_n_results(query, 1)
1256                         return
1257                 elif prefix == 'all':
1258                         self._download_n_results(query, self._max_youtube_results)
1259                         return
1260                 else:
1261                         try:
1262                                 n = long(prefix)
1263                                 if n <= 0:
1264                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1265                                         return
1266                                 elif n > self._max_youtube_results:
1267                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1268                                         n = self._max_youtube_results
1269                                 self._download_n_results(query, n)
1270                                 return
1271                         except ValueError: # parsing prefix as integer fails
1272                                 self._download_n_results(query, 1)
1273                                 return
1274
1275         def _download_n_results(self, query, n):
1276                 """Downloads a specified number of results for a query"""
1277
1278                 video_ids = []
1279                 pagenum = 0
1280                 limit = n
1281
1282                 while (50 * pagenum) < limit:
1283                         self.report_download_page(query, pagenum+1)
1284                         result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1285                         request = urllib2.Request(result_url)
1286                         try:
1287                                 data = urllib2.urlopen(request).read()
1288                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1289                                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1290                                 return
1291                         api_response = json.loads(data)['data']
1292
1293                         new_ids = list(video['id'] for video in api_response['items'])
1294                         video_ids += new_ids
1295
1296                         limit = min(n, api_response['totalItems'])
1297                         pagenum += 1
1298
1299                 if len(video_ids) > n:
1300                         video_ids = video_ids[:n]
1301                 for id in video_ids:
1302                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1303                 return
1304
1305
1306 class GoogleSearchIE(InfoExtractor):
1307         """Information Extractor for Google Video search queries."""
1308         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1309         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1310         _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1311         _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1312         _max_google_results = 1000
1313         IE_NAME = u'video.google:search'
1314
1315         def __init__(self, downloader=None):
1316                 InfoExtractor.__init__(self, downloader)
1317
1318         def report_download_page(self, query, pagenum):
1319                 """Report attempt to download playlist page with given number."""
1320                 query = query.decode(preferredencoding())
1321                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1322
1323         def _real_extract(self, query):
1324                 mobj = re.match(self._VALID_URL, query)
1325                 if mobj is None:
1326                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1327                         return
1328
1329                 prefix, query = query.split(':')
1330                 prefix = prefix[8:]
1331                 query = query.encode('utf-8')
1332                 if prefix == '':
1333                         self._download_n_results(query, 1)
1334                         return
1335                 elif prefix == 'all':
1336                         self._download_n_results(query, self._max_google_results)
1337                         return
1338                 else:
1339                         try:
1340                                 n = long(prefix)
1341                                 if n <= 0:
1342                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1343                                         return
1344                                 elif n > self._max_google_results:
1345                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1346                                         n = self._max_google_results
1347                                 self._download_n_results(query, n)
1348                                 return
1349                         except ValueError: # parsing prefix as integer fails
1350                                 self._download_n_results(query, 1)
1351                                 return
1352
1353         def _download_n_results(self, query, n):
1354                 """Downloads a specified number of results for a query"""
1355
1356                 video_ids = []
1357                 pagenum = 0
1358
1359                 while True:
1360                         self.report_download_page(query, pagenum)
1361                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1362                         request = urllib2.Request(result_url)
1363                         try:
1364                                 page = urllib2.urlopen(request).read()
1365                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1366                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1367                                 return
1368
1369                         # Extract video identifiers
1370                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1371                                 video_id = mobj.group(1)
1372                                 if video_id not in video_ids:
1373                                         video_ids.append(video_id)
1374                                         if len(video_ids) == n:
1375                                                 # Specified n videos reached
1376                                                 for id in video_ids:
1377                                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1378                                                 return
1379
1380                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1381                                 for id in video_ids:
1382                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1383                                 return
1384
1385                         pagenum = pagenum + 1
1386
1387
1388 class YahooSearchIE(InfoExtractor):
1389         """Information Extractor for Yahoo! Video search queries."""
1390         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1391         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1392         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1393         _MORE_PAGES_INDICATOR = r'\s*Next'
1394         _max_yahoo_results = 1000
1395         IE_NAME = u'video.yahoo:search'
1396
1397         def __init__(self, downloader=None):
1398                 InfoExtractor.__init__(self, downloader)
1399
1400         def report_download_page(self, query, pagenum):
1401                 """Report attempt to download playlist page with given number."""
1402                 query = query.decode(preferredencoding())
1403                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1404
1405         def _real_extract(self, query):
1406                 mobj = re.match(self._VALID_URL, query)
1407                 if mobj is None:
1408                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1409                         return
1410
1411                 prefix, query = query.split(':')
1412                 prefix = prefix[8:]
1413                 query = query.encode('utf-8')
1414                 if prefix == '':
1415                         self._download_n_results(query, 1)
1416                         return
1417                 elif prefix == 'all':
1418                         self._download_n_results(query, self._max_yahoo_results)
1419                         return
1420                 else:
1421                         try:
1422                                 n = long(prefix)
1423                                 if n <= 0:
1424                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1425                                         return
1426                                 elif n > self._max_yahoo_results:
1427                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1428                                         n = self._max_yahoo_results
1429                                 self._download_n_results(query, n)
1430                                 return
1431                         except ValueError: # parsing prefix as integer fails
1432                                 self._download_n_results(query, 1)
1433                                 return
1434
1435         def _download_n_results(self, query, n):
1436                 """Downloads a specified number of results for a query"""
1437
1438                 video_ids = []
1439                 already_seen = set()
1440                 pagenum = 1
1441
1442                 while True:
1443                         self.report_download_page(query, pagenum)
1444                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1445                         request = urllib2.Request(result_url)
1446                         try:
1447                                 page = urllib2.urlopen(request).read()
1448                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1449                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1450                                 return
1451
1452                         # Extract video identifiers
1453                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1454                                 video_id = mobj.group(1)
1455                                 if video_id not in already_seen:
1456                                         video_ids.append(video_id)
1457                                         already_seen.add(video_id)
1458                                         if len(video_ids) == n:
1459                                                 # Specified n videos reached
1460                                                 for id in video_ids:
1461                                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1462                                                 return
1463
1464                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1465                                 for id in video_ids:
1466                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1467                                 return
1468
1469                         pagenum = pagenum + 1
1470
1471
1472 class YoutubePlaylistIE(InfoExtractor):
1473         """Information Extractor for YouTube playlists."""
1474
1475         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1476         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1477         _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;list=.*?%s'
1478         _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1479         IE_NAME = u'youtube:playlist'
1480
1481         def __init__(self, downloader=None):
1482                 InfoExtractor.__init__(self, downloader)
1483
1484         def report_download_page(self, playlist_id, pagenum):
1485                 """Report attempt to download playlist page with given number."""
1486                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1487
1488         def _real_extract(self, url):
1489                 # Extract playlist id
1490                 mobj = re.match(self._VALID_URL, url)
1491                 if mobj is None:
1492                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1493                         return
1494
1495                 # Single video case
1496                 if mobj.group(3) is not None:
1497                         self._downloader.download([mobj.group(3)])
1498                         return
1499
1500                 # Download playlist pages
1501                 # prefix is 'p' as default for playlists but there are other types that need extra care
1502                 playlist_prefix = mobj.group(1)
1503                 if playlist_prefix == 'a':
1504                         playlist_access = 'artist'
1505                 else:
1506                         playlist_prefix = 'p'
1507                         playlist_access = 'view_play_list'
1508                 playlist_id = mobj.group(2)
1509                 video_ids = []
1510                 pagenum = 1
1511
1512                 while True:
1513                         self.report_download_page(playlist_id, pagenum)
1514                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1515                         request = urllib2.Request(url)
1516                         try:
1517                                 page = urllib2.urlopen(request).read()
1518                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1519                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1520                                 return
1521
1522                         # Extract video identifiers
1523                         ids_in_page = []
1524                         for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1525                                 if mobj.group(1) not in ids_in_page:
1526                                         ids_in_page.append(mobj.group(1))
1527                         video_ids.extend(ids_in_page)
1528
1529                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1530                                 break
1531                         pagenum = pagenum + 1
1532
1533                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1534                 playlistend = self._downloader.params.get('playlistend', -1)
1535                 if playlistend == -1:
1536                         video_ids = video_ids[playliststart:]
1537                 else:
1538                         video_ids = video_ids[playliststart:playlistend]
1539
1540                 for id in video_ids:
1541                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1542                 return
1543
1544
1545 class YoutubeUserIE(InfoExtractor):
1546         """Information Extractor for YouTube users."""
1547
1548         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1549         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1550         _GDATA_PAGE_SIZE = 50
1551         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1552         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1553         IE_NAME = u'youtube:user'
1554
1555         def __init__(self, downloader=None):
1556                 InfoExtractor.__init__(self, downloader)
1557
1558         def report_download_page(self, username, start_index):
1559                 """Report attempt to download user page."""
1560                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1561                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1562
1563         def _real_extract(self, url):
1564                 # Extract username
1565                 mobj = re.match(self._VALID_URL, url)
1566                 if mobj is None:
1567                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1568                         return
1569
1570                 username = mobj.group(1)
1571
1572                 # Download video ids using YouTube Data API. Result size per
1573                 # query is limited (currently to 50 videos) so we need to query
1574                 # page by page until there are no video ids - it means we got
1575                 # all of them.
1576
1577                 video_ids = []
1578                 pagenum = 0
1579
1580                 while True:
1581                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1582                         self.report_download_page(username, start_index)
1583
1584                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1585
1586                         try:
1587                                 page = urllib2.urlopen(request).read()
1588                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1589                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1590                                 return
1591
1592                         # Extract video identifiers
1593                         ids_in_page = []
1594
1595                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1596                                 if mobj.group(1) not in ids_in_page:
1597                                         ids_in_page.append(mobj.group(1))
1598
1599                         video_ids.extend(ids_in_page)
1600
1601                         # A little optimization - if current page is not
1602                         # "full", ie. does not contain PAGE_SIZE video ids then
1603                         # we can assume that this page is the last one - there
1604                         # are no more ids on further pages - no need to query
1605                         # again.
1606
1607                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1608                                 break
1609
1610                         pagenum += 1
1611
1612                 all_ids_count = len(video_ids)
1613                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1614                 playlistend = self._downloader.params.get('playlistend', -1)
1615
1616                 if playlistend == -1:
1617                         video_ids = video_ids[playliststart:]
1618                 else:
1619                         video_ids = video_ids[playliststart:playlistend]
1620
1621                 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1622                                 (username, all_ids_count, len(video_ids)))
1623
1624                 for video_id in video_ids:
1625                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1626
1627
1628 class BlipTVUserIE(InfoExtractor):
1629         """Information Extractor for blip.tv users."""
1630
1631         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1632         _PAGE_SIZE = 12
1633         IE_NAME = u'blip.tv:user'
1634
1635         def __init__(self, downloader=None):
1636                 InfoExtractor.__init__(self, downloader)
1637
1638         def report_download_page(self, username, pagenum):
1639                 """Report attempt to download user page."""
1640                 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1641                                 (self.IE_NAME, username, pagenum))
1642
1643         def _real_extract(self, url):
1644                 # Extract username
1645                 mobj = re.match(self._VALID_URL, url)
1646                 if mobj is None:
1647                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1648                         return
1649
1650                 username = mobj.group(1)
1651
1652                 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1653
1654                 request = urllib2.Request(url)
1655
1656                 try:
1657                         page = urllib2.urlopen(request).read().decode('utf-8')
1658                         mobj = re.search(r'data-users-id="([^"]+)"', page)
1659                         page_base = page_base % mobj.group(1)
1660                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1661                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1662                         return
1663
1664
1665                 # Download video ids using BlipTV Ajax calls. Result size per
1666                 # query is limited (currently to 12 videos) so we need to query
1667                 # page by page until there are no video ids - it means we got
1668                 # all of them.
1669
1670                 video_ids = []
1671                 pagenum = 1
1672
1673                 while True:
1674                         self.report_download_page(username, pagenum)
1675
1676                         request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1677
1678                         try:
1679                                 page = urllib2.urlopen(request).read().decode('utf-8')
1680                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1681                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1682                                 return
1683
1684                         # Extract video identifiers
1685                         ids_in_page = []
1686
1687                         for mobj in re.finditer(r'href="/([^"]+)"', page):
1688                                 if mobj.group(1) not in ids_in_page:
1689                                         ids_in_page.append(unescapeHTML(mobj.group(1)))
1690
1691                         video_ids.extend(ids_in_page)
1692
1693                         # A little optimization - if current page is not
1694                         # "full", ie. does not contain PAGE_SIZE video ids then
1695                         # we can assume that this page is the last one - there
1696                         # are no more ids on further pages - no need to query
1697                         # again.
1698
1699                         if len(ids_in_page) < self._PAGE_SIZE:
1700                                 break
1701
1702                         pagenum += 1
1703
1704                 all_ids_count = len(video_ids)
1705                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1706                 playlistend = self._downloader.params.get('playlistend', -1)
1707
1708                 if playlistend == -1:
1709                         video_ids = video_ids[playliststart:]
1710                 else:
1711                         video_ids = video_ids[playliststart:playlistend]
1712
1713                 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1714                                 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1715
1716                 for video_id in video_ids:
1717                         self._downloader.download([u'http://blip.tv/'+video_id])
1718
1719
1720 class DepositFilesIE(InfoExtractor):
1721         """Information extractor for depositfiles.com"""
1722
1723         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1724         IE_NAME = u'DepositFiles'
1725
1726         def __init__(self, downloader=None):
1727                 InfoExtractor.__init__(self, downloader)
1728
1729         def report_download_webpage(self, file_id):
1730                 """Report webpage download."""
1731                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1732
1733         def report_extraction(self, file_id):
1734                 """Report information extraction."""
1735                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1736
1737         def _real_extract(self, url):
1738                 file_id = url.split('/')[-1]
1739                 # Rebuild url in english locale
1740                 url = 'http://depositfiles.com/en/files/' + file_id
1741
1742                 # Retrieve file webpage with 'Free download' button pressed
1743                 free_download_indication = { 'gateway_result' : '1' }
1744                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1745                 try:
1746                         self.report_download_webpage(file_id)
1747                         webpage = urllib2.urlopen(request).read()
1748                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1749                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1750                         return
1751
1752                 # Search for the real file URL
1753                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1754                 if (mobj is None) or (mobj.group(1) is None):
1755                         # Try to figure out reason of the error.
1756                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1757                         if (mobj is not None) and (mobj.group(1) is not None):
1758                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1759                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1760                         else:
1761                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1762                         return
1763
1764                 file_url = mobj.group(1)
1765                 file_extension = os.path.splitext(file_url)[1][1:]
1766
1767                 # Search for file title
1768                 mobj = re.search(r'<b title="(.*?)">', webpage)
1769                 if mobj is None:
1770                         self._downloader.trouble(u'ERROR: unable to extract title')
1771                         return
1772                 file_title = mobj.group(1).decode('utf-8')
1773
1774                 return [{
1775                         'id':           file_id.decode('utf-8'),
1776                         'url':          file_url.decode('utf-8'),
1777                         'uploader':     u'NA',
1778                         'upload_date':  u'NA',
1779                         'title':        file_title,
1780                         'ext':          file_extension.decode('utf-8'),
1781                         'format':       u'NA',
1782                         'player_url':   None,
1783                 }]
1784
1785
1786 class FacebookIE(InfoExtractor):
1787         """Information Extractor for Facebook"""
1788
1789         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1790         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1791         _NETRC_MACHINE = 'facebook'
1792         _available_formats = ['video', 'highqual', 'lowqual']
1793         _video_extensions = {
1794                 'video': 'mp4',
1795                 'highqual': 'mp4',
1796                 'lowqual': 'mp4',
1797         }
1798         IE_NAME = u'facebook'
1799
1800         def __init__(self, downloader=None):
1801                 InfoExtractor.__init__(self, downloader)
1802
1803         def _reporter(self, message):
1804                 """Add header and report message."""
1805                 self._downloader.to_screen(u'[facebook] %s' % message)
1806
1807         def report_login(self):
1808                 """Report attempt to log in."""
1809                 self._reporter(u'Logging in')
1810
1811         def report_video_webpage_download(self, video_id):
1812                 """Report attempt to download video webpage."""
1813                 self._reporter(u'%s: Downloading video webpage' % video_id)
1814
1815         def report_information_extraction(self, video_id):
1816                 """Report attempt to extract video information."""
1817                 self._reporter(u'%s: Extracting video information' % video_id)
1818
1819         def _parse_page(self, video_webpage):
1820                 """Extract video information from page"""
1821                 # General data
1822                 data = {'title': r'\("video_title", "(.*?)"\)',
1823                         'description': r'<div class="datawrap">(.*?)</div>',
1824                         'owner': r'\("video_owner_name", "(.*?)"\)',
1825                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1826                         }
1827                 video_info = {}
1828                 for piece in data.keys():
1829                         mobj = re.search(data[piece], video_webpage)
1830                         if mobj is not None:
1831                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1832
1833                 # Video urls
1834                 video_urls = {}
1835                 for fmt in self._available_formats:
1836                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1837                         if mobj is not None:
1838                                 # URL is in a Javascript segment inside an escaped Unicode format within
1839                                 # the generally utf-8 page
1840                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1841                 video_info['video_urls'] = video_urls
1842
1843                 return video_info
1844
1845         def _real_initialize(self):
1846                 if self._downloader is None:
1847                         return
1848
1849                 useremail = None
1850                 password = None
1851                 downloader_params = self._downloader.params
1852
1853                 # Attempt to use provided username and password or .netrc data
1854                 if downloader_params.get('username', None) is not None:
1855                         useremail = downloader_params['username']
1856                         password = downloader_params['password']
1857                 elif downloader_params.get('usenetrc', False):
1858                         try:
1859                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1860                                 if info is not None:
1861                                         useremail = info[0]
1862                                         password = info[2]
1863                                 else:
1864                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1865                         except (IOError, netrc.NetrcParseError), err:
1866                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1867                                 return
1868
1869                 if useremail is None:
1870                         return
1871
1872                 # Log in
1873                 login_form = {
1874                         'email': useremail,
1875                         'pass': password,
1876                         'login': 'Log+In'
1877                         }
1878                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1879                 try:
1880                         self.report_login()
1881                         login_results = urllib2.urlopen(request).read()
1882                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1883                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1884                                 return
1885                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1886                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1887                         return
1888
1889         def _real_extract(self, url):
1890                 mobj = re.match(self._VALID_URL, url)
1891                 if mobj is None:
1892                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1893                         return
1894                 video_id = mobj.group('ID')
1895
1896                 # Get video webpage
1897                 self.report_video_webpage_download(video_id)
1898                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
1899                 try:
1900                         page = urllib2.urlopen(request)
1901                         video_webpage = page.read()
1902                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1903                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1904                         return
1905
1906                 # Start extracting information
1907                 self.report_information_extraction(video_id)
1908
1909                 # Extract information
1910                 video_info = self._parse_page(video_webpage)
1911
1912                 # uploader
1913                 if 'owner' not in video_info:
1914                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1915                         return
1916                 video_uploader = video_info['owner']
1917
1918                 # title
1919                 if 'title' not in video_info:
1920                         self._downloader.trouble(u'ERROR: unable to extract video title')
1921                         return
1922                 video_title = video_info['title']
1923                 video_title = video_title.decode('utf-8')
1924
1925                 # thumbnail image
1926                 if 'thumbnail' not in video_info:
1927                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1928                         video_thumbnail = ''
1929                 else:
1930                         video_thumbnail = video_info['thumbnail']
1931
1932                 # upload date
1933                 upload_date = u'NA'
1934                 if 'upload_date' in video_info:
1935                         upload_time = video_info['upload_date']
1936                         timetuple = email.utils.parsedate_tz(upload_time)
1937                         if timetuple is not None:
1938                                 try:
1939                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
1940                                 except:
1941                                         pass
1942
1943                 # description
1944                 video_description = video_info.get('description', 'No description available.')
1945
1946                 url_map = video_info['video_urls']
1947                 if len(url_map.keys()) > 0:
1948                         # Decide which formats to download
1949                         req_format = self._downloader.params.get('format', None)
1950                         format_limit = self._downloader.params.get('format_limit', None)
1951
1952                         if format_limit is not None and format_limit in self._available_formats:
1953                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1954                         else:
1955                                 format_list = self._available_formats
1956                         existing_formats = [x for x in format_list if x in url_map]
1957                         if len(existing_formats) == 0:
1958                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1959                                 return
1960                         if req_format is None:
1961                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1962                         elif req_format == 'worst':
1963                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1964                         elif req_format == '-1':
1965                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1966                         else:
1967                                 # Specific format
1968                                 if req_format not in url_map:
1969                                         self._downloader.trouble(u'ERROR: requested format not available')
1970                                         return
1971                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1972
1973                 results = []
1974                 for format_param, video_real_url in video_url_list:
1975                         # Extension
1976                         video_extension = self._video_extensions.get(format_param, 'mp4')
1977
1978                         results.append({
1979                                 'id':           video_id.decode('utf-8'),
1980                                 'url':          video_real_url.decode('utf-8'),
1981                                 'uploader':     video_uploader.decode('utf-8'),
1982                                 'upload_date':  upload_date,
1983                                 'title':        video_title,
1984                                 'ext':          video_extension.decode('utf-8'),
1985                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1986                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1987                                 'description':  video_description.decode('utf-8'),
1988                                 'player_url':   None,
1989                         })
1990                 return results
1991
1992 class BlipTVIE(InfoExtractor):
1993         """Information extractor for blip.tv"""
1994
1995         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
1996         _URL_EXT = r'^.*\.([a-z0-9]+)$'
1997         IE_NAME = u'blip.tv'
1998
1999         def report_extraction(self, file_id):
2000                 """Report information extraction."""
2001                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2002
2003         def report_direct_download(self, title):
2004                 """Report information extraction."""
2005                 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2006
2007         def _real_extract(self, url):
2008                 mobj = re.match(self._VALID_URL, url)
2009                 if mobj is None:
2010                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2011                         return
2012
2013                 if '?' in url:
2014                         cchar = '&'
2015                 else:
2016                         cchar = '?'
2017                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2018                 request = urllib2.Request(json_url.encode('utf-8'))
2019                 self.report_extraction(mobj.group(1))
2020                 info = None
2021                 try:
2022                         urlh = urllib2.urlopen(request)
2023                         if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2024                                 basename = url.split('/')[-1]
2025                                 title,ext = os.path.splitext(basename)
2026                                 title = title.decode('UTF-8')
2027                                 ext = ext.replace('.', '')
2028                                 self.report_direct_download(title)
2029                                 info = {
2030                                         'id': title,
2031                                         'url': url,
2032                                         'title': title,
2033                                         'ext': ext,
2034                                         'urlhandle': urlh
2035                                 }
2036                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2037                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2038                         return
2039                 if info is None: # Regular URL
2040                         try:
2041                                 json_code = urlh.read()
2042                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2043                                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2044                                 return
2045
2046                         try:
2047                                 json_data = json.loads(json_code)
2048                                 if 'Post' in json_data:
2049                                         data = json_data['Post']
2050                                 else:
2051                                         data = json_data
2052
2053                                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2054                                 video_url = data['media']['url']
2055                                 umobj = re.match(self._URL_EXT, video_url)
2056                                 if umobj is None:
2057                                         raise ValueError('Can not determine filename extension')
2058                                 ext = umobj.group(1)
2059
2060                                 info = {
2061                                         'id': data['item_id'],
2062                                         'url': video_url,
2063                                         'uploader': data['display_name'],
2064                                         'upload_date': upload_date,
2065                                         'title': data['title'],
2066                                         'ext': ext,
2067                                         'format': data['media']['mimeType'],
2068                                         'thumbnail': data['thumbnailUrl'],
2069                                         'description': data['description'],
2070                                         'player_url': data['embedUrl']
2071                                 }
2072                         except (ValueError,KeyError), err:
2073                                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2074                                 return
2075
2076                 std_headers['User-Agent'] = 'iTunes/10.6.1'
2077                 return [info]
2078
2079
2080 class MyVideoIE(InfoExtractor):
2081         """Information Extractor for myvideo.de."""
2082
2083         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2084         IE_NAME = u'myvideo'
2085
2086         def __init__(self, downloader=None):
2087                 InfoExtractor.__init__(self, downloader)
2088
2089         def report_download_webpage(self, video_id):
2090                 """Report webpage download."""
2091                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2092
2093         def report_extraction(self, video_id):
2094                 """Report information extraction."""
2095                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2096
2097         def _real_extract(self,url):
2098                 mobj = re.match(self._VALID_URL, url)
2099                 if mobj is None:
2100                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2101                         return
2102
2103                 video_id = mobj.group(1)
2104
2105                 # Get video webpage
2106                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2107                 try:
2108                         self.report_download_webpage(video_id)
2109                         webpage = urllib2.urlopen(request).read()
2110                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2111                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2112                         return
2113
2114                 self.report_extraction(video_id)
2115                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2116                                  webpage)
2117                 if mobj is None:
2118                         self._downloader.trouble(u'ERROR: unable to extract media URL')
2119                         return
2120                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2121
2122                 mobj = re.search('<title>([^<]+)</title>', webpage)
2123                 if mobj is None:
2124                         self._downloader.trouble(u'ERROR: unable to extract title')
2125                         return
2126
2127                 video_title = mobj.group(1)
2128
2129                 return [{
2130                         'id':           video_id,
2131                         'url':          video_url,
2132                         'uploader':     u'NA',
2133                         'upload_date':  u'NA',
2134                         'title':        video_title,
2135                         'ext':          u'flv',
2136                         'format':       u'NA',
2137                         'player_url':   None,
2138                 }]
2139
2140 class ComedyCentralIE(InfoExtractor):
2141         """Information extractor for The Daily Show and Colbert Report """
2142
2143         _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2144         IE_NAME = u'comedycentral'
2145
2146         def report_extraction(self, episode_id):
2147                 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2148
2149         def report_config_download(self, episode_id):
2150                 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2151
2152         def report_index_download(self, episode_id):
2153                 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2154
2155         def report_player_url(self, episode_id):
2156                 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2157
2158         def _real_extract(self, url):
2159                 mobj = re.match(self._VALID_URL, url)
2160                 if mobj is None:
2161                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2162                         return
2163
2164                 if mobj.group('shortname'):
2165                         if mobj.group('shortname') in ('tds', 'thedailyshow'):
2166                                 url = u'http://www.thedailyshow.com/full-episodes/'
2167                         else:
2168                                 url = u'http://www.colbertnation.com/full-episodes/'
2169                         mobj = re.match(self._VALID_URL, url)
2170                         assert mobj is not None
2171
2172                 dlNewest = not mobj.group('episode')
2173                 if dlNewest:
2174                         epTitle = mobj.group('showname')
2175                 else:
2176                         epTitle = mobj.group('episode')
2177
2178                 req = urllib2.Request(url)
2179                 self.report_extraction(epTitle)
2180                 try:
2181                         htmlHandle = urllib2.urlopen(req)
2182                         html = htmlHandle.read()
2183                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2184                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2185                         return
2186                 if dlNewest:
2187                         url = htmlHandle.geturl()
2188                         mobj = re.match(self._VALID_URL, url)
2189                         if mobj is None:
2190                                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2191                                 return
2192                         if mobj.group('episode') == '':
2193                                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2194                                 return
2195                         epTitle = mobj.group('episode')
2196
2197                 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2198                 if len(mMovieParams) == 0:
2199                         self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2200                         return
2201
2202                 playerUrl_raw = mMovieParams[0][0]
2203                 self.report_player_url(epTitle)
2204                 try:
2205                         urlHandle = urllib2.urlopen(playerUrl_raw)
2206                         playerUrl = urlHandle.geturl()
2207                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2208                         self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
2209                         return
2210
2211                 uri = mMovieParams[0][1]
2212                 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2213                 self.report_index_download(epTitle)
2214                 try:
2215                         indexXml = urllib2.urlopen(indexUrl).read()
2216                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2217                         self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2218                         return
2219
2220                 results = []
2221
2222                 idoc = xml.etree.ElementTree.fromstring(indexXml)
2223                 itemEls = idoc.findall('.//item')
2224                 for itemEl in itemEls:
2225                         mediaId = itemEl.findall('./guid')[0].text
2226                         shortMediaId = mediaId.split(':')[-1]
2227                         showId = mediaId.split(':')[-2].replace('.com', '')
2228                         officialTitle = itemEl.findall('./title')[0].text
2229                         officialDate = itemEl.findall('./pubDate')[0].text
2230
2231                         configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2232                                                 urllib.urlencode({'uri': mediaId}))
2233                         configReq = urllib2.Request(configUrl)
2234                         self.report_config_download(epTitle)
2235                         try:
2236                                 configXml = urllib2.urlopen(configReq).read()
2237                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2238                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2239                                 return
2240
2241                         cdoc = xml.etree.ElementTree.fromstring(configXml)
2242                         turls = []
2243                         for rendition in cdoc.findall('.//rendition'):
2244                                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2245                                 turls.append(finfo)
2246
2247                         if len(turls) == 0:
2248                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2249                                 continue
2250
2251                         # For now, just pick the highest bitrate
2252                         format,video_url = turls[-1]
2253
2254                         effTitle = showId + u'-' + epTitle
2255                         info = {
2256                                 'id': shortMediaId,
2257                                 'url': video_url,
2258                                 'uploader': showId,
2259                                 'upload_date': officialDate,
2260                                 'title': effTitle,
2261                                 'ext': 'mp4',
2262                                 'format': format,
2263                                 'thumbnail': None,
2264                                 'description': officialTitle,
2265                                 'player_url': playerUrl
2266                         }
2267
2268                         results.append(info)
2269
2270                 return results
2271
2272
2273 class EscapistIE(InfoExtractor):
2274         """Information extractor for The Escapist """
2275
2276         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2277         IE_NAME = u'escapist'
2278
2279         def report_extraction(self, showName):
2280                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2281
2282         def report_config_download(self, showName):
2283                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2284
2285         def _real_extract(self, url):
2286                 mobj = re.match(self._VALID_URL, url)
2287                 if mobj is None:
2288                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2289                         return
2290                 showName = mobj.group('showname')
2291                 videoId = mobj.group('episode')
2292
2293                 self.report_extraction(showName)
2294                 try:
2295                         webPage = urllib2.urlopen(url)
2296                         webPageBytes = webPage.read()
2297                         m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2298                         webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2299                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2300                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2301                         return
2302
2303                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2304                 description = unescapeHTML(descMatch.group(1))
2305                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2306                 imgUrl = unescapeHTML(imgMatch.group(1))
2307                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2308                 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2309                 configUrlMatch = re.search('config=(.*)$', playerUrl)
2310                 configUrl = urllib2.unquote(configUrlMatch.group(1))
2311
2312                 self.report_config_download(showName)
2313                 try:
2314                         configJSON = urllib2.urlopen(configUrl).read()
2315                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2316                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2317                         return
2318
2319                 # Technically, it's JavaScript, not JSON
2320                 configJSON = configJSON.replace("'", '"')
2321
2322                 try:
2323                         config = json.loads(configJSON)
2324                 except (ValueError,), err:
2325                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2326                         return
2327
2328                 playlist = config['playlist']
2329                 videoUrl = playlist[1]['url']
2330
2331                 info = {
2332                         'id': videoId,
2333                         'url': videoUrl,
2334                         'uploader': showName,
2335                         'upload_date': None,
2336                         'title': showName,
2337                         'ext': 'flv',
2338                         'format': 'flv',
2339                         'thumbnail': imgUrl,
2340                         'description': description,
2341                         'player_url': playerUrl,
2342                 }
2343
2344                 return [info]
2345
2346
2347 class CollegeHumorIE(InfoExtractor):
2348         """Information extractor for collegehumor.com"""
2349
2350         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2351         IE_NAME = u'collegehumor'
2352
2353         def report_webpage(self, video_id):
2354                 """Report information extraction."""
2355                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2356
2357         def report_extraction(self, video_id):
2358                 """Report information extraction."""
2359                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2360
2361         def _real_extract(self, url):
2362                 mobj = re.match(self._VALID_URL, url)
2363                 if mobj is None:
2364                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2365                         return
2366                 video_id = mobj.group('videoid')
2367
2368                 self.report_webpage(video_id)
2369                 request = urllib2.Request(url)
2370                 try:
2371                         webpage = urllib2.urlopen(request).read()
2372                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2373                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2374                         return
2375
2376                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2377                 if m is None:
2378                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2379                         return
2380                 internal_video_id = m.group('internalvideoid')
2381
2382                 info = {
2383                         'id': video_id,
2384                         'internal_id': internal_video_id,
2385                 }
2386
2387                 self.report_extraction(video_id)
2388                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2389                 try:
2390                         metaXml = urllib2.urlopen(xmlUrl).read()
2391                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2392                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2393                         return
2394
2395                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2396                 try:
2397                         videoNode = mdoc.findall('./video')[0]
2398                         info['description'] = videoNode.findall('./description')[0].text
2399                         info['title'] = videoNode.findall('./caption')[0].text
2400                         info['url'] = videoNode.findall('./file')[0].text
2401                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2402                         info['ext'] = info['url'].rpartition('.')[2]
2403                         info['format'] = info['ext']
2404                 except IndexError:
2405                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2406                         return
2407
2408                 return [info]
2409
2410
2411 class XVideosIE(InfoExtractor):
2412         """Information extractor for xvideos.com"""
2413
2414         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2415         IE_NAME = u'xvideos'
2416
2417         def report_webpage(self, video_id):
2418                 """Report information extraction."""
2419                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2420
2421         def report_extraction(self, video_id):
2422                 """Report information extraction."""
2423                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2424
2425         def _real_extract(self, url):
2426                 mobj = re.match(self._VALID_URL, url)
2427                 if mobj is None:
2428                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2429                         return
2430                 video_id = mobj.group(1).decode('utf-8')
2431
2432                 self.report_webpage(video_id)
2433
2434                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2435                 try:
2436                         webpage = urllib2.urlopen(request).read()
2437                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2438                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2439                         return
2440
2441                 self.report_extraction(video_id)
2442
2443
2444                 # Extract video URL
2445                 mobj = re.search(r'flv_url=(.+?)&', webpage)
2446                 if mobj is None:
2447                         self._downloader.trouble(u'ERROR: unable to extract video url')
2448                         return
2449                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2450
2451
2452                 # Extract title
2453                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2454                 if mobj is None:
2455                         self._downloader.trouble(u'ERROR: unable to extract video title')
2456                         return
2457                 video_title = mobj.group(1).decode('utf-8')
2458
2459
2460                 # Extract video thumbnail
2461                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2462                 if mobj is None:
2463                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2464                         return
2465                 video_thumbnail = mobj.group(0).decode('utf-8')
2466
2467                 info = {
2468                         'id': video_id,
2469                         'url': video_url,
2470                         'uploader': None,
2471                         'upload_date': None,
2472                         'title': video_title,
2473                         'ext': 'flv',
2474                         'format': 'flv',
2475                         'thumbnail': video_thumbnail,
2476                         'description': None,
2477                         'player_url': None,
2478                 }
2479
2480                 return [info]
2481
2482
2483 class SoundcloudIE(InfoExtractor):
2484         """Information extractor for soundcloud.com
2485            To access the media, the uid of the song and a stream token
2486            must be extracted from the page source and the script must make
2487            a request to media.soundcloud.com/crossdomain.xml. Then
2488            the media can be grabbed by requesting from an url composed
2489            of the stream token and uid
2490          """
2491
2492         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2493         IE_NAME = u'soundcloud'
2494
2495         def __init__(self, downloader=None):
2496                 InfoExtractor.__init__(self, downloader)
2497
2498         def report_webpage(self, video_id):
2499                 """Report information extraction."""
2500                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2501
2502         def report_extraction(self, video_id):
2503                 """Report information extraction."""
2504                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2505
2506         def _real_extract(self, url):
2507                 mobj = re.match(self._VALID_URL, url)
2508                 if mobj is None:
2509                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2510                         return
2511
2512                 # extract uploader (which is in the url)
2513                 uploader = mobj.group(1).decode('utf-8')
2514                 # extract simple title (uploader + slug of song title)
2515                 slug_title =  mobj.group(2).decode('utf-8')
2516                 simple_title = uploader + u'-' + slug_title
2517
2518                 self.report_webpage('%s/%s' % (uploader, slug_title))
2519
2520                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2521                 try:
2522                         webpage = urllib2.urlopen(request).read()
2523                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2524                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2525                         return
2526
2527                 self.report_extraction('%s/%s' % (uploader, slug_title))
2528
2529                 # extract uid and stream token that soundcloud hands out for access
2530                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2531                 if mobj:
2532                         video_id = mobj.group(1)
2533                         stream_token = mobj.group(2)
2534
2535                 # extract unsimplified title
2536                 mobj = re.search('"title":"(.*?)",', webpage)
2537                 if mobj:
2538                         title = mobj.group(1).decode('utf-8')
2539                 else:
2540                         title = simple_title
2541
2542                 # construct media url (with uid/token)
2543                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2544                 mediaURL = mediaURL % (video_id, stream_token)
2545
2546                 # description
2547                 description = u'No description available'
2548                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2549                 if mobj:
2550                         description = mobj.group(1)
2551
2552                 # upload date
2553                 upload_date = None
2554                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2555                 if mobj:
2556                         try:
2557                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2558                         except Exception, e:
2559                                 self._downloader.to_stderr(str(e))
2560
2561                 # for soundcloud, a request to a cross domain is required for cookies
2562                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2563
2564                 return [{
2565                         'id':           video_id.decode('utf-8'),
2566                         'url':          mediaURL,
2567                         'uploader':     uploader.decode('utf-8'),
2568                         'upload_date':  upload_date,
2569                         'title':        title,
2570                         'ext':          u'mp3',
2571                         'format':       u'NA',
2572                         'player_url':   None,
2573                         'description': description.decode('utf-8')
2574                 }]
2575
2576
2577 class InfoQIE(InfoExtractor):
2578         """Information extractor for infoq.com"""
2579
2580         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2581         IE_NAME = u'infoq'
2582
2583         def report_webpage(self, video_id):
2584                 """Report information extraction."""
2585                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2586
2587         def report_extraction(self, video_id):
2588                 """Report information extraction."""
2589                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2590
2591         def _real_extract(self, url):
2592                 mobj = re.match(self._VALID_URL, url)
2593                 if mobj is None:
2594                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2595                         return
2596
2597                 self.report_webpage(url)
2598
2599                 request = urllib2.Request(url)
2600                 try:
2601                         webpage = urllib2.urlopen(request).read()
2602                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2603                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2604                         return
2605
2606                 self.report_extraction(url)
2607
2608
2609                 # Extract video URL
2610                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2611                 if mobj is None:
2612                         self._downloader.trouble(u'ERROR: unable to extract video url')
2613                         return
2614                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2615
2616
2617                 # Extract title
2618                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2619                 if mobj is None:
2620                         self._downloader.trouble(u'ERROR: unable to extract video title')
2621                         return
2622                 video_title = mobj.group(1).decode('utf-8')
2623
2624                 # Extract description
2625                 video_description = u'No description available.'
2626                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2627                 if mobj is not None:
2628                         video_description = mobj.group(1).decode('utf-8')
2629
2630                 video_filename = video_url.split('/')[-1]
2631                 video_id, extension = video_filename.split('.')
2632
2633                 info = {
2634                         'id': video_id,
2635                         'url': video_url,
2636                         'uploader': None,
2637                         'upload_date': None,
2638                         'title': video_title,
2639                         'ext': extension,
2640                         'format': extension, # Extension is always(?) mp4, but seems to be flv
2641                         'thumbnail': None,
2642                         'description': video_description,
2643                         'player_url': None,
2644                 }
2645
2646                 return [info]
2647
2648 class MixcloudIE(InfoExtractor):
2649         """Information extractor for www.mixcloud.com"""
2650         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2651         IE_NAME = u'mixcloud'
2652
2653         def __init__(self, downloader=None):
2654                 InfoExtractor.__init__(self, downloader)
2655
2656         def report_download_json(self, file_id):
2657                 """Report JSON download."""
2658                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2659
2660         def report_extraction(self, file_id):
2661                 """Report information extraction."""
2662                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2663
2664         def get_urls(self, jsonData, fmt, bitrate='best'):
2665                 """Get urls from 'audio_formats' section in json"""
2666                 file_url = None
2667                 try:
2668                         bitrate_list = jsonData[fmt]
2669                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2670                                 bitrate = max(bitrate_list) # select highest
2671
2672                         url_list = jsonData[fmt][bitrate]
2673                 except TypeError: # we have no bitrate info.
2674                         url_list = jsonData[fmt]
2675                 return url_list
2676
2677         def check_urls(self, url_list):
2678                 """Returns 1st active url from list"""
2679                 for url in url_list:
2680                         try:
2681                                 urllib2.urlopen(url)
2682                                 return url
2683                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2684                                 url = None
2685
2686                 return None
2687
2688         def _print_formats(self, formats):
2689                 print 'Available formats:'
2690                 for fmt in formats.keys():
2691                         for b in formats[fmt]:
2692                                 try:
2693                                         ext = formats[fmt][b][0]
2694                                         print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2695                                 except TypeError: # we have no bitrate info
2696                                         ext = formats[fmt][0]
2697                                         print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2698                                         break
2699
2700         def _real_extract(self, url):
2701                 mobj = re.match(self._VALID_URL, url)
2702                 if mobj is None:
2703                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2704                         return
2705                 # extract uploader & filename from url
2706                 uploader = mobj.group(1).decode('utf-8')
2707                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2708
2709                 # construct API request
2710                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2711                 # retrieve .json file with links to files
2712                 request = urllib2.Request(file_url)
2713                 try:
2714                         self.report_download_json(file_url)
2715                         jsonData = urllib2.urlopen(request).read()
2716                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2717                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2718                         return
2719
2720                 # parse JSON
2721                 json_data = json.loads(jsonData)
2722                 player_url = json_data['player_swf_url']
2723                 formats = dict(json_data['audio_formats'])
2724
2725                 req_format = self._downloader.params.get('format', None)
2726                 bitrate = None
2727
2728                 if self._downloader.params.get('listformats', None):
2729                         self._print_formats(formats)
2730                         return
2731
2732                 if req_format is None or req_format == 'best':
2733                         for format_param in formats.keys():
2734                                 url_list = self.get_urls(formats, format_param)
2735                                 # check urls
2736                                 file_url = self.check_urls(url_list)
2737                                 if file_url is not None:
2738                                         break # got it!
2739                 else:
2740                         if req_format not in formats.keys():
2741                                 self._downloader.trouble(u'ERROR: format is not available')
2742                                 return
2743
2744                         url_list = self.get_urls(formats, req_format)
2745                         file_url = self.check_urls(url_list)
2746                         format_param = req_format
2747
2748                 return [{
2749                         'id': file_id.decode('utf-8'),
2750                         'url': file_url.decode('utf-8'),
2751                         'uploader':     uploader.decode('utf-8'),
2752                         'upload_date': u'NA',
2753                         'title': json_data['name'],
2754                         'ext': file_url.split('.')[-1].decode('utf-8'),
2755                         'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2756                         'thumbnail': json_data['thumbnail_url'],
2757                         'description': json_data['description'],
2758                         'player_url': player_url.decode('utf-8'),
2759                 }]
2760
2761 class StanfordOpenClassroomIE(InfoExtractor):
2762         """Information extractor for Stanford's Open ClassRoom"""
2763
2764         _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2765         IE_NAME = u'stanfordoc'
2766
2767         def report_download_webpage(self, objid):
2768                 """Report information extraction."""
2769                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2770
2771         def report_extraction(self, video_id):
2772                 """Report information extraction."""
2773                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2774
2775         def _real_extract(self, url):
2776                 mobj = re.match(self._VALID_URL, url)
2777                 if mobj is None:
2778                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2779                         return
2780
2781                 if mobj.group('course') and mobj.group('video'): # A specific video
2782                         course = mobj.group('course')
2783                         video = mobj.group('video')
2784                         info = {
2785                                 'id': course + '_' + video,
2786                         }
2787
2788                         self.report_extraction(info['id'])
2789                         baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2790                         xmlUrl = baseUrl + video + '.xml'
2791                         try:
2792                                 metaXml = urllib2.urlopen(xmlUrl).read()
2793                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2794                                 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2795                                 return
2796                         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2797                         try:
2798                                 info['title'] = mdoc.findall('./title')[0].text
2799                                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2800                         except IndexError:
2801                                 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2802                                 return
2803                         info['ext'] = info['url'].rpartition('.')[2]
2804                         info['format'] = info['ext']
2805                         return [info]
2806                 elif mobj.group('course'): # A course page
2807                         course = mobj.group('course')
2808                         info = {
2809                                 'id': course,
2810                                 'type': 'playlist',
2811                         }
2812
2813                         self.report_download_webpage(info['id'])
2814                         try:
2815                                 coursepage = urllib2.urlopen(url).read()
2816                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2817                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2818                                 return
2819
2820                         m = re.search('<h1>([^<]+)</h1>', coursepage)
2821                         if m:
2822                                 info['title'] = unescapeHTML(m.group(1))
2823                         else:
2824                                 info['title'] = info['id']
2825
2826                         m = re.search('<description>([^<]+)</description>', coursepage)
2827                         if m:
2828                                 info['description'] = unescapeHTML(m.group(1))
2829
2830                         links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2831                         info['list'] = [
2832                                 {
2833                                         'type': 'reference',
2834                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2835                                 }
2836                                         for vpage in links]
2837                         results = []
2838                         for entry in info['list']:
2839                                 assert entry['type'] == 'reference'
2840                                 results += self.extract(entry['url'])
2841                         return results
2842
2843                 else: # Root page
2844                         info = {
2845                                 'id': 'Stanford OpenClassroom',
2846                                 'type': 'playlist',
2847                         }
2848
2849                         self.report_download_webpage(info['id'])
2850                         rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2851                         try:
2852                                 rootpage = urllib2.urlopen(rootURL).read()
2853                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2854                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2855                                 return
2856
2857                         info['title'] = info['id']
2858
2859                         links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2860                         info['list'] = [
2861                                 {
2862                                         'type': 'reference',
2863                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2864                                 }
2865                                         for cpage in links]
2866
2867                         results = []
2868                         for entry in info['list']:
2869                                 assert entry['type'] == 'reference'
2870                                 results += self.extract(entry['url'])
2871                         return results
2872
2873 class MTVIE(InfoExtractor):
2874         """Information extractor for MTV.com"""
2875
2876         _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2877         IE_NAME = u'mtv'
2878
2879         def report_webpage(self, video_id):
2880                 """Report information extraction."""
2881                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2882
2883         def report_extraction(self, video_id):
2884                 """Report information extraction."""
2885                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2886
2887         def _real_extract(self, url):
2888                 mobj = re.match(self._VALID_URL, url)
2889                 if mobj is None:
2890                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2891                         return
2892                 if not mobj.group('proto'):
2893                         url = 'http://' + url
2894                 video_id = mobj.group('videoid')
2895                 self.report_webpage(video_id)
2896
2897                 request = urllib2.Request(url)
2898                 try:
2899                         webpage = urllib2.urlopen(request).read()
2900                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2901                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2902                         return
2903
2904                 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2905                 if mobj is None:
2906                         self._downloader.trouble(u'ERROR: unable to extract song name')
2907                         return
2908                 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2909                 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2910                 if mobj is None:
2911                         self._downloader.trouble(u'ERROR: unable to extract performer')
2912                         return
2913                 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2914                 video_title = performer + ' - ' + song_name
2915
2916                 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2917                 if mobj is None:
2918                         self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2919                         return
2920                 mtvn_uri = mobj.group(1)
2921
2922                 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2923                 if mobj is None:
2924                         self._downloader.trouble(u'ERROR: unable to extract content id')
2925                         return
2926                 content_id = mobj.group(1)
2927
2928                 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2929                 self.report_extraction(video_id)
2930                 request = urllib2.Request(videogen_url)
2931                 try:
2932                         metadataXml = urllib2.urlopen(request).read()
2933                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2934                         self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2935                         return
2936
2937                 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2938                 renditions = mdoc.findall('.//rendition')
2939
2940                 # For now, always pick the highest quality.
2941                 rendition = renditions[-1]
2942
2943                 try:
2944                         _,_,ext = rendition.attrib['type'].partition('/')
2945                         format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2946                         video_url = rendition.find('./src').text
2947                 except KeyError:
2948                         self._downloader.trouble('Invalid rendition field.')
2949                         return
2950
2951                 info = {
2952                         'id': video_id,
2953                         'url': video_url,
2954                         'uploader': performer,
2955                         'title': video_title,
2956                         'ext': ext,
2957                         'format': format,
2958                 }
2959
2960                 return [info]
2961
2962
2963 class YoukuIE(InfoExtractor):
2964
2965         _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
2966         IE_NAME = u'Youku'
2967
2968         def __init__(self, downloader=None):
2969                 InfoExtractor.__init__(self, downloader)
2970
2971         def report_download_webpage(self, file_id):
2972                 """Report webpage download."""
2973                 self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)
2974
2975         def report_extraction(self, file_id):
2976                 """Report information extraction."""
2977                 self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)
2978
2979         def _gen_sid(self):
2980                 nowTime = int(time.time() * 1000)
2981                 random1 = random.randint(1000,1998)
2982                 random2 = random.randint(1000,9999)
2983
2984                 return "%d%d%d" %(nowTime,random1,random2)
2985
2986         def _get_file_ID_mix_string(self, seed):
2987                 mixed = []
2988                 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2989                 seed = float(seed)
2990                 for i in range(len(source)):
2991                         seed  =  (seed * 211 + 30031 ) % 65536
2992                         index  =  math.floor(seed / 65536 * len(source) )
2993                         mixed.append(source[int(index)])
2994                         source.remove(source[int(index)])
2995                 #return ''.join(mixed)
2996                 return mixed
2997
2998         def _get_file_id(self, fileId, seed):
2999                 mixed = self._get_file_ID_mix_string(seed)
3000                 ids = fileId.split('*')
3001                 realId = []
3002                 for ch in ids:
3003                         if ch:
3004                                 realId.append(mixed[int(ch)])
3005                 return ''.join(realId)
3006
3007         def _real_extract(self, url):
3008                 mobj = re.match(self._VALID_URL, url)
3009                 if mobj is None:
3010                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3011                         return
3012                 video_id = mobj.group('ID')
3013
3014                 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3015
3016                 request = urllib2.Request(info_url, None, std_headers)
3017                 try:
3018                         self.report_download_webpage(video_id)
3019                         jsondata = urllib2.urlopen(request).read()
3020                 except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
3021                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3022                         return
3023
3024                 self.report_extraction(video_id)
3025                 try:
3026                         config = json.loads(jsondata)
3027
3028                         video_title =  config['data'][0]['title']
3029                         seed = config['data'][0]['seed']
3030
3031                         format = self._downloader.params.get('format', None)
3032                         supported_format = config['data'][0]['streamfileids'].keys()
3033
3034                         if format is None or format == 'best':
3035                                 if 'hd2' in supported_format:
3036                                         format = 'hd2'
3037                                 else:
3038                                         format = 'flv'
3039                                 ext = u'flv'
3040                         elif format == 'worst':
3041                                 format = 'mp4'
3042                                 ext = u'mp4'
3043                         else:
3044                                 format = 'flv'
3045                                 ext = u'flv'
3046
3047
3048                         fileid = config['data'][0]['streamfileids'][format]
3049                         seg_number = len(config['data'][0]['segs'][format])
3050
3051                         keys=[]
3052                         for i in xrange(seg_number):
3053                                 keys.append(config['data'][0]['segs'][format][i]['k'])
3054
3055                         #TODO check error
3056                         #youku only could be viewed from mainland china
3057                 except:
3058                         self._downloader.trouble(u'ERROR: unable to extract info section')
3059                         return
3060
3061                 files_info=[]
3062                 sid = self._gen_sid()
3063                 fileid = self._get_file_id(fileid, seed)
3064
3065                 #column 8,9 of fileid represent the segment number
3066                 #fileid[7:9] should be changed
3067                 for index, key in enumerate(keys):
3068
3069                         temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3070                         download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3071
3072                         info = {
3073                                 'id': '%s_part%02d' % (video_id, index),
3074                                 'url': download_url,
3075                                 'uploader': None,
3076                                 'title': video_title,
3077                                 'ext': ext,
3078                                 'format': u'NA'
3079                         }
3080                         files_info.append(info)
3081
3082                 return files_info
3083
3084
3085 class XNXXIE(InfoExtractor):
3086         """Information extractor for xnxx.com"""
3087
3088         _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
3089         IE_NAME = u'xnxx'
3090         VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3091         VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3092         VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3093
3094         def report_webpage(self, video_id):
3095                 """Report information extraction"""
3096                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3097
3098         def report_extraction(self, video_id):
3099                 """Report information extraction"""
3100                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3101
3102         def _real_extract(self, url):
3103                 mobj = re.match(self._VALID_URL, url)
3104                 if mobj is None:
3105                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3106                         return
3107                 video_id = mobj.group(1).decode('utf-8')
3108
3109                 self.report_webpage(video_id)
3110
3111                 # Get webpage content
3112                 try:
3113                         webpage = urllib2.urlopen(url).read()
3114                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3115                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3116                         return
3117
3118                 result = re.search(self.VIDEO_URL_RE, webpage)
3119                 if result is None:
3120                         self._downloader.trouble(u'ERROR: unable to extract video url')
3121                         return
3122                 video_url = urllib.unquote(result.group(1).decode('utf-8'))
3123
3124                 result = re.search(self.VIDEO_TITLE_RE, webpage)
3125                 if result is None:
3126                         self._downloader.trouble(u'ERROR: unable to extract video title')
3127                         return
3128                 video_title = result.group(1).decode('utf-8')
3129
3130                 result = re.search(self.VIDEO_THUMB_RE, webpage)
3131                 if result is None:
3132                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3133                         return
3134                 video_thumbnail = result.group(1).decode('utf-8')
3135
3136                 info = {'id': video_id,
3137                                 'url': video_url,
3138                                 'uploader': None,
3139                                 'upload_date': None,
3140                                 'title': video_title,
3141                                 'ext': 'flv',
3142                                 'format': 'flv',
3143                                 'thumbnail': video_thumbnail,
3144                                 'description': None,
3145                                 'player_url': None}
3146
3147                 return [info]