Add arte.tv Info Extractor
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import datetime
5 import HTMLParser
6 import httplib
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import urllib
13 import urllib2
14 import email.utils
15 import xml.etree.ElementTree
16 from urlparse import parse_qs
17
18 try:
19         import cStringIO as StringIO
20 except ImportError:
21         import StringIO
22
23 from utils import *
24
25
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor takes a URL and produces a list of
    dictionaries describing the video(s) behind it; the FileDownloader
    consumes those dictionaries and performs the actual download.
    Each dictionary must carry the following keys:

    id:             Video identifier.
    url:            Final video URL.
    uploader:       Nickname of the video uploader.
    title:          Literal title.
    ext:            Video filename extension.
    format:         Video format.
    player_url:     SWF Player URL (may be None).

    The following keys are optional and mainly support video-search
    front ends (such as the one in youtube2mp3); they are consulted
    only when their respective forced-printing functions are called:

    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.

    Concrete subclasses must define a _VALID_URL regexp and redefine
    _real_initialize() and _real_extract(); they should normally also
    be registered in the list of extractors.
    """

    # One-shot initialization flag and the downloader this IE reports to.
    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Create the extractor, optionally attaching a downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Return True if this IE can handle the given URL."""
        return re.match(self._VALID_URL, url) is not None

    def initialize(self):
        """Run one-time setup (authentication, etc.) exactly once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then extract info dicts for the URL."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the downloader this IE should report through."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
94
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Group 1 captures the URL prefix (if any); group 2 is the video id.
    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
    # Forces the site into English/US so the scraped markup is predictable.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Captures the next_url query parameter used by redirect (e.g. age-gate) pages.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    # Same itags, reordered so free (WebM) formats win at equal quality.
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; anything absent defaults to 'flv' at use sites.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> 'HEIGHTxWIDTH' (height first, e.g. '22' is 720p at 1280 wide).
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'
136
137         def report_lang(self):
138                 """Report attempt to set language."""
139                 self._downloader.to_screen(u'[youtube] Setting language')
140
141         def report_login(self):
142                 """Report attempt to log in."""
143                 self._downloader.to_screen(u'[youtube] Logging in')
144
145         def report_age_confirmation(self):
146                 """Report attempt to confirm age."""
147                 self._downloader.to_screen(u'[youtube] Confirming age')
148
149         def report_video_webpage_download(self, video_id):
150                 """Report attempt to download video webpage."""
151                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
152
153         def report_video_info_webpage_download(self, video_id):
154                 """Report attempt to download video info webpage."""
155                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
156
157         def report_video_subtitles_download(self, video_id):
158                 """Report attempt to download video info webpage."""
159                 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
160
161         def report_information_extraction(self, video_id):
162                 """Report attempt to extract video information."""
163                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
164
165         def report_unavailable_format(self, video_id, format):
166                 """Report extracted video URL."""
167                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
168
169         def report_rtmp_download(self):
170                 """Indicate the download will use the RTMP protocol."""
171                 self._downloader.to_screen(u'[youtube] RTMP download detected')
172
173         def _closed_captions_xml_to_srt(self, xml_string):
174                 srt = ''
175                 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
176                 # TODO parse xml instead of regex
177                 for n, (start, dur_tag, dur, caption) in enumerate(texts):
178                         if not dur: dur = '4'
179                         start = float(start)
180                         end = start + float(dur)
181                         start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
182                         end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
183                         caption = unescapeHTML(caption)
184                         caption = unescapeHTML(caption) # double cycle, intentional
185                         srt += str(n+1) + '\n'
186                         srt += start + ' --> ' + end + '\n'
187                         srt += caption + '\n\n'
188                 return srt
189
    def _print_formats(self, formats):
        """Print one line per itag: 'itag : extension [dimensions]'."""
        print 'Available formats:'
        for x in formats:
            # Unknown itags fall back to 'flv' and '???' dimensions.
            print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
194
    def _real_initialize(self):
        """Set the site language, optionally log in, and confirm age.

        Credentials come from downloader params ('username'/'password')
        or, with 'usenetrc', from the user's .netrc.  Every network
        failure is reported as a warning/error and aborts quietly.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        # Set language (cookies keep it for the later scraping requests)
        request = urllib2.Request(self._LANG_URL)
        try:
            self.report_lang()
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':         '/',
                'action_login': 'Log In',
                'username':     username,
                'password':     password,
                }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # If the login form is still present, the POST was rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
                'next_url':             '/',
                'action_confirm':       'Confirm',
                }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return
263
    def _real_extract(self, url):
        """Scrape a watch page and return a list of info dicts, one per
        selected format.  Returns None (after calling trouble()) on any
        fatal extraction failure.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Undo the JSON backslash-escaping (\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' variants until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            try:
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalize separators, then try several date formats.
        upload_date = u'NA'
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    # NOTE(review): bare except; once one format matches, the
                    # remaining attempts fail harmlessly on the converted value.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # closed captions (best effort: failures are warnings, not fatal)
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            try:
                self.report_video_subtitles_download(video_id)
                request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                try:
                    srt_list = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                # Map lang_code -> track name.
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    srt_lang = 'en'
                else:
                    # Arbitrary fallback: first available language.
                    srt_lang = srt_lang_list.keys()[0]
                if not srt_lang in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                try:
                    srt_xml = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                if not srt_xml:
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                self._downloader.trouble(trouble[0])

        # token
        # NOTE(review): video_token is computed but not used below.
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            # itag -> direct media URL for every advertised stream.
            url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Cap quality: keep only formats at or below the limit.
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            results.append({
                'id':           video_id.decode('utf-8'),
                'url':          video_real_url.decode('utf-8'),
                'uploader':     video_uploader.decode('utf-8'),
                'upload_date':  upload_date,
                'title':        video_title,
                'ext':          video_extension.decode('utf-8'),
                'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail':    video_thumbnail.decode('utf-8'),
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles
            })
        return results
461
462
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Group 1 is the numeric video id, group 2 the URL slug.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Family-filter disclaimer page fetched before extraction.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    # Endpoint the disclaimer form is POSTed to (disables the filter).
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'
470
471         def __init__(self, downloader=None):
472                 InfoExtractor.__init__(self, downloader)
473
474         def report_disclaimer(self):
475                 """Report disclaimer retrieval."""
476                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
477
478         def report_age_confirmation(self):
479                 """Report attempt to confirm age."""
480                 self._downloader.to_screen(u'[metacafe] Confirming age')
481
482         def report_download_webpage(self, video_id):
483                 """Report webpage download."""
484                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
485
486         def report_extraction(self, video_id):
487                 """Report information extraction."""
488                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
489
    def _real_initialize(self):
        """Fetch the family-filter disclaimer, then POST the form that
        disables the filter (stored in cookies for later requests)."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return
512
    def _real_extract(self, url):
        """Extract the media URL, title and uploader from a watch page
        and return a single-element list of info dicts."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube; 'yt-' ids are delegated to
        # the YouTube extractor through the downloader.
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fallback: media data is embedded in the flashvars parameter.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            # Undo JSON escaping of slashes in the URL.
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':           video_id.decode('utf-8'),
            'url':          video_url.decode('utf-8'),
            'uploader':     video_uploader.decode('utf-8'),
            'upload_date':  u'NA',
            'title':        video_title,
            'ext':          video_extension.decode('utf-8'),
            'format':       u'NA',
            'player_url':   None,
        }]
590
591
592 class DailymotionIE(InfoExtractor):
593         """Information Extractor for Dailymotion"""
594
595         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
596         IE_NAME = u'dailymotion'
597
598         def __init__(self, downloader=None):
599                 InfoExtractor.__init__(self, downloader)
600
601         def report_download_webpage(self, video_id):
602                 """Report webpage download."""
603                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
604
605         def report_extraction(self, video_id):
606                 """Report information extraction."""
607                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
608
609         def _real_extract(self, url):
610                 # Extract id and simplified title from URL
611                 mobj = re.match(self._VALID_URL, url)
612                 if mobj is None:
613                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
614                         return
615
616                 video_id = mobj.group(1)
617
618                 video_extension = 'flv'
619
620                 # Retrieve video webpage to extract further information
621                 request = urllib2.Request(url)
622                 request.add_header('Cookie', 'family_filter=off')
623                 try:
624                         self.report_download_webpage(video_id)
625                         webpage = urllib2.urlopen(request).read()
626                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
627                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
628                         return
629
630                 # Extract URL, uploader and title from webpage
631                 self.report_extraction(video_id)
632                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
633                 if mobj is None:
634                         self._downloader.trouble(u'ERROR: unable to extract media URL')
635                         return
636                 sequence = urllib.unquote(mobj.group(1))
637                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
638                 if mobj is None:
639                         self._downloader.trouble(u'ERROR: unable to extract media URL')
640                         return
641                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
642
643                 # if needed add http://www.dailymotion.com/ if relative URL
644
645                 video_url = mediaURL
646
647                 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
648                 if mobj is None:
649                         self._downloader.trouble(u'ERROR: unable to extract title')
650                         return
651                 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
652
653                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
654                 if mobj is None:
655                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
656                         return
657                 video_uploader = mobj.group(1)
658
659                 return [{
660                         'id':           video_id.decode('utf-8'),
661                         'url':          video_url.decode('utf-8'),
662                         'uploader':     video_uploader.decode('utf-8'),
663                         'upload_date':  u'NA',
664                         'title':        video_title,
665                         'ext':          video_extension.decode('utf-8'),
666                         'format':       u'NA',
667                         'player_url':   None,
668                 }]
669
670
671 class GoogleIE(InfoExtractor):
672         """Information extractor for video.google.com."""
673
674         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
675         IE_NAME = u'video.google'
676
677         def __init__(self, downloader=None):
678                 InfoExtractor.__init__(self, downloader)
679
680         def report_download_webpage(self, video_id):
681                 """Report webpage download."""
682                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
683
684         def report_extraction(self, video_id):
685                 """Report information extraction."""
686                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
687
688         def _real_extract(self, url):
689                 # Extract id from URL
690                 mobj = re.match(self._VALID_URL, url)
691                 if mobj is None:
692                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
693                         return
694
695                 video_id = mobj.group(1)
696
697                 video_extension = 'mp4'
698
699                 # Retrieve video webpage to extract further information
700                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
701                 try:
702                         self.report_download_webpage(video_id)
703                         webpage = urllib2.urlopen(request).read()
704                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
705                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
706                         return
707
708                 # Extract URL, uploader, and title from webpage
709                 self.report_extraction(video_id)
710                 mobj = re.search(r"download_url:'([^']+)'", webpage)
711                 if mobj is None:
712                         video_extension = 'flv'
713                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
714                 if mobj is None:
715                         self._downloader.trouble(u'ERROR: unable to extract media URL')
716                         return
717                 mediaURL = urllib.unquote(mobj.group(1))
718                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
719                 mediaURL = mediaURL.replace('\\x26', '\x26')
720
721                 video_url = mediaURL
722
723                 mobj = re.search(r'<title>(.*)</title>', webpage)
724                 if mobj is None:
725                         self._downloader.trouble(u'ERROR: unable to extract title')
726                         return
727                 video_title = mobj.group(1).decode('utf-8')
728
729                 # Extract video description
730                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
731                 if mobj is None:
732                         self._downloader.trouble(u'ERROR: unable to extract video description')
733                         return
734                 video_description = mobj.group(1).decode('utf-8')
735                 if not video_description:
736                         video_description = 'No description available.'
737
738                 # Extract video thumbnail
739                 if self._downloader.params.get('forcethumbnail', False):
740                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
741                         try:
742                                 webpage = urllib2.urlopen(request).read()
743                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
744                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
745                                 return
746                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
747                         if mobj is None:
748                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
749                                 return
750                         video_thumbnail = mobj.group(1)
751                 else:   # we need something to pass to process_info
752                         video_thumbnail = ''
753
754                 return [{
755                         'id':           video_id.decode('utf-8'),
756                         'url':          video_url.decode('utf-8'),
757                         'uploader':     u'NA',
758                         'upload_date':  u'NA',
759                         'title':        video_title,
760                         'ext':          video_extension.decode('utf-8'),
761                         'format':       u'NA',
762                         'player_url':   None,
763                 }]
764
765
766 class PhotobucketIE(InfoExtractor):
767         """Information extractor for photobucket.com."""
768
769         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
770         IE_NAME = u'photobucket'
771
772         def __init__(self, downloader=None):
773                 InfoExtractor.__init__(self, downloader)
774
775         def report_download_webpage(self, video_id):
776                 """Report webpage download."""
777                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
778
779         def report_extraction(self, video_id):
780                 """Report information extraction."""
781                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
782
783         def _real_extract(self, url):
784                 # Extract id from URL
785                 mobj = re.match(self._VALID_URL, url)
786                 if mobj is None:
787                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
788                         return
789
790                 video_id = mobj.group(1)
791
792                 video_extension = 'flv'
793
794                 # Retrieve video webpage to extract further information
795                 request = urllib2.Request(url)
796                 try:
797                         self.report_download_webpage(video_id)
798                         webpage = urllib2.urlopen(request).read()
799                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
800                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
801                         return
802
803                 # Extract URL, uploader, and title from webpage
804                 self.report_extraction(video_id)
805                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
806                 if mobj is None:
807                         self._downloader.trouble(u'ERROR: unable to extract media URL')
808                         return
809                 mediaURL = urllib.unquote(mobj.group(1))
810
811                 video_url = mediaURL
812
813                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
814                 if mobj is None:
815                         self._downloader.trouble(u'ERROR: unable to extract title')
816                         return
817                 video_title = mobj.group(1).decode('utf-8')
818
819                 video_uploader = mobj.group(2).decode('utf-8')
820
821                 return [{
822                         'id':           video_id.decode('utf-8'),
823                         'url':          video_url.decode('utf-8'),
824                         'uploader':     video_uploader,
825                         'upload_date':  u'NA',
826                         'title':        video_title,
827                         'ext':          video_extension.decode('utf-8'),
828                         'format':       u'NA',
829                         'player_url':   None,
830                 }]
831
832
833 class YahooIE(InfoExtractor):
834         """Information extractor for video.yahoo.com."""
835
836         # _VALID_URL matches all Yahoo! Video URLs
837         # _VPAGE_URL matches only the extractable '/watch/' URLs
838         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
839         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
840         IE_NAME = u'video.yahoo'
841
842         def __init__(self, downloader=None):
843                 InfoExtractor.__init__(self, downloader)
844
845         def report_download_webpage(self, video_id):
846                 """Report webpage download."""
847                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
848
849         def report_extraction(self, video_id):
850                 """Report information extraction."""
851                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
852
853         def _real_extract(self, url, new_video=True):
854                 # Extract ID from URL
855                 mobj = re.match(self._VALID_URL, url)
856                 if mobj is None:
857                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
858                         return
859
860                 video_id = mobj.group(2)
861                 video_extension = 'flv'
862
863                 # Rewrite valid but non-extractable URLs as
864                 # extractable English language /watch/ URLs
865                 if re.match(self._VPAGE_URL, url) is None:
866                         request = urllib2.Request(url)
867                         try:
868                                 webpage = urllib2.urlopen(request).read()
869                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
870                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
871                                 return
872
873                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
874                         if mobj is None:
875                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
876                                 return
877                         yahoo_id = mobj.group(1)
878
879                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
880                         if mobj is None:
881                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
882                                 return
883                         yahoo_vid = mobj.group(1)
884
885                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
886                         return self._real_extract(url, new_video=False)
887
888                 # Retrieve video webpage to extract further information
889                 request = urllib2.Request(url)
890                 try:
891                         self.report_download_webpage(video_id)
892                         webpage = urllib2.urlopen(request).read()
893                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
894                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
895                         return
896
897                 # Extract uploader and title from webpage
898                 self.report_extraction(video_id)
899                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
900                 if mobj is None:
901                         self._downloader.trouble(u'ERROR: unable to extract video title')
902                         return
903                 video_title = mobj.group(1).decode('utf-8')
904
905                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
906                 if mobj is None:
907                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
908                         return
909                 video_uploader = mobj.group(1).decode('utf-8')
910
911                 # Extract video thumbnail
912                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
913                 if mobj is None:
914                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
915                         return
916                 video_thumbnail = mobj.group(1).decode('utf-8')
917
918                 # Extract video description
919                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
920                 if mobj is None:
921                         self._downloader.trouble(u'ERROR: unable to extract video description')
922                         return
923                 video_description = mobj.group(1).decode('utf-8')
924                 if not video_description:
925                         video_description = 'No description available.'
926
927                 # Extract video height and width
928                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
929                 if mobj is None:
930                         self._downloader.trouble(u'ERROR: unable to extract video height')
931                         return
932                 yv_video_height = mobj.group(1)
933
934                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
935                 if mobj is None:
936                         self._downloader.trouble(u'ERROR: unable to extract video width')
937                         return
938                 yv_video_width = mobj.group(1)
939
940                 # Retrieve video playlist to extract media URL
941                 # I'm not completely sure what all these options are, but we
942                 # seem to need most of them, otherwise the server sends a 401.
943                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
944                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
945                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
946                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
947                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
948                 try:
949                         self.report_download_webpage(video_id)
950                         webpage = urllib2.urlopen(request).read()
951                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
952                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
953                         return
954
955                 # Extract media URL from playlist XML
956                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
957                 if mobj is None:
958                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
959                         return
960                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
961                 video_url = unescapeHTML(video_url)
962
963                 return [{
964                         'id':           video_id.decode('utf-8'),
965                         'url':          video_url,
966                         'uploader':     video_uploader,
967                         'upload_date':  u'NA',
968                         'title':        video_title,
969                         'ext':          video_extension.decode('utf-8'),
970                         'thumbnail':    video_thumbnail.decode('utf-8'),
971                         'description':  video_description,
972                         'thumbnail':    video_thumbnail,
973                         'player_url':   None,
974                 }]
975
976
977 class VimeoIE(InfoExtractor):
978         """Information extractor for vimeo.com."""
979
980         # _VALID_URL matches Vimeo URLs
981         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
982         IE_NAME = u'vimeo'
983
984         def __init__(self, downloader=None):
985                 InfoExtractor.__init__(self, downloader)
986
987         def report_download_webpage(self, video_id):
988                 """Report webpage download."""
989                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
990
991         def report_extraction(self, video_id):
992                 """Report information extraction."""
993                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
994
995         def _real_extract(self, url, new_video=True):
996                 # Extract ID from URL
997                 mobj = re.match(self._VALID_URL, url)
998                 if mobj is None:
999                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1000                         return
1001
1002                 video_id = mobj.group(1)
1003
1004                 # Retrieve video webpage to extract further information
1005                 request = urllib2.Request(url, None, std_headers)
1006                 try:
1007                         self.report_download_webpage(video_id)
1008                         webpage = urllib2.urlopen(request).read()
1009                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1010                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1011                         return
1012
1013                 # Now we begin extracting as much information as we can from what we
1014                 # retrieved. First we extract the information common to all extractors,
1015                 # and latter we extract those that are Vimeo specific.
1016                 self.report_extraction(video_id)
1017
1018                 # Extract the config JSON
1019                 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1020                 try:
1021                         config = json.loads(config)
1022                 except:
1023                         self._downloader.trouble(u'ERROR: unable to extract info section')
1024                         return
1025                 
1026                 # Extract title
1027                 video_title = config["video"]["title"]
1028
1029                 # Extract uploader
1030                 video_uploader = config["video"]["owner"]["name"]
1031
1032                 # Extract video thumbnail
1033                 video_thumbnail = config["video"]["thumbnail"]
1034
1035                 # Extract video description
1036                 video_description = get_element_by_id("description", webpage.decode('utf8'))
1037                 if video_description: video_description = clean_html(video_description)
1038                 else: video_description = ''
1039
1040                 # Extract upload date
1041                 video_upload_date = u'NA'
1042                 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1043                 if mobj is not None:
1044                         video_upload_date = mobj.group(1)
1045
1046                 # Vimeo specific: extract request signature and timestamp
1047                 sig = config['request']['signature']
1048                 timestamp = config['request']['timestamp']
1049
1050                 # Vimeo specific: extract video codec and quality information
1051                 # TODO bind to format param
1052                 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1053                 for codec in codecs:
1054                         if codec[0] in config["video"]["files"]:
1055                                 video_codec = codec[0]
1056                                 video_extension = codec[1]
1057                                 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1058                                 else: quality = 'sd'
1059                                 break
1060                 else:
1061                         self._downloader.trouble(u'ERROR: no known codec found')
1062                         return
1063
1064                 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1065                                         %(video_id, sig, timestamp, quality, video_codec.upper())
1066
1067                 return [{
1068                         'id':           video_id,
1069                         'url':          video_url,
1070                         'uploader':     video_uploader,
1071                         'upload_date':  video_upload_date,
1072                         'title':        video_title,
1073                         'ext':          video_extension,
1074                         'thumbnail':    video_thumbnail,
1075                         'description':  video_description,
1076                         'player_url':   None,
1077                 }]
1078
1079
1080 class ArteTvIE(InfoExtractor):
1081         """arte.tv information extractor."""
1082
1083         _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1084         _LIVE_URL = r'index-[0-9]+\.html$'
1085
1086         IE_NAME = u'arte.tv'
1087
1088         def __init__(self, downloader=None):
1089                 InfoExtractor.__init__(self, downloader)
1090
1091         def report_download_webpage(self, video_id):
1092                 """Report webpage download."""
1093                 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1094
1095         def report_extraction(self, video_id):
1096                 """Report information extraction."""
1097                 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
1098
1099         def fetch_webpage(self, url):
1100                 self._downloader.increment_downloads()
1101                 request = urllib2.Request(url)
1102                 try:
1103                         self.report_download_webpage(url)
1104                         webpage = urllib2.urlopen(request).read()
1105                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1106                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1107                         return
1108                 except ValueError, err:
1109                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1110                         return
1111                 return webpage
1112
1113         def grep_webpage(self, url, regex, regexFlags, matchTuples):
1114                 page = self.fetch_webpage(url)
1115                 mobj = re.search(regex, page, regexFlags)
1116                 info = {}
1117
1118                 if mobj is None:
1119                     self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1120                     return
1121
1122                 for (i, key, err) in matchTuples:
1123                     if mobj.group(i) is None:
1124                         self._downloader.trouble(err)
1125                         return
1126                     else:
1127                         info[key] = mobj.group(i)
1128
1129                 return info
1130
1131         def extractLiveStream(self, url):
1132
1133                 video_lang = url.split('/')[-4]
1134
1135                 info = self.grep_webpage(
1136                     url,
1137                     r'src="(.*?/videothek_js.*?\.js)',
1138                     0,
1139                     [
1140                         (1, 'url', u'ERROR: Invalid URL: %s' % url)
1141                     ]
1142                 )
1143
1144                 http_host = url.split('/')[2]
1145                 next_url = 'http://%s%s' % (http_host, urllib.unquote(info.get('url')))
1146
1147                 info = self.grep_webpage(
1148                     next_url,
1149                     r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1150                      '(http://.*?\.swf).*?' +
1151                      '(rtmp://.*?)\'',
1152                     re.DOTALL,
1153                     [
1154                         (1, 'path',   u'ERROR: could not extract video path: %s' % url),
1155                         (2, 'player', u'ERROR: could not extract video player: %s' % url),
1156                         (3, 'url',    u'ERROR: could not extract video url: %s' % url)
1157                     ]
1158                 )
1159
1160                 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1161
1162                 print u'rtmpdump --swfVfy \'%s\' --rtmp \'%s\' --live -o arte-live.mp4' % (info.get('player'), video_url)
1163
1164         def extractPlus7Stream(self, url):
1165
1166                 video_lang = url.split('/')[-3]
1167
1168                 info = self.grep_webpage(
1169                     url,
1170                     r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1171                     0,
1172                     [
1173                         (1, 'url', u'ERROR: Invalid URL: %s' % url)
1174                     ]
1175                 )
1176
1177                 next_url = urllib.unquote(info.get('url'))
1178
1179                 info = self.grep_webpage(
1180                     next_url,
1181                     r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1182                     0,
1183                     [
1184                         (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1185                     ]
1186                 )
1187
1188                 next_url = urllib.unquote(info.get('url'))
1189
1190                 info = self.grep_webpage(
1191                     next_url,
1192                     r'<video id="(.*?)".*?>.*?' +
1193                      '<name>(.*?)</name>.*?' +
1194                      '<dateVideo>(.*?)</dateVideo>.*?' +
1195                      '<url quality="hd">(.*?)</url>',
1196                     re.DOTALL,
1197                     [
1198                         (1, 'id',    u'ERROR: could not extract video id: %s' % url),
1199                         (2, 'title', u'ERROR: could not extract video title: %s' % url),
1200                         (3, 'date',  u'ERROR: could not extract video date: %s' % url),
1201                         (4, 'url',   u'ERROR: could not extract video url: %s' % url)
1202                     ]
1203                 )
1204
1205                 return {
1206                     'id':           info.get('id'),
1207                     'url':          urllib.unquote(info.get('url')),
1208                     'uploader':     u'arte.tv',
1209                     'upload_date':  info.get('date'),
1210                     'title':        info.get('title'),
1211                     'ext':          u'mp4',
1212                     'format':       u'NA',
1213                     'player_url':   None,
1214                 }
1215
1216         def _real_extract(self, url):
1217
1218                 video_id = url.split('/')[-1]
1219
1220                 self.report_extraction(video_id)
1221
1222                 if re.search(self._LIVE_URL, video_id) is not None:
1223                     self.extractLiveStream(url)
1224                     return
1225                 else:
1226                     info = self.extractPlus7Stream(url)
1227
1228                 try:
1229                         # Process video information
1230                         self._downloader.process_info(info)
1231                 except UnavailableVideoError, err:
1232                         self._downloader.trouble(u'\nERROR: unable to download video')
1233
1234
1235 class GenericIE(InfoExtractor):
1236         """Generic last-resort information extractor."""
1237
1238         _VALID_URL = r'.*'
1239         IE_NAME = u'generic'
1240
1241         def __init__(self, downloader=None):
1242                 InfoExtractor.__init__(self, downloader)
1243
1244         def report_download_webpage(self, video_id):
1245                 """Report webpage download."""
1246                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1247                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1248
1249         def report_extraction(self, video_id):
1250                 """Report information extraction."""
1251                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1252
1253         def report_following_redirect(self, new_url):
1254                 """Report information extraction."""
1255                 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1256                 
1257         def _test_redirect(self, url):
1258                 """Check if it is a redirect, like url shorteners, in case restart chain."""
1259                 class HeadRequest(urllib2.Request):
1260                         def get_method(self):
1261                                 return "HEAD"
1262
1263                 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1264                         """
1265                         Subclass the HTTPRedirectHandler to make it use our 
1266                         HeadRequest also on the redirected URL
1267                         """
1268                         def redirect_request(self, req, fp, code, msg, headers, newurl): 
1269                                 if code in (301, 302, 303, 307):
1270                                         newurl = newurl.replace(' ', '%20') 
1271                                         newheaders = dict((k,v) for k,v in req.headers.items()
1272                                                                           if k.lower() not in ("content-length", "content-type"))
1273                                         return HeadRequest(newurl, 
1274                                                                            headers=newheaders,
1275                                                                            origin_req_host=req.get_origin_req_host(), 
1276                                                                            unverifiable=True) 
1277                                 else: 
1278                                         raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp) 
1279
1280                 class HTTPMethodFallback(urllib2.BaseHandler):
1281                         """
1282                         Fallback to GET if HEAD is not allowed (405 HTTP error)
1283                         """
1284                         def http_error_405(self, req, fp, code, msg, headers): 
1285                                 fp.read()
1286                                 fp.close()
1287
1288                                 newheaders = dict((k,v) for k,v in req.headers.items()
1289                                                                   if k.lower() not in ("content-length", "content-type"))
1290                                 return self.parent.open(urllib2.Request(req.get_full_url(), 
1291                                                                                                  headers=newheaders, 
1292                                                                                                  origin_req_host=req.get_origin_req_host(), 
1293                                                                                                  unverifiable=True))
1294
1295                 # Build our opener
1296                 opener = urllib2.OpenerDirector() 
1297                 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1298                                                 HTTPMethodFallback, HEADRedirectHandler,
1299                                                 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1300                         opener.add_handler(handler())
1301
1302                 response = opener.open(HeadRequest(url))
1303                 new_url = response.geturl()
1304                 
1305                 if url == new_url: return False
1306                 
1307                 self.report_following_redirect(new_url)
1308                 self._downloader.download([new_url])
1309                 return True
1310
1311         def _real_extract(self, url):
1312                 if self._test_redirect(url): return
1313
1314                 video_id = url.split('/')[-1]
1315                 request = urllib2.Request(url)
1316                 try:
1317                         self.report_download_webpage(video_id)
1318                         webpage = urllib2.urlopen(request).read()
1319                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1320                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1321                         return
1322                 except ValueError, err:
1323                         # since this is the last-resort InfoExtractor, if
1324                         # this error is thrown, it'll be thrown here
1325                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1326                         return
1327
1328                 self.report_extraction(video_id)
1329                 # Start with something easy: JW Player in SWFObject
1330                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1331                 if mobj is None:
1332                         # Broaden the search a little bit
1333                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1334                 if mobj is None:
1335                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1336                         return
1337
1338                 # It's possible that one of the regexes
1339                 # matched, but returned an empty group:
1340                 if mobj.group(1) is None:
1341                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1342                         return
1343
1344                 video_url = urllib.unquote(mobj.group(1))
1345                 video_id = os.path.basename(video_url)
1346
1347                 # here's a fun little line of code for you:
1348                 video_extension = os.path.splitext(video_id)[1][1:]
1349                 video_id = os.path.splitext(video_id)[0]
1350
1351                 # it's tempting to parse this further, but you would
1352                 # have to take into account all the variations like
1353                 #   Video Title - Site Name
1354                 #   Site Name | Video Title
1355                 #   Video Title - Tagline | Site Name
1356                 # and so on and so forth; it's just not practical
1357                 mobj = re.search(r'<title>(.*)</title>', webpage)
1358                 if mobj is None:
1359                         self._downloader.trouble(u'ERROR: unable to extract title')
1360                         return
1361                 video_title = mobj.group(1).decode('utf-8')
1362
1363                 # video uploader is domain name
1364                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1365                 if mobj is None:
1366                         self._downloader.trouble(u'ERROR: unable to extract title')
1367                         return
1368                 video_uploader = mobj.group(1).decode('utf-8')
1369
1370                 return [{
1371                         'id':           video_id.decode('utf-8'),
1372                         'url':          video_url.decode('utf-8'),
1373                         'uploader':     video_uploader,
1374                         'upload_date':  u'NA',
1375                         'title':        video_title,
1376                         'ext':          video_extension.decode('utf-8'),
1377                         'format':       u'NA',
1378                         'player_url':   None,
1379                 }]
1380
1381
class YoutubeSearchIE(InfoExtractor):
        """Information Extractor for YouTube search queries."""
        # Accepts "ytsearch:QUERY", "ytsearchN:QUERY" or "ytsearchall:QUERY".
        _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
        # GData v2 API endpoint, JSON output, 50 results per request.
        _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
        _max_youtube_results = 1000
        IE_NAME = u'youtube:search'

        def __init__(self, downloader=None):
                InfoExtractor.__init__(self, downloader)

        def report_download_page(self, query, pagenum):
                """Report attempt to download search page with given number."""
                query = query.decode(preferredencoding())
                self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

        def _real_extract(self, query):
                """Parse the ytsearch prefix and download the requested number of results."""
                mobj = re.match(self._VALID_URL, query)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
                        return

                # Split "ytsearchN:QUERY" into the count part and the query.
                prefix, query = query.split(':')
                # Strip the literal "ytsearch" (8 characters), leaving "", "all" or digits.
                prefix = prefix[8:]
                query = query.encode('utf-8')
                if prefix == '':
                        # Bare "ytsearch:" downloads only the best match.
                        self._download_n_results(query, 1)
                        return
                elif prefix == 'all':
                        self._download_n_results(query, self._max_youtube_results)
                        return
                else:
                        try:
                                n = long(prefix)
                                if n <= 0:
                                        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                                        return
                                elif n > self._max_youtube_results:
                                        # Clamp to the maximum the API supports and warn.
                                        self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                                        n = self._max_youtube_results
                                self._download_n_results(query, n)
                                return
                        except ValueError: # parsing prefix as integer fails
                                self._download_n_results(query, 1)
                                return

        def _download_n_results(self, query, n):
                """Downloads a specified number of results for a query"""

                video_ids = []
                pagenum = 0
                limit = n

                # Page through the API 50 results at a time until we have n
                # ids or the API reports there are no more items.
                while (50 * pagenum) < limit:
                        self.report_download_page(query, pagenum+1)
                        result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
                        request = urllib2.Request(result_url)
                        try:
                                data = urllib2.urlopen(request).read()
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
                                return
                        # NOTE(review): `json` is not imported at the top of this
                        # file; presumably it arrives via `from utils import *` —
                        # verify.
                        api_response = json.loads(data)['data']

                        new_ids = list(video['id'] for video in api_response['items'])
                        video_ids += new_ids

                        # Never page past what the API says exists.
                        limit = min(n, api_response['totalItems'])
                        pagenum += 1

                # The last page may have pushed us past n; trim the excess.
                if len(video_ids) > n:
                        video_ids = video_ids[:n]
                for id in video_ids:
                        self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
                return
1456
1457
1458 class GoogleSearchIE(InfoExtractor):
1459         """Information Extractor for Google Video search queries."""
1460         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1461         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1462         _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1463         _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1464         _max_google_results = 1000
1465         IE_NAME = u'video.google:search'
1466
1467         def __init__(self, downloader=None):
1468                 InfoExtractor.__init__(self, downloader)
1469
1470         def report_download_page(self, query, pagenum):
1471                 """Report attempt to download playlist page with given number."""
1472                 query = query.decode(preferredencoding())
1473                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1474
1475         def _real_extract(self, query):
1476                 mobj = re.match(self._VALID_URL, query)
1477                 if mobj is None:
1478                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1479                         return
1480
1481                 prefix, query = query.split(':')
1482                 prefix = prefix[8:]
1483                 query = query.encode('utf-8')
1484                 if prefix == '':
1485                         self._download_n_results(query, 1)
1486                         return
1487                 elif prefix == 'all':
1488                         self._download_n_results(query, self._max_google_results)
1489                         return
1490                 else:
1491                         try:
1492                                 n = long(prefix)
1493                                 if n <= 0:
1494                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1495                                         return
1496                                 elif n > self._max_google_results:
1497                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1498                                         n = self._max_google_results
1499                                 self._download_n_results(query, n)
1500                                 return
1501                         except ValueError: # parsing prefix as integer fails
1502                                 self._download_n_results(query, 1)
1503                                 return
1504
1505         def _download_n_results(self, query, n):
1506                 """Downloads a specified number of results for a query"""
1507
1508                 video_ids = []
1509                 pagenum = 0
1510
1511                 while True:
1512                         self.report_download_page(query, pagenum)
1513                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1514                         request = urllib2.Request(result_url)
1515                         try:
1516                                 page = urllib2.urlopen(request).read()
1517                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1518                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1519                                 return
1520
1521                         # Extract video identifiers
1522                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1523                                 video_id = mobj.group(1)
1524                                 if video_id not in video_ids:
1525                                         video_ids.append(video_id)
1526                                         if len(video_ids) == n:
1527                                                 # Specified n videos reached
1528                                                 for id in video_ids:
1529                                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1530                                                 return
1531
1532                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1533                                 for id in video_ids:
1534                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1535                                 return
1536
1537                         pagenum = pagenum + 1
1538
1539
class YahooSearchIE(InfoExtractor):
        """Information Extractor for Yahoo! Video search queries."""
        # Accepts "yvsearch:QUERY", "yvsearchN:QUERY" or "yvsearchall:QUERY".
        _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
        # %s placeholders: quoted query, page number (1-based).
        _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
        _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
        _MORE_PAGES_INDICATOR = r'\s*Next'
        _max_yahoo_results = 1000
        IE_NAME = u'video.yahoo:search'

        def __init__(self, downloader=None):
                InfoExtractor.__init__(self, downloader)

        def report_download_page(self, query, pagenum):
                """Report attempt to download playlist page with given number."""
                query = query.decode(preferredencoding())
                self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

        def _real_extract(self, query):
                """Parse the yvsearch prefix and download the requested number of results."""
                mobj = re.match(self._VALID_URL, query)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
                        return

                # Split "yvsearchN:QUERY"; the count prefix follows the
                # 8-character literal "yvsearch".
                prefix, query = query.split(':')
                prefix = prefix[8:]
                query = query.encode('utf-8')
                if prefix == '':
                        # Bare "yvsearch:" downloads only the best match.
                        self._download_n_results(query, 1)
                        return
                elif prefix == 'all':
                        self._download_n_results(query, self._max_yahoo_results)
                        return
                else:
                        try:
                                n = long(prefix)
                                if n <= 0:
                                        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                                        return
                                elif n > self._max_yahoo_results:
                                        # Clamp to the supported maximum and warn.
                                        self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                                        n = self._max_yahoo_results
                                self._download_n_results(query, n)
                                return
                        except ValueError: # parsing prefix as integer fails
                                self._download_n_results(query, 1)
                                return

        def _download_n_results(self, query, n):
                """Downloads a specified number of results for a query"""

                video_ids = []
                # Track duplicates in a set; video_ids preserves order.
                already_seen = set()
                pagenum = 1

                while True:
                        self.report_download_page(query, pagenum)
                        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
                        request = urllib2.Request(result_url)
                        try:
                                page = urllib2.urlopen(request).read()
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                                return

                        # Extract video identifiers
                        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                                video_id = mobj.group(1)
                                if video_id not in already_seen:
                                        video_ids.append(video_id)
                                        already_seen.add(video_id)
                                        if len(video_ids) == n:
                                                # Specified n videos reached
                                                for id in video_ids:
                                                        self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                                                return

                        # No "Next" link: queue what was found and stop paging.
                        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                                for id in video_ids:
                                        self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                                return

                        pagenum = pagenum + 1
1622
1623
class YoutubePlaylistIE(InfoExtractor):
        """Information Extractor for YouTube playlists."""

        # Group 1: list-type query parameter (p/a/list), group 2: the playlist
        # id, optional group 3: a single video id within the playlist URL.
        _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
        # %s placeholders: access page, prefix parameter name, playlist id, page number.
        _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
        _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;list=(PL)?%s&'
        _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
        IE_NAME = u'youtube:playlist'

        def __init__(self, downloader=None):
                InfoExtractor.__init__(self, downloader)

        def report_download_page(self, playlist_id, pagenum):
                """Report attempt to download playlist page with given number."""
                self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

        def _real_extract(self, url):
                """Collect all video ids in the playlist and queue each for download."""
                # Extract playlist id
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid url: %s' % url)
                        return

                # Single video case
                if mobj.group(3) is not None:
                        self._downloader.download([mobj.group(3)])
                        return

                # Download playlist pages
                # prefix is 'p' as default for playlists but there are other types that need extra care
                playlist_prefix = mobj.group(1)
                if playlist_prefix == 'a':
                        # Artist pages keep the 'a' parameter name.
                        playlist_access = 'artist'
                else:
                        playlist_prefix = 'p'
                        playlist_access = 'view_play_list'
                playlist_id = mobj.group(2)
                video_ids = []
                pagenum = 1

                # Page through until no "next" pager link is present.
                while True:
                        self.report_download_page(playlist_id, pagenum)
                        url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
                        request = urllib2.Request(url)
                        try:
                                page = urllib2.urlopen(request).read()
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                                return

                        # Extract video identifiers
                        ids_in_page = []
                        for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                                if mobj.group(1) not in ids_in_page:
                                        ids_in_page.append(mobj.group(1))
                        video_ids.extend(ids_in_page)

                        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                                break
                        pagenum = pagenum + 1

                # Apply the user's --playlist-start/--playlist-end window
                # (playliststart is 1-based; -1 means "until the end").
                playliststart = self._downloader.params.get('playliststart', 1) - 1
                playlistend = self._downloader.params.get('playlistend', -1)
                if playlistend == -1:
                        video_ids = video_ids[playliststart:]
                else:
                        video_ids = video_ids[playliststart:playlistend]

                for id in video_ids:
                        self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
                return
1695
1696
class YoutubeUserIE(InfoExtractor):
        """Information Extractor for YouTube users."""

        # Accepts user-page URLs and the "ytuser:NAME" shorthand.
        _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
        _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
        # Maximum results the GData API returns per request.
        _GDATA_PAGE_SIZE = 50
        _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
        _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
        IE_NAME = u'youtube:user'

        def __init__(self, downloader=None):
                InfoExtractor.__init__(self, downloader)

        def report_download_page(self, username, start_index):
                """Report attempt to download user page."""
                self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

        def _real_extract(self, url):
                """Collect all upload ids for the user and queue each for download."""
                # Extract username
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid url: %s' % url)
                        return

                username = mobj.group(1)

                # Download video ids using YouTube Data API. Result size per
                # query is limited (currently to 50 videos) so we need to query
                # page by page until there are no video ids - it means we got
                # all of them.

                video_ids = []
                pagenum = 0

                while True:
                        # GData start-index is 1-based.
                        start_index = pagenum * self._GDATA_PAGE_SIZE + 1
                        self.report_download_page(username, start_index)

                        request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

                        try:
                                page = urllib2.urlopen(request).read()
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                                return

                        # Extract video identifiers
                        ids_in_page = []

                        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                                if mobj.group(1) not in ids_in_page:
                                        ids_in_page.append(mobj.group(1))

                        video_ids.extend(ids_in_page)

                        # A little optimization - if current page is not
                        # "full", ie. does not contain PAGE_SIZE video ids then
                        # we can assume that this page is the last one - there
                        # are no more ids on further pages - no need to query
                        # again.

                        if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                                break

                        pagenum += 1

                # Apply the user's --playlist-start/--playlist-end window
                # (playliststart is 1-based; -1 means "until the end").
                all_ids_count = len(video_ids)
                playliststart = self._downloader.params.get('playliststart', 1) - 1
                playlistend = self._downloader.params.get('playlistend', -1)

                if playlistend == -1:
                        video_ids = video_ids[playliststart:]
                else:
                        video_ids = video_ids[playliststart:playlistend]

                self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                                (username, all_ids_count, len(video_ids)))

                for video_id in video_ids:
                        self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1778
1779
1780 class BlipTVUserIE(InfoExtractor):
1781         """Information Extractor for blip.tv users."""
1782
1783         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1784         _PAGE_SIZE = 12
1785         IE_NAME = u'blip.tv:user'
1786
1787         def __init__(self, downloader=None):
1788                 InfoExtractor.__init__(self, downloader)
1789
1790         def report_download_page(self, username, pagenum):
1791                 """Report attempt to download user page."""
1792                 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1793                                 (self.IE_NAME, username, pagenum))
1794
1795         def _real_extract(self, url):
1796                 # Extract username
1797                 mobj = re.match(self._VALID_URL, url)
1798                 if mobj is None:
1799                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1800                         return
1801
1802                 username = mobj.group(1)
1803
1804                 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1805
1806                 request = urllib2.Request(url)
1807
1808                 try:
1809                         page = urllib2.urlopen(request).read().decode('utf-8')
1810                         mobj = re.search(r'data-users-id="([^"]+)"', page)
1811                         page_base = page_base % mobj.group(1)
1812                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1813                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1814                         return
1815
1816
1817                 # Download video ids using BlipTV Ajax calls. Result size per
1818                 # query is limited (currently to 12 videos) so we need to query
1819                 # page by page until there are no video ids - it means we got
1820                 # all of them.
1821
1822                 video_ids = []
1823                 pagenum = 1
1824
1825                 while True:
1826                         self.report_download_page(username, pagenum)
1827
1828                         request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1829
1830                         try:
1831                                 page = urllib2.urlopen(request).read().decode('utf-8')
1832                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1833                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1834                                 return
1835
1836                         # Extract video identifiers
1837                         ids_in_page = []
1838
1839                         for mobj in re.finditer(r'href="/([^"]+)"', page):
1840                                 if mobj.group(1) not in ids_in_page:
1841                                         ids_in_page.append(unescapeHTML(mobj.group(1)))
1842
1843                         video_ids.extend(ids_in_page)
1844
1845                         # A little optimization - if current page is not
1846                         # "full", ie. does not contain PAGE_SIZE video ids then
1847                         # we can assume that this page is the last one - there
1848                         # are no more ids on further pages - no need to query
1849                         # again.
1850
1851                         if len(ids_in_page) < self._PAGE_SIZE:
1852                                 break
1853
1854                         pagenum += 1
1855
1856                 all_ids_count = len(video_ids)
1857                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1858                 playlistend = self._downloader.params.get('playlistend', -1)
1859
1860                 if playlistend == -1:
1861                         video_ids = video_ids[playliststart:]
1862                 else:
1863                         video_ids = video_ids[playliststart:playlistend]
1864
1865                 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1866                                 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1867
1868                 for video_id in video_ids:
1869                         self._downloader.download([u'http://blip.tv/'+video_id])
1870
1871
1872 class DepositFilesIE(InfoExtractor):
1873         """Information extractor for depositfiles.com"""
1874
1875         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1876         IE_NAME = u'DepositFiles'
1877
1878         def __init__(self, downloader=None):
1879                 InfoExtractor.__init__(self, downloader)
1880
1881         def report_download_webpage(self, file_id):
1882                 """Report webpage download."""
1883                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1884
1885         def report_extraction(self, file_id):
1886                 """Report information extraction."""
1887                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1888
1889         def _real_extract(self, url):
1890                 file_id = url.split('/')[-1]
1891                 # Rebuild url in english locale
1892                 url = 'http://depositfiles.com/en/files/' + file_id
1893
1894                 # Retrieve file webpage with 'Free download' button pressed
1895                 free_download_indication = { 'gateway_result' : '1' }
1896                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1897                 try:
1898                         self.report_download_webpage(file_id)
1899                         webpage = urllib2.urlopen(request).read()
1900                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1901                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1902                         return
1903
1904                 # Search for the real file URL
1905                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1906                 if (mobj is None) or (mobj.group(1) is None):
1907                         # Try to figure out reason of the error.
1908                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1909                         if (mobj is not None) and (mobj.group(1) is not None):
1910                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1911                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1912                         else:
1913                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1914                         return
1915
1916                 file_url = mobj.group(1)
1917                 file_extension = os.path.splitext(file_url)[1][1:]
1918
1919                 # Search for file title
1920                 mobj = re.search(r'<b title="(.*?)">', webpage)
1921                 if mobj is None:
1922                         self._downloader.trouble(u'ERROR: unable to extract title')
1923                         return
1924                 file_title = mobj.group(1).decode('utf-8')
1925
1926                 return [{
1927                         'id':           file_id.decode('utf-8'),
1928                         'url':          file_url.decode('utf-8'),
1929                         'uploader':     u'NA',
1930                         'upload_date':  u'NA',
1931                         'title':        file_title,
1932                         'ext':          file_extension.decode('utf-8'),
1933                         'format':       u'NA',
1934                         'player_url':   None,
1935                 }]
1936
1937
1938 class FacebookIE(InfoExtractor):
1939         """Information Extractor for Facebook"""
1940
1941         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1942         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1943         _NETRC_MACHINE = 'facebook'
1944         _available_formats = ['video', 'highqual', 'lowqual']
1945         _video_extensions = {
1946                 'video': 'mp4',
1947                 'highqual': 'mp4',
1948                 'lowqual': 'mp4',
1949         }
1950         IE_NAME = u'facebook'
1951
1952         def __init__(self, downloader=None):
1953                 InfoExtractor.__init__(self, downloader)
1954
1955         def _reporter(self, message):
1956                 """Add header and report message."""
1957                 self._downloader.to_screen(u'[facebook] %s' % message)
1958
1959         def report_login(self):
1960                 """Report attempt to log in."""
1961                 self._reporter(u'Logging in')
1962
1963         def report_video_webpage_download(self, video_id):
1964                 """Report attempt to download video webpage."""
1965                 self._reporter(u'%s: Downloading video webpage' % video_id)
1966
1967         def report_information_extraction(self, video_id):
1968                 """Report attempt to extract video information."""
1969                 self._reporter(u'%s: Extracting video information' % video_id)
1970
1971         def _parse_page(self, video_webpage):
1972                 """Extract video information from page"""
1973                 # General data
1974                 data = {'title': r'\("video_title", "(.*?)"\)',
1975                         'description': r'<div class="datawrap">(.*?)</div>',
1976                         'owner': r'\("video_owner_name", "(.*?)"\)',
1977                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1978                         }
1979                 video_info = {}
1980                 for piece in data.keys():
1981                         mobj = re.search(data[piece], video_webpage)
1982                         if mobj is not None:
1983                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1984
1985                 # Video urls
1986                 video_urls = {}
1987                 for fmt in self._available_formats:
1988                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1989                         if mobj is not None:
1990                                 # URL is in a Javascript segment inside an escaped Unicode format within
1991                                 # the generally utf-8 page
1992                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1993                 video_info['video_urls'] = video_urls
1994
1995                 return video_info
1996
1997         def _real_initialize(self):
1998                 if self._downloader is None:
1999                         return
2000
2001                 useremail = None
2002                 password = None
2003                 downloader_params = self._downloader.params
2004
2005                 # Attempt to use provided username and password or .netrc data
2006                 if downloader_params.get('username', None) is not None:
2007                         useremail = downloader_params['username']
2008                         password = downloader_params['password']
2009                 elif downloader_params.get('usenetrc', False):
2010                         try:
2011                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2012                                 if info is not None:
2013                                         useremail = info[0]
2014                                         password = info[2]
2015                                 else:
2016                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2017                         except (IOError, netrc.NetrcParseError), err:
2018                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2019                                 return
2020
2021                 if useremail is None:
2022                         return
2023
2024                 # Log in
2025                 login_form = {
2026                         'email': useremail,
2027                         'pass': password,
2028                         'login': 'Log+In'
2029                         }
2030                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2031                 try:
2032                         self.report_login()
2033                         login_results = urllib2.urlopen(request).read()
2034                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2035                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2036                                 return
2037                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2038                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2039                         return
2040
2041         def _real_extract(self, url):
2042                 mobj = re.match(self._VALID_URL, url)
2043                 if mobj is None:
2044                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2045                         return
2046                 video_id = mobj.group('ID')
2047
2048                 # Get video webpage
2049                 self.report_video_webpage_download(video_id)
2050                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2051                 try:
2052                         page = urllib2.urlopen(request)
2053                         video_webpage = page.read()
2054                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2055                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2056                         return
2057
2058                 # Start extracting information
2059                 self.report_information_extraction(video_id)
2060
2061                 # Extract information
2062                 video_info = self._parse_page(video_webpage)
2063
2064                 # uploader
2065                 if 'owner' not in video_info:
2066                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2067                         return
2068                 video_uploader = video_info['owner']
2069
2070                 # title
2071                 if 'title' not in video_info:
2072                         self._downloader.trouble(u'ERROR: unable to extract video title')
2073                         return
2074                 video_title = video_info['title']
2075                 video_title = video_title.decode('utf-8')
2076
2077                 # thumbnail image
2078                 if 'thumbnail' not in video_info:
2079                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2080                         video_thumbnail = ''
2081                 else:
2082                         video_thumbnail = video_info['thumbnail']
2083
2084                 # upload date
2085                 upload_date = u'NA'
2086                 if 'upload_date' in video_info:
2087                         upload_time = video_info['upload_date']
2088                         timetuple = email.utils.parsedate_tz(upload_time)
2089                         if timetuple is not None:
2090                                 try:
2091                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2092                                 except:
2093                                         pass
2094
2095                 # description
2096                 video_description = video_info.get('description', 'No description available.')
2097
2098                 url_map = video_info['video_urls']
2099                 if len(url_map.keys()) > 0:
2100                         # Decide which formats to download
2101                         req_format = self._downloader.params.get('format', None)
2102                         format_limit = self._downloader.params.get('format_limit', None)
2103
2104                         if format_limit is not None and format_limit in self._available_formats:
2105                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2106                         else:
2107                                 format_list = self._available_formats
2108                         existing_formats = [x for x in format_list if x in url_map]
2109                         if len(existing_formats) == 0:
2110                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2111                                 return
2112                         if req_format is None:
2113                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2114                         elif req_format == 'worst':
2115                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2116                         elif req_format == '-1':
2117                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2118                         else:
2119                                 # Specific format
2120                                 if req_format not in url_map:
2121                                         self._downloader.trouble(u'ERROR: requested format not available')
2122                                         return
2123                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2124
2125                 results = []
2126                 for format_param, video_real_url in video_url_list:
2127                         # Extension
2128                         video_extension = self._video_extensions.get(format_param, 'mp4')
2129
2130                         results.append({
2131                                 'id':           video_id.decode('utf-8'),
2132                                 'url':          video_real_url.decode('utf-8'),
2133                                 'uploader':     video_uploader.decode('utf-8'),
2134                                 'upload_date':  upload_date,
2135                                 'title':        video_title,
2136                                 'ext':          video_extension.decode('utf-8'),
2137                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2138                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2139                                 'description':  video_description.decode('utf-8'),
2140                                 'player_url':   None,
2141                         })
2142                 return results
2143
class BlipTVIE(InfoExtractor):
	"""Information extractor for blip.tv"""

	# Any path on a blip.tv host; group(1) captures the path.
	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
	# Captures a lowercase alphanumeric filename extension from a media URL.
	_URL_EXT = r'^.*\.([a-z0-9]+)$'
	IE_NAME = u'blip.tv'

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

	def report_direct_download(self, title):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

	def _real_extract(self, url):
		"""Return a one-element list with the video info dictionary.

		Requests the page with skin=json appended; if the server responds
		with the media itself (Content-Type video/*) treat it as a direct
		download, otherwise parse the JSON metadata.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Append the JSON-skin parameters with the right separator.
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url.encode('utf-8'))
		self.report_extraction(mobj.group(1))
		info = None
		try:
			urlh = urllib2.urlopen(request)
			if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
				# Derive id/title/extension from the URL's basename.
				basename = url.split('/')[-1]
				title,ext = os.path.splitext(basename)
				title = title.decode('UTF-8')
				ext = ext.replace('.', '')
				self.report_direct_download(title)
				info = {
					'id': title,
					'url': url,
					'title': title,
					'ext': ext,
					# Pass the already-open handle on so the response
					# is not requested twice.
					'urlhandle': urlh
				}
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
			return
		if info is None: # Regular URL
			try:
				# Reuse the handle opened above to read the JSON body.
				json_code = urlh.read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
				return

			try:
				# NOTE(review): relies on a json module being in scope,
				# presumably via `from utils import *` — confirm.
				json_data = json.loads(json_code)
				if 'Post' in json_data:
					data = json_data['Post']
				else:
					data = json_data

				# e.g. '05-31-12 08:00AM' -> '20120531'
				upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
				video_url = data['media']['url']
				umobj = re.match(self._URL_EXT, video_url)
				if umobj is None:
					raise ValueError('Can not determine filename extension')
				ext = umobj.group(1)

				info = {
					'id': data['item_id'],
					'url': video_url,
					'uploader': data['display_name'],
					'upload_date': upload_date,
					'title': data['title'],
					'ext': ext,
					'format': data['media']['mimeType'],
					'thumbnail': data['thumbnailUrl'],
					'description': data['description'],
					'player_url': data['embedUrl']
				}
			except (ValueError,KeyError), err:
				self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
				return

		# Spoof the UA for the subsequent media download — presumably
		# blip.tv serves media differently to unknown agents; confirm.
		# NOTE(review): mutates the module-global std_headers for all
		# later requests, not just this one.
		std_headers['User-Agent'] = 'iTunes/10.6.1'
		return [info]
2230
2231
2232 class MyVideoIE(InfoExtractor):
2233         """Information Extractor for myvideo.de."""
2234
2235         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2236         IE_NAME = u'myvideo'
2237
2238         def __init__(self, downloader=None):
2239                 InfoExtractor.__init__(self, downloader)
2240         
2241         def report_download_webpage(self, video_id):
2242                 """Report webpage download."""
2243                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2244
2245         def report_extraction(self, video_id):
2246                 """Report information extraction."""
2247                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2248
2249         def _real_extract(self,url):
2250                 mobj = re.match(self._VALID_URL, url)
2251                 if mobj is None:
2252                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2253                         return
2254
2255                 video_id = mobj.group(1)
2256
2257                 # Get video webpage
2258                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2259                 try:
2260                         self.report_download_webpage(video_id)
2261                         webpage = urllib2.urlopen(request).read()
2262                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2263                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2264                         return
2265
2266                 self.report_extraction(video_id)
2267                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2268                                  webpage)
2269                 if mobj is None:
2270                         self._downloader.trouble(u'ERROR: unable to extract media URL')
2271                         return
2272                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2273
2274                 mobj = re.search('<title>([^<]+)</title>', webpage)
2275                 if mobj is None:
2276                         self._downloader.trouble(u'ERROR: unable to extract title')
2277                         return
2278
2279                 video_title = mobj.group(1)
2280
2281                 return [{
2282                         'id':           video_id,
2283                         'url':          video_url,
2284                         'uploader':     u'NA',
2285                         'upload_date':  u'NA',
2286                         'title':        video_title,
2287                         'ext':          u'flv',
2288                         'format':       u'NA',
2289                         'player_url':   None,
2290                 }]
2291
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Accepts either a ":tds"/":colbert"-style shortcut or a full-episodes URL.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report download of the per-media configuration XML."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report download of the show index (MRSS feed)."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report resolution of the SWF player URL."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _real_extract(self, url):
		"""Return a list of info dictionaries, one per media item of the episode."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Expand shortcut forms (":tds", ":colbert", ...) to the show's
		# full-episodes page and re-match so the named groups are set.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = u'http://www.thedailyshow.com/full-episodes/'
			else:
				url = u'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# No episode in the URL means "download the newest episode":
		# the site redirects the bare full-episodes URL to it.
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			# Keep the handle: geturl() below reveals the redirect target.
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# Derive the concrete episode from the redirected URL.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# Find the Flash player URL and the mgid-style media URI in the page.
		mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			# Follow redirects to get the canonical player URL for rtmpdump.
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		results = []

		# The MRSS index lists one <item> per media segment of the episode.
		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			# Each item has its own config XML listing renditions (bitrate/src).
			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			# NOTE(review): this takes the last listed rendition —
			# presumably the feed lists renditions lowest-first; confirm.
			format,video_url = turls[-1]

			effTitle = showId + u'-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			results.append(info)

		return results
2423
2424
2425 class EscapistIE(InfoExtractor):
2426         """Information extractor for The Escapist """
2427
2428         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2429         IE_NAME = u'escapist'
2430
2431         def report_extraction(self, showName):
2432                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2433
2434         def report_config_download(self, showName):
2435                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2436
2437         def _real_extract(self, url):
2438                 mobj = re.match(self._VALID_URL, url)
2439                 if mobj is None:
2440                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2441                         return
2442                 showName = mobj.group('showname')
2443                 videoId = mobj.group('episode')
2444
2445                 self.report_extraction(showName)
2446                 try:
2447                         webPage = urllib2.urlopen(url)
2448                         webPageBytes = webPage.read()
2449                         m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2450                         webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2451                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2452                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2453                         return
2454
2455                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2456                 description = unescapeHTML(descMatch.group(1))
2457                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2458                 imgUrl = unescapeHTML(imgMatch.group(1))
2459                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2460                 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2461                 configUrlMatch = re.search('config=(.*)$', playerUrl)
2462                 configUrl = urllib2.unquote(configUrlMatch.group(1))
2463
2464                 self.report_config_download(showName)
2465                 try:
2466                         configJSON = urllib2.urlopen(configUrl).read()
2467                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2468                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2469                         return
2470
2471                 # Technically, it's JavaScript, not JSON
2472                 configJSON = configJSON.replace("'", '"')
2473
2474                 try:
2475                         config = json.loads(configJSON)
2476                 except (ValueError,), err:
2477                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2478                         return
2479
2480                 playlist = config['playlist']
2481                 videoUrl = playlist[1]['url']
2482
2483                 info = {
2484                         'id': videoId,
2485                         'url': videoUrl,
2486                         'uploader': showName,
2487                         'upload_date': None,
2488                         'title': showName,
2489                         'ext': 'flv',
2490                         'format': 'flv',
2491                         'thumbnail': imgUrl,
2492                         'description': description,
2493                         'player_url': playerUrl,
2494                 }
2495
2496                 return [info]
2497
2498
2499 class CollegeHumorIE(InfoExtractor):
2500         """Information extractor for collegehumor.com"""
2501
2502         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2503         IE_NAME = u'collegehumor'
2504
2505         def report_webpage(self, video_id):
2506                 """Report information extraction."""
2507                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2508
2509         def report_extraction(self, video_id):
2510                 """Report information extraction."""
2511                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2512
2513         def _real_extract(self, url):
2514                 mobj = re.match(self._VALID_URL, url)
2515                 if mobj is None:
2516                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2517                         return
2518                 video_id = mobj.group('videoid')
2519
2520                 self.report_webpage(video_id)
2521                 request = urllib2.Request(url)
2522                 try:
2523                         webpage = urllib2.urlopen(request).read()
2524                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2525                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2526                         return
2527
2528                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2529                 if m is None:
2530                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2531                         return
2532                 internal_video_id = m.group('internalvideoid')
2533
2534                 info = {
2535                         'id': video_id,
2536                         'internal_id': internal_video_id,
2537                 }
2538
2539                 self.report_extraction(video_id)
2540                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2541                 try:
2542                         metaXml = urllib2.urlopen(xmlUrl).read()
2543                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2544                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2545                         return
2546
2547                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2548                 try:
2549                         videoNode = mdoc.findall('./video')[0]
2550                         info['description'] = videoNode.findall('./description')[0].text
2551                         info['title'] = videoNode.findall('./caption')[0].text
2552                         info['url'] = videoNode.findall('./file')[0].text
2553                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2554                         info['ext'] = info['url'].rpartition('.')[2]
2555                         info['format'] = info['ext']
2556                 except IndexError:
2557                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2558                         return
2559
2560                 return [info]
2561
2562
2563 class XVideosIE(InfoExtractor):
2564         """Information extractor for xvideos.com"""
2565
2566         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2567         IE_NAME = u'xvideos'
2568
2569         def report_webpage(self, video_id):
2570                 """Report information extraction."""
2571                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2572
2573         def report_extraction(self, video_id):
2574                 """Report information extraction."""
2575                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2576
2577         def _real_extract(self, url):
2578                 mobj = re.match(self._VALID_URL, url)
2579                 if mobj is None:
2580                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2581                         return
2582                 video_id = mobj.group(1).decode('utf-8')
2583
2584                 self.report_webpage(video_id)
2585
2586                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2587                 try:
2588                         webpage = urllib2.urlopen(request).read()
2589                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2590                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2591                         return
2592
2593                 self.report_extraction(video_id)
2594
2595
2596                 # Extract video URL
2597                 mobj = re.search(r'flv_url=(.+?)&', webpage)
2598                 if mobj is None:
2599                         self._downloader.trouble(u'ERROR: unable to extract video url')
2600                         return
2601                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2602
2603
2604                 # Extract title
2605                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2606                 if mobj is None:
2607                         self._downloader.trouble(u'ERROR: unable to extract video title')
2608                         return
2609                 video_title = mobj.group(1).decode('utf-8')
2610
2611
2612                 # Extract video thumbnail
2613                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2614                 if mobj is None:
2615                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2616                         return
2617                 video_thumbnail = mobj.group(0).decode('utf-8')
2618
2619                 info = {
2620                         'id': video_id,
2621                         'url': video_url,
2622                         'uploader': None,
2623                         'upload_date': None,
2624                         'title': video_title,
2625                         'ext': 'flv',
2626                         'format': 'flv',
2627                         'thumbnail': video_thumbnail,
2628                         'description': None,
2629                         'player_url': None,
2630                 }
2631
2632                 return [info]
2633
2634
2635 class SoundcloudIE(InfoExtractor):
2636         """Information extractor for soundcloud.com
2637            To access the media, the uid of the song and a stream token
2638            must be extracted from the page source and the script must make
2639            a request to media.soundcloud.com/crossdomain.xml. Then
2640            the media can be grabbed by requesting from an url composed
2641            of the stream token and uid
2642          """
2643
2644         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2645         IE_NAME = u'soundcloud'
2646
2647         def __init__(self, downloader=None):
2648                 InfoExtractor.__init__(self, downloader)
2649
2650         def report_webpage(self, video_id):
2651                 """Report information extraction."""
2652                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2653
2654         def report_extraction(self, video_id):
2655                 """Report information extraction."""
2656                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2657
2658         def _real_extract(self, url):
2659                 mobj = re.match(self._VALID_URL, url)
2660                 if mobj is None:
2661                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2662                         return
2663
2664                 # extract uploader (which is in the url)
2665                 uploader = mobj.group(1).decode('utf-8')
2666                 # extract simple title (uploader + slug of song title)
2667                 slug_title =  mobj.group(2).decode('utf-8')
2668                 simple_title = uploader + u'-' + slug_title
2669
2670                 self.report_webpage('%s/%s' % (uploader, slug_title))
2671
2672                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2673                 try:
2674                         webpage = urllib2.urlopen(request).read()
2675                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2676                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2677                         return
2678
2679                 self.report_extraction('%s/%s' % (uploader, slug_title))
2680
2681                 # extract uid and stream token that soundcloud hands out for access
2682                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2683                 if mobj:
2684                         video_id = mobj.group(1)
2685                         stream_token = mobj.group(2)
2686
2687                 # extract unsimplified title
2688                 mobj = re.search('"title":"(.*?)",', webpage)
2689                 if mobj:
2690                         title = mobj.group(1).decode('utf-8')
2691                 else:
2692                         title = simple_title
2693
2694                 # construct media url (with uid/token)
2695                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2696                 mediaURL = mediaURL % (video_id, stream_token)
2697
2698                 # description
2699                 description = u'No description available'
2700                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2701                 if mobj:
2702                         description = mobj.group(1)
2703                 
2704                 # upload date
2705                 upload_date = None
2706                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2707                 if mobj:
2708                         try:
2709                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2710                         except Exception, e:
2711                                 self._downloader.to_stderr(str(e))
2712
2713                 # for soundcloud, a request to a cross domain is required for cookies
2714                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2715
2716                 return [{
2717                         'id':           video_id.decode('utf-8'),
2718                         'url':          mediaURL,
2719                         'uploader':     uploader.decode('utf-8'),
2720                         'upload_date':  upload_date,
2721                         'title':        title,
2722                         'ext':          u'mp3',
2723                         'format':       u'NA',
2724                         'player_url':   None,
2725                         'description': description.decode('utf-8')
2726                 }]
2727
2728
2729 class InfoQIE(InfoExtractor):
2730         """Information extractor for infoq.com"""
2731
2732         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2733         IE_NAME = u'infoq'
2734
2735         def report_webpage(self, video_id):
2736                 """Report information extraction."""
2737                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2738
2739         def report_extraction(self, video_id):
2740                 """Report information extraction."""
2741                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2742
2743         def _real_extract(self, url):
2744                 mobj = re.match(self._VALID_URL, url)
2745                 if mobj is None:
2746                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2747                         return
2748
2749                 self.report_webpage(url)
2750
2751                 request = urllib2.Request(url)
2752                 try:
2753                         webpage = urllib2.urlopen(request).read()
2754                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2755                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2756                         return
2757
2758                 self.report_extraction(url)
2759
2760
2761                 # Extract video URL
2762                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2763                 if mobj is None:
2764                         self._downloader.trouble(u'ERROR: unable to extract video url')
2765                         return
2766                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2767
2768
2769                 # Extract title
2770                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2771                 if mobj is None:
2772                         self._downloader.trouble(u'ERROR: unable to extract video title')
2773                         return
2774                 video_title = mobj.group(1).decode('utf-8')
2775
2776                 # Extract description
2777                 video_description = u'No description available.'
2778                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2779                 if mobj is not None:
2780                         video_description = mobj.group(1).decode('utf-8')
2781
2782                 video_filename = video_url.split('/')[-1]
2783                 video_id, extension = video_filename.split('.')
2784
2785                 info = {
2786                         'id': video_id,
2787                         'url': video_url,
2788                         'uploader': None,
2789                         'upload_date': None,
2790                         'title': video_title,
2791                         'ext': extension,
2792                         'format': extension, # Extension is always(?) mp4, but seems to be flv
2793                         'thumbnail': None,
2794                         'description': video_description,
2795                         'player_url': None,
2796                 }
2797
2798                 return [info]
2799
2800 class MixcloudIE(InfoExtractor):
2801         """Information extractor for www.mixcloud.com"""
2802         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2803         IE_NAME = u'mixcloud'
2804
2805         def __init__(self, downloader=None):
2806                 InfoExtractor.__init__(self, downloader)
2807
2808         def report_download_json(self, file_id):
2809                 """Report JSON download."""
2810                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2811
2812         def report_extraction(self, file_id):
2813                 """Report information extraction."""
2814                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2815
2816         def get_urls(self, jsonData, fmt, bitrate='best'):
2817                 """Get urls from 'audio_formats' section in json"""
2818                 file_url = None
2819                 try:
2820                         bitrate_list = jsonData[fmt]
2821                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2822                                 bitrate = max(bitrate_list) # select highest
2823
2824                         url_list = jsonData[fmt][bitrate]
2825                 except TypeError: # we have no bitrate info.
2826                         url_list = jsonData[fmt]
2827                 return url_list
2828
2829         def check_urls(self, url_list):
2830                 """Returns 1st active url from list"""
2831                 for url in url_list:
2832                         try:
2833                                 urllib2.urlopen(url)
2834                                 return url
2835                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2836                                 url = None
2837
2838                 return None
2839
2840         def _print_formats(self, formats):
2841                 print 'Available formats:'
2842                 for fmt in formats.keys():
2843                         for b in formats[fmt]:
2844                                 try:
2845                                         ext = formats[fmt][b][0]
2846                                         print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2847                                 except TypeError: # we have no bitrate info
2848                                         ext = formats[fmt][0]
2849                                         print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2850                                         break
2851
2852         def _real_extract(self, url):
2853                 mobj = re.match(self._VALID_URL, url)
2854                 if mobj is None:
2855                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2856                         return
2857                 # extract uploader & filename from url
2858                 uploader = mobj.group(1).decode('utf-8')
2859                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2860
2861                 # construct API request
2862                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2863                 # retrieve .json file with links to files
2864                 request = urllib2.Request(file_url)
2865                 try:
2866                         self.report_download_json(file_url)
2867                         jsonData = urllib2.urlopen(request).read()
2868                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2869                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2870                         return
2871
2872                 # parse JSON
2873                 json_data = json.loads(jsonData)
2874                 player_url = json_data['player_swf_url']
2875                 formats = dict(json_data['audio_formats'])
2876
2877                 req_format = self._downloader.params.get('format', None)
2878                 bitrate = None
2879
2880                 if self._downloader.params.get('listformats', None):
2881                         self._print_formats(formats)
2882                         return
2883
2884                 if req_format is None or req_format == 'best':
2885                         for format_param in formats.keys():
2886                                 url_list = self.get_urls(formats, format_param)
2887                                 # check urls
2888                                 file_url = self.check_urls(url_list)
2889                                 if file_url is not None:
2890                                         break # got it!
2891                 else:
2892                         if req_format not in formats.keys():
2893                                 self._downloader.trouble(u'ERROR: format is not available')
2894                                 return
2895
2896                         url_list = self.get_urls(formats, req_format)
2897                         file_url = self.check_urls(url_list)
2898                         format_param = req_format
2899
2900                 return [{
2901                         'id': file_id.decode('utf-8'),
2902                         'url': file_url.decode('utf-8'),
2903                         'uploader':     uploader.decode('utf-8'),
2904                         'upload_date': u'NA',
2905                         'title': json_data['name'],
2906                         'ext': file_url.split('.')[-1].decode('utf-8'),
2907                         'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2908                         'thumbnail': json_data['thumbnail_url'],
2909                         'description': json_data['description'],
2910                         'player_url': player_url.decode('utf-8'),
2911                 }]
2912
class StanfordOpenClassroomIE(InfoExtractor):
	"""Information extractor for Stanford's Open ClassRoom"""

	# The URL encodes three levels: root page, a course page (?course=...),
	# or a single video (?course=...&video=...).
	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
	IE_NAME = u'stanfordoc'

	def report_download_webpage(self, objid):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Extract one video, a course playlist, or the whole site index.

		Course and root pages are expanded recursively by re-entering
		self.extract() with a 'reference' entry per linked page, so the
		return value is always a flat list of video info dictionaries.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('course') and mobj.group('video'): # A specific video
			course = mobj.group('course')
			video = mobj.group('video')
			info = {
				'id': course + '_' + video,
			}

			self.report_extraction(info['id'])
			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
			# Per-video metadata lives in an XML file next to the media.
			xmlUrl = baseUrl + video + '.xml'
			try:
				metaXml = urllib2.urlopen(xmlUrl).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
				return
			mdoc = xml.etree.ElementTree.fromstring(metaXml)
			try:
				info['title'] = mdoc.findall('./title')[0].text
				# videoFile is relative; resolve against the course base URL.
				info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
			except IndexError:
				self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
				return
			info['ext'] = info['url'].rpartition('.')[2]
			info['format'] = info['ext']
			return [info]
		elif mobj.group('course'): # A course page
			course = mobj.group('course')
			info = {
				'id': course,
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			try:
				coursepage = urllib2.urlopen(url).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			m = re.search('<h1>([^<]+)</h1>', coursepage)
			if m:
				info['title'] = unescapeHTML(m.group(1))
			else:
				# Fall back to the course id when no heading is found.
				info['title'] = info['id']

			m = re.search('<description>([^<]+)</description>', coursepage)
			if m:
				info['description'] = unescapeHTML(m.group(1))

			# Collect every linked video page, de-duplicated in order.
			links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
				}
					for vpage in links]
			# Recurse into each video page and flatten the results.
			results = []
			for entry in info['list']:
				assert entry['type'] == 'reference'
				results += self.extract(entry['url'])
			return results
			
		else: # Root page
			info = {
				'id': 'Stanford OpenClassroom',
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
			try:
				rootpage = urllib2.urlopen(rootURL).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			info['title'] = info['id']

			# Collect every course page, then recurse into each one;
			# the course branch above expands them into videos.
			links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
				}
					for cpage in links]

			results = []
			for entry in info['list']:
				assert entry['type'] == 'reference'
				results += self.extract(entry['url'])
			return results
3024
3025 class MTVIE(InfoExtractor):
3026         """Information extractor for MTV.com"""
3027
3028         _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3029         IE_NAME = u'mtv'
3030
3031         def report_webpage(self, video_id):
3032                 """Report information extraction."""
3033                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3034
3035         def report_extraction(self, video_id):
3036                 """Report information extraction."""
3037                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3038
3039         def _real_extract(self, url):
3040                 mobj = re.match(self._VALID_URL, url)
3041                 if mobj is None:
3042                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3043                         return
3044                 if not mobj.group('proto'):
3045                         url = 'http://' + url
3046                 video_id = mobj.group('videoid')
3047                 self.report_webpage(video_id)
3048
3049                 request = urllib2.Request(url)
3050                 try:
3051                         webpage = urllib2.urlopen(request).read()
3052                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3053                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3054                         return
3055
3056                 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3057                 if mobj is None:
3058                         self._downloader.trouble(u'ERROR: unable to extract song name')
3059                         return
3060                 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3061                 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3062                 if mobj is None:
3063                         self._downloader.trouble(u'ERROR: unable to extract performer')
3064                         return
3065                 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3066                 video_title = performer + ' - ' + song_name 
3067
3068                 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3069                 if mobj is None:
3070                         self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3071                         return
3072                 mtvn_uri = mobj.group(1)
3073
3074                 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3075                 if mobj is None:
3076                         self._downloader.trouble(u'ERROR: unable to extract content id')
3077                         return
3078                 content_id = mobj.group(1)
3079
3080                 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3081                 self.report_extraction(video_id)
3082                 request = urllib2.Request(videogen_url)
3083                 try:
3084                         metadataXml = urllib2.urlopen(request).read()
3085                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3086                         self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
3087                         return
3088
3089                 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3090                 renditions = mdoc.findall('.//rendition')
3091
3092                 # For now, always pick the highest quality.
3093                 rendition = renditions[-1]
3094
3095                 try:
3096                         _,_,ext = rendition.attrib['type'].partition('/')
3097                         format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3098                         video_url = rendition.find('./src').text
3099                 except KeyError:
3100                         self._downloader.trouble('Invalid rendition field.')
3101                         return
3102
3103                 info = {
3104                         'id': video_id,
3105                         'url': video_url,
3106                         'uploader': performer,
3107                         'title': video_title,
3108                         'ext': ext,
3109                         'format': format,
3110                 }
3111
3112                 return [info]