merged unescapeHTML branch; removed lxml dependency
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import datetime
5 import HTMLParser
6 import httplib
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import urllib
13 import urllib2
14 import email.utils
15
16 try:
17         import cStringIO as StringIO
18 except ImportError:
19         import StringIO
20
21 # parse_qs was moved from the cgi module to the urlparse module recently.
22 try:
23         from urlparse import parse_qs
24 except ImportError:
25         from cgi import parse_qs
26
27 try:
28         import xml.etree.ElementTree
29 except ImportError: # Python<2.5: Not officially supported, but let it slip
30         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
31
32 from utils import *
33
34
35 class InfoExtractor(object):
36         """Information Extractor class.
37
38         Information extractors are the classes that, given a URL, extract
39         information from the video (or videos) the URL refers to. This
40         information includes the real video URL, the video title and simplified
41         title, author and others. The information is stored in a dictionary
42         which is then passed to the FileDownloader. The FileDownloader
43         processes this information possibly downloading the video to the file
44         system, among other possible outcomes. The dictionaries must include
45         the following fields:
46
47         id:             Video identifier.
48         url:            Final video URL.
49         uploader:       Nickname of the video uploader.
50         title:          Literal title.
51         stitle:         Simplified title.
52         ext:            Video filename extension.
53         format:         Video format.
54         player_url:     SWF Player URL (may be None).
55
56         The following fields are optional. Their primary purpose is to allow
57         youtube-dl to serve as the backend for a video search function, such
58         as the one in youtube2mp3.  They are only used when their respective
59         forced printing functions are called:
60
61         thumbnail:      Full URL to a video thumbnail image.
62         description:    One-line video description.
63
64         Subclasses of this one should re-define the _real_initialize() and
65         _real_extract() methods and define a _VALID_URL regexp.
66         Probably, they should also be added to the list of extractors.
67         """
68
69         _ready = False
70         _downloader = None
71
72         def __init__(self, downloader=None):
73                 """Constructor. Receives an optional downloader."""
74                 self._ready = False
75                 self.set_downloader(downloader)
76
77         def suitable(self, url):
78                 """Receives a URL and returns True if suitable for this IE."""
79                 return re.match(self._VALID_URL, url) is not None
80
81         def initialize(self):
82                 """Initializes an instance (authentication, etc)."""
83                 if not self._ready:
84                         self._real_initialize()
85                         self._ready = True
86
87         def extract(self, url):
88                 """Extracts URL information and returns it in list of dicts."""
89                 self.initialize()
90                 return self._real_extract(url)
91
92         def set_downloader(self, downloader):
93                 """Sets the downloader for this IE."""
94                 self._downloader = downloader
95
96         def _real_initialize(self):
97                 """Real initialization process. Redefine in subclasses."""
98                 pass
99
100         def _real_extract(self, url):
101                 """Real extraction process. Redefine in subclasses."""
102                 pass
103
104
105 class YoutubeIE(InfoExtractor):
106         """Information extractor for youtube.com."""
107
108         _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
109         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
110         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
111         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
112         _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
113         _NETRC_MACHINE = 'youtube'
114         # Listed in order of quality
115         _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
116         _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
117         _video_extensions = {
118                 '13': '3gp',
119                 '17': 'mp4',
120                 '18': 'mp4',
121                 '22': 'mp4',
122                 '37': 'mp4',
123                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
124                 '43': 'webm',
125                 '44': 'webm',
126                 '45': 'webm',
127         }
128         _video_dimensions = {
129                 '5': '240x400',
130                 '6': '???',
131                 '13': '???',
132                 '17': '144x176',
133                 '18': '360x640',
134                 '22': '720x1280',
135                 '34': '360x640',
136                 '35': '480x854',
137                 '37': '1080x1920',
138                 '38': '3072x4096',
139                 '43': '360x640',
140                 '44': '480x854',
141                 '45': '720x1280',
142         }       
143         IE_NAME = u'youtube'
144
145         def report_lang(self):
146                 """Report attempt to set language."""
147                 self._downloader.to_screen(u'[youtube] Setting language')
148
149         def report_login(self):
150                 """Report attempt to log in."""
151                 self._downloader.to_screen(u'[youtube] Logging in')
152
153         def report_age_confirmation(self):
154                 """Report attempt to confirm age."""
155                 self._downloader.to_screen(u'[youtube] Confirming age')
156
157         def report_video_webpage_download(self, video_id):
158                 """Report attempt to download video webpage."""
159                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
160
161         def report_video_info_webpage_download(self, video_id):
162                 """Report attempt to download video info webpage."""
163                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
164
165         def report_video_subtitles_download(self, video_id):
166                 """Report attempt to download video info webpage."""
167                 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
168
169         def report_information_extraction(self, video_id):
170                 """Report attempt to extract video information."""
171                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
172
173         def report_unavailable_format(self, video_id, format):
174                 """Report extracted video URL."""
175                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
176
177         def report_rtmp_download(self):
178                 """Indicate the download will use the RTMP protocol."""
179                 self._downloader.to_screen(u'[youtube] RTMP download detected')
180
181         def _closed_captions_xml_to_srt(self, xml_string):
182                 srt = ''
183                 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
184                 # TODO parse xml instead of regex
185                 for n, (start, dur_tag, dur, caption) in enumerate(texts):
186                         if not dur: dur = '4'
187                         start = float(start)
188                         end = start + float(dur)
189                         start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
190                         end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
191                         caption = unescapeHTML(caption)
192                         caption = unescapeHTML(caption) # double cycle, inentional
193                         srt += str(n) + '\n'
194                         srt += start + ' --> ' + end + '\n'
195                         srt += caption + '\n\n'
196                 return srt
197
198         def _print_formats(self, formats):
199                 print 'Available formats:'
200                 for x in formats:
201                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
202
203         def _real_initialize(self):
204                 if self._downloader is None:
205                         return
206
207                 username = None
208                 password = None
209                 downloader_params = self._downloader.params
210
211                 # Attempt to use provided username and password or .netrc data
212                 if downloader_params.get('username', None) is not None:
213                         username = downloader_params['username']
214                         password = downloader_params['password']
215                 elif downloader_params.get('usenetrc', False):
216                         try:
217                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
218                                 if info is not None:
219                                         username = info[0]
220                                         password = info[2]
221                                 else:
222                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
223                         except (IOError, netrc.NetrcParseError), err:
224                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
225                                 return
226
227                 # Set language
228                 request = urllib2.Request(self._LANG_URL)
229                 try:
230                         self.report_lang()
231                         urllib2.urlopen(request).read()
232                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
233                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
234                         return
235
236                 # No authentication to be performed
237                 if username is None:
238                         return
239
240                 # Log in
241                 login_form = {
242                                 'current_form': 'loginForm',
243                                 'next':         '/',
244                                 'action_login': 'Log In',
245                                 'username':     username,
246                                 'password':     password,
247                                 }
248                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
249                 try:
250                         self.report_login()
251                         login_results = urllib2.urlopen(request).read()
252                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
253                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
254                                 return
255                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
256                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
257                         return
258
259                 # Confirm age
260                 age_form = {
261                                 'next_url':             '/',
262                                 'action_confirm':       'Confirm',
263                                 }
264                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
265                 try:
266                         self.report_age_confirmation()
267                         age_results = urllib2.urlopen(request).read()
268                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
269                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
270                         return
271
272         def _real_extract(self, url):
273                 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
274                 mobj = re.search(self._NEXT_URL_RE, url)
275                 if mobj:
276                         url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')
277
278                 # Extract video id from URL
279                 mobj = re.match(self._VALID_URL, url)
280                 if mobj is None:
281                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
282                         return
283                 video_id = mobj.group(2)
284
285                 # Get video webpage
286                 self.report_video_webpage_download(video_id)
287                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
288                 try:
289                         video_webpage = urllib2.urlopen(request).read()
290                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
291                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
292                         return
293
294                 # Attempt to extract SWF player URL
295                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
296                 if mobj is not None:
297                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
298                 else:
299                         player_url = None
300
301                 # Get video info
302                 self.report_video_info_webpage_download(video_id)
303                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
304                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
305                                         % (video_id, el_type))
306                         request = urllib2.Request(video_info_url)
307                         try:
308                                 video_info_webpage = urllib2.urlopen(request).read()
309                                 video_info = parse_qs(video_info_webpage)
310                                 if 'token' in video_info:
311                                         break
312                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
313                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
314                                 return
315                 if 'token' not in video_info:
316                         if 'reason' in video_info:
317                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
318                         else:
319                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
320                         return
321
322                 # Start extracting information
323                 self.report_information_extraction(video_id)
324
325                 # uploader
326                 if 'author' not in video_info:
327                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
328                         return
329                 video_uploader = urllib.unquote_plus(video_info['author'][0])
330
331                 # title
332                 if 'title' not in video_info:
333                         self._downloader.trouble(u'ERROR: unable to extract video title')
334                         return
335                 video_title = urllib.unquote_plus(video_info['title'][0])
336                 video_title = video_title.decode('utf-8')
337                 video_title = sanitize_title(video_title)
338
339                 # simplified title
340                 simple_title = simplify_title(video_title)
341
342                 # thumbnail image
343                 if 'thumbnail_url' not in video_info:
344                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
345                         video_thumbnail = ''
346                 else:   # don't panic if we can't find it
347                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
348
349                 # upload date
350                 upload_date = u'NA'
351                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
352                 if mobj is not None:
353                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
354                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
355                         for expression in format_expressions:
356                                 try:
357                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
358                                 except:
359                                         pass
360
361                 # description
362                 video_description = get_element_by_id("eow-description", video_webpage)
363                 if video_description: video_description = clean_html(video_description.decode('utf8'))
364                 else: video_description = ''
365                         
366                 # closed captions
367                 video_subtitles = None
368                 if self._downloader.params.get('writesubtitles', False):
369                         self.report_video_subtitles_download(video_id)
370                         request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
371                         try:
372                                 srt_list = urllib2.urlopen(request).read()
373                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
374                                 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
375                         else:
376                                 srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
377                                 if srt_lang_list:
378                                         if self._downloader.params.get('subtitleslang', False):
379                                                 srt_lang = self._downloader.params.get('subtitleslang')
380                                         elif 'en' in srt_lang_list:
381                                                 srt_lang = 'en'
382                                         else:
383                                                 srt_lang = srt_lang_list[0]
384                                         if not srt_lang in srt_lang_list:
385                                                 self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
386                                         else:
387                                                 request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
388                                                 try:
389                                                         srt_xml = urllib2.urlopen(request).read()
390                                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
391                                                         self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
392                                                 else:
393                                                         video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
394                                 else:
395                                         self._downloader.trouble(u'WARNING: video has no closed captions')
396
397                 # token
398                 video_token = urllib.unquote_plus(video_info['token'][0])
399
400                 # Decide which formats to download
401                 req_format = self._downloader.params.get('format', None)
402
403                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
404                         self.report_rtmp_download()
405                         video_url_list = [(None, video_info['conn'][0])]
406                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
407                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
408                         url_data = [parse_qs(uds) for uds in url_data_strs]
409                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
410                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
411
412                         format_limit = self._downloader.params.get('format_limit', None)
413                         available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
414                         if format_limit is not None and format_limit in available_formats:
415                                 format_list = available_formats[available_formats.index(format_limit):]
416                         else:
417                                 format_list = available_formats
418                         existing_formats = [x for x in format_list if x in url_map]
419                         if len(existing_formats) == 0:
420                                 self._downloader.trouble(u'ERROR: no known formats available for video')
421                                 return
422                         if self._downloader.params.get('listformats', None):
423                                 self._print_formats(existing_formats)
424                                 return
425                         if req_format is None or req_format == 'best':
426                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
427                         elif req_format == 'worst':
428                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
429                         elif req_format in ('-1', 'all'):
430                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
431                         else:
432                                 # Specific formats. We pick the first in a slash-delimeted sequence.
433                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
434                                 req_formats = req_format.split('/')
435                                 video_url_list = None
436                                 for rf in req_formats:
437                                         if rf in url_map:
438                                                 video_url_list = [(rf, url_map[rf])]
439                                                 break
440                                 if video_url_list is None:
441                                         self._downloader.trouble(u'ERROR: requested format not available')
442                                         return
443                 else:
444                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
445                         return
446
447                 results = []
448                 for format_param, video_real_url in video_url_list:
449                         # Extension
450                         video_extension = self._video_extensions.get(format_param, 'flv')
451
452                         results.append({
453                                 'id':           video_id.decode('utf-8'),
454                                 'url':          video_real_url.decode('utf-8'),
455                                 'uploader':     video_uploader.decode('utf-8'),
456                                 'upload_date':  upload_date,
457                                 'title':        video_title,
458                                 'stitle':       simple_title,
459                                 'ext':          video_extension.decode('utf-8'),
460                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
461                                 'thumbnail':    video_thumbnail.decode('utf-8'),
462                                 'description':  video_description,
463                                 'player_url':   player_url,
464                                 'subtitles':    video_subtitles
465                         })
466                 return results
467
468
469 class MetacafeIE(InfoExtractor):
470         """Information Extractor for metacafe.com."""
471
472         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
473         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
474         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
475         IE_NAME = u'metacafe'
476
477         def __init__(self, downloader=None):
478                 InfoExtractor.__init__(self, downloader)
479
480         def report_disclaimer(self):
481                 """Report disclaimer retrieval."""
482                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
483
484         def report_age_confirmation(self):
485                 """Report attempt to confirm age."""
486                 self._downloader.to_screen(u'[metacafe] Confirming age')
487
488         def report_download_webpage(self, video_id):
489                 """Report webpage download."""
490                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
491
492         def report_extraction(self, video_id):
493                 """Report information extraction."""
494                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
495
496         def _real_initialize(self):
497                 # Retrieve disclaimer
498                 request = urllib2.Request(self._DISCLAIMER)
499                 try:
500                         self.report_disclaimer()
501                         disclaimer = urllib2.urlopen(request).read()
502                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
503                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
504                         return
505
506                 # Confirm age
507                 disclaimer_form = {
508                         'filters': '0',
509                         'submit': "Continue - I'm over 18",
510                         }
511                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
512                 try:
513                         self.report_age_confirmation()
514                         disclaimer = urllib2.urlopen(request).read()
515                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
516                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
517                         return
518
519         def _real_extract(self, url):
520                 # Extract id and simplified title from URL
521                 mobj = re.match(self._VALID_URL, url)
522                 if mobj is None:
523                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
524                         return
525
526                 video_id = mobj.group(1)
527
528                 # Check if video comes from YouTube
529                 mobj2 = re.match(r'^yt-(.*)$', video_id)
530                 if mobj2 is not None:
531                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
532                         return
533
534                 simple_title = mobj.group(2).decode('utf-8')
535
536                 # Retrieve video webpage to extract further information
537                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
538                 try:
539                         self.report_download_webpage(video_id)
540                         webpage = urllib2.urlopen(request).read()
541                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
542                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
543                         return
544
545                 # Extract URL, uploader and title from webpage
546                 self.report_extraction(video_id)
547                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
548                 if mobj is not None:
549                         mediaURL = urllib.unquote(mobj.group(1))
550                         video_extension = mediaURL[-3:]
551
552                         # Extract gdaKey if available
553                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
554                         if mobj is None:
555                                 video_url = mediaURL
556                         else:
557                                 gdaKey = mobj.group(1)
558                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
559                 else:
560                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
561                         if mobj is None:
562                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
563                                 return
564                         vardict = parse_qs(mobj.group(1))
565                         if 'mediaData' not in vardict:
566                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
567                                 return
568                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
569                         if mobj is None:
570                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
571                                 return
572                         mediaURL = mobj.group(1).replace('\\/', '/')
573                         video_extension = mediaURL[-3:]
574                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
575
576                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
577                 if mobj is None:
578                         self._downloader.trouble(u'ERROR: unable to extract title')
579                         return
580                 video_title = mobj.group(1).decode('utf-8')
581                 video_title = sanitize_title(video_title)
582
583                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
584                 if mobj is None:
585                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
586                         return
587                 video_uploader = mobj.group(1)
588
589                 return [{
590                         'id':           video_id.decode('utf-8'),
591                         'url':          video_url.decode('utf-8'),
592                         'uploader':     video_uploader.decode('utf-8'),
593                         'upload_date':  u'NA',
594                         'title':        video_title,
595                         'stitle':       simple_title,
596                         'ext':          video_extension.decode('utf-8'),
597                         'format':       u'NA',
598                         'player_url':   None,
599                 }]
600
601
602 class DailymotionIE(InfoExtractor):
603         """Information Extractor for Dailymotion"""
604
605         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
606         IE_NAME = u'dailymotion'
607
608         def __init__(self, downloader=None):
609                 InfoExtractor.__init__(self, downloader)
610
611         def report_download_webpage(self, video_id):
612                 """Report webpage download."""
613                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
614
615         def report_extraction(self, video_id):
616                 """Report information extraction."""
617                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
618
619         def _real_extract(self, url):
620                 # Extract id and simplified title from URL
621                 mobj = re.match(self._VALID_URL, url)
622                 if mobj is None:
623                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
624                         return
625
626                 video_id = mobj.group(1)
627
628                 video_extension = 'flv'
629
630                 # Retrieve video webpage to extract further information
631                 request = urllib2.Request(url)
632                 request.add_header('Cookie', 'family_filter=off')
633                 try:
634                         self.report_download_webpage(video_id)
635                         webpage = urllib2.urlopen(request).read()
636                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
637                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
638                         return
639
640                 # Extract URL, uploader and title from webpage
641                 self.report_extraction(video_id)
642                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
643                 if mobj is None:
644                         self._downloader.trouble(u'ERROR: unable to extract media URL')
645                         return
646                 sequence = urllib.unquote(mobj.group(1))
647                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
648                 if mobj is None:
649                         self._downloader.trouble(u'ERROR: unable to extract media URL')
650                         return
651                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
652
653                 # if needed add http://www.dailymotion.com/ if relative URL
654
655                 video_url = mediaURL
656
657                 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
658                 if mobj is None:
659                         self._downloader.trouble(u'ERROR: unable to extract title')
660                         return
661                 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
662                 video_title = sanitize_title(video_title)
663                 simple_title = simplify_title(video_title)
664
665                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
666                 if mobj is None:
667                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
668                         return
669                 video_uploader = mobj.group(1)
670
671                 return [{
672                         'id':           video_id.decode('utf-8'),
673                         'url':          video_url.decode('utf-8'),
674                         'uploader':     video_uploader.decode('utf-8'),
675                         'upload_date':  u'NA',
676                         'title':        video_title,
677                         'stitle':       simple_title,
678                         'ext':          video_extension.decode('utf-8'),
679                         'format':       u'NA',
680                         'player_url':   None,
681                 }]
682
683
684 class GoogleIE(InfoExtractor):
685         """Information extractor for video.google.com."""
686
687         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
688         IE_NAME = u'video.google'
689
690         def __init__(self, downloader=None):
691                 InfoExtractor.__init__(self, downloader)
692
693         def report_download_webpage(self, video_id):
694                 """Report webpage download."""
695                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
696
697         def report_extraction(self, video_id):
698                 """Report information extraction."""
699                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
700
701         def _real_extract(self, url):
702                 # Extract id from URL
703                 mobj = re.match(self._VALID_URL, url)
704                 if mobj is None:
705                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
706                         return
707
708                 video_id = mobj.group(1)
709
710                 video_extension = 'mp4'
711
712                 # Retrieve video webpage to extract further information
713                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
714                 try:
715                         self.report_download_webpage(video_id)
716                         webpage = urllib2.urlopen(request).read()
717                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
718                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
719                         return
720
721                 # Extract URL, uploader, and title from webpage
722                 self.report_extraction(video_id)
723                 mobj = re.search(r"download_url:'([^']+)'", webpage)
724                 if mobj is None:
725                         video_extension = 'flv'
726                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
727                 if mobj is None:
728                         self._downloader.trouble(u'ERROR: unable to extract media URL')
729                         return
730                 mediaURL = urllib.unquote(mobj.group(1))
731                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
732                 mediaURL = mediaURL.replace('\\x26', '\x26')
733
734                 video_url = mediaURL
735
736                 mobj = re.search(r'<title>(.*)</title>', webpage)
737                 if mobj is None:
738                         self._downloader.trouble(u'ERROR: unable to extract title')
739                         return
740                 video_title = mobj.group(1).decode('utf-8')
741                 video_title = sanitize_title(video_title)
742                 simple_title = simplify_title(video_title)
743
744                 # Extract video description
745                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
746                 if mobj is None:
747                         self._downloader.trouble(u'ERROR: unable to extract video description')
748                         return
749                 video_description = mobj.group(1).decode('utf-8')
750                 if not video_description:
751                         video_description = 'No description available.'
752
753                 # Extract video thumbnail
754                 if self._downloader.params.get('forcethumbnail', False):
755                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
756                         try:
757                                 webpage = urllib2.urlopen(request).read()
758                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
759                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
760                                 return
761                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
762                         if mobj is None:
763                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
764                                 return
765                         video_thumbnail = mobj.group(1)
766                 else:   # we need something to pass to process_info
767                         video_thumbnail = ''
768
769                 return [{
770                         'id':           video_id.decode('utf-8'),
771                         'url':          video_url.decode('utf-8'),
772                         'uploader':     u'NA',
773                         'upload_date':  u'NA',
774                         'title':        video_title,
775                         'stitle':       simple_title,
776                         'ext':          video_extension.decode('utf-8'),
777                         'format':       u'NA',
778                         'player_url':   None,
779                 }]
780
781
782 class PhotobucketIE(InfoExtractor):
783         """Information extractor for photobucket.com."""
784
785         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
786         IE_NAME = u'photobucket'
787
788         def __init__(self, downloader=None):
789                 InfoExtractor.__init__(self, downloader)
790
791         def report_download_webpage(self, video_id):
792                 """Report webpage download."""
793                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
794
795         def report_extraction(self, video_id):
796                 """Report information extraction."""
797                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
798
799         def _real_extract(self, url):
800                 # Extract id from URL
801                 mobj = re.match(self._VALID_URL, url)
802                 if mobj is None:
803                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
804                         return
805
806                 video_id = mobj.group(1)
807
808                 video_extension = 'flv'
809
810                 # Retrieve video webpage to extract further information
811                 request = urllib2.Request(url)
812                 try:
813                         self.report_download_webpage(video_id)
814                         webpage = urllib2.urlopen(request).read()
815                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
816                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
817                         return
818
819                 # Extract URL, uploader, and title from webpage
820                 self.report_extraction(video_id)
821                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
822                 if mobj is None:
823                         self._downloader.trouble(u'ERROR: unable to extract media URL')
824                         return
825                 mediaURL = urllib.unquote(mobj.group(1))
826
827                 video_url = mediaURL
828
829                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
830                 if mobj is None:
831                         self._downloader.trouble(u'ERROR: unable to extract title')
832                         return
833                 video_title = mobj.group(1).decode('utf-8')
834                 video_title = sanitize_title(video_title)
835                 simple_title = simplify_title(video_title)
836
837                 video_uploader = mobj.group(2).decode('utf-8')
838
839                 return [{
840                         'id':           video_id.decode('utf-8'),
841                         'url':          video_url.decode('utf-8'),
842                         'uploader':     video_uploader,
843                         'upload_date':  u'NA',
844                         'title':        video_title,
845                         'stitle':       simple_title,
846                         'ext':          video_extension.decode('utf-8'),
847                         'format':       u'NA',
848                         'player_url':   None,
849                 }]
850
851
852 class YahooIE(InfoExtractor):
853         """Information extractor for video.yahoo.com."""
854
855         # _VALID_URL matches all Yahoo! Video URLs
856         # _VPAGE_URL matches only the extractable '/watch/' URLs
857         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
858         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
859         IE_NAME = u'video.yahoo'
860
861         def __init__(self, downloader=None):
862                 InfoExtractor.__init__(self, downloader)
863
864         def report_download_webpage(self, video_id):
865                 """Report webpage download."""
866                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
867
868         def report_extraction(self, video_id):
869                 """Report information extraction."""
870                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
871
872         def _real_extract(self, url, new_video=True):
873                 # Extract ID from URL
874                 mobj = re.match(self._VALID_URL, url)
875                 if mobj is None:
876                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
877                         return
878
879                 video_id = mobj.group(2)
880                 video_extension = 'flv'
881
882                 # Rewrite valid but non-extractable URLs as
883                 # extractable English language /watch/ URLs
884                 if re.match(self._VPAGE_URL, url) is None:
885                         request = urllib2.Request(url)
886                         try:
887                                 webpage = urllib2.urlopen(request).read()
888                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
889                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
890                                 return
891
892                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
893                         if mobj is None:
894                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
895                                 return
896                         yahoo_id = mobj.group(1)
897
898                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
899                         if mobj is None:
900                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
901                                 return
902                         yahoo_vid = mobj.group(1)
903
904                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
905                         return self._real_extract(url, new_video=False)
906
907                 # Retrieve video webpage to extract further information
908                 request = urllib2.Request(url)
909                 try:
910                         self.report_download_webpage(video_id)
911                         webpage = urllib2.urlopen(request).read()
912                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
913                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
914                         return
915
916                 # Extract uploader and title from webpage
917                 self.report_extraction(video_id)
918                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
919                 if mobj is None:
920                         self._downloader.trouble(u'ERROR: unable to extract video title')
921                         return
922                 video_title = mobj.group(1).decode('utf-8')
923                 simple_title = simplify_title(video_title)
924
925                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
926                 if mobj is None:
927                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
928                         return
929                 video_uploader = mobj.group(1).decode('utf-8')
930
931                 # Extract video thumbnail
932                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
933                 if mobj is None:
934                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
935                         return
936                 video_thumbnail = mobj.group(1).decode('utf-8')
937
938                 # Extract video description
939                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
940                 if mobj is None:
941                         self._downloader.trouble(u'ERROR: unable to extract video description')
942                         return
943                 video_description = mobj.group(1).decode('utf-8')
944                 if not video_description:
945                         video_description = 'No description available.'
946
947                 # Extract video height and width
948                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
949                 if mobj is None:
950                         self._downloader.trouble(u'ERROR: unable to extract video height')
951                         return
952                 yv_video_height = mobj.group(1)
953
954                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
955                 if mobj is None:
956                         self._downloader.trouble(u'ERROR: unable to extract video width')
957                         return
958                 yv_video_width = mobj.group(1)
959
960                 # Retrieve video playlist to extract media URL
961                 # I'm not completely sure what all these options are, but we
962                 # seem to need most of them, otherwise the server sends a 401.
963                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
964                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
965                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
966                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
967                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
968                 try:
969                         self.report_download_webpage(video_id)
970                         webpage = urllib2.urlopen(request).read()
971                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
972                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
973                         return
974
975                 # Extract media URL from playlist XML
976                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
977                 if mobj is None:
978                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
979                         return
980                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
981                 video_url = unescapeHTML(video_url)
982
983                 return [{
984                         'id':           video_id.decode('utf-8'),
985                         'url':          video_url,
986                         'uploader':     video_uploader,
987                         'upload_date':  u'NA',
988                         'title':        video_title,
989                         'stitle':       simple_title,
990                         'ext':          video_extension.decode('utf-8'),
991                         'thumbnail':    video_thumbnail.decode('utf-8'),
992                         'description':  video_description,
993                         'thumbnail':    video_thumbnail,
994                         'player_url':   None,
995                 }]
996
997
998 class VimeoIE(InfoExtractor):
999         """Information extractor for vimeo.com."""
1000
1001         # _VALID_URL matches Vimeo URLs
1002         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1003         IE_NAME = u'vimeo'
1004
1005         def __init__(self, downloader=None):
1006                 InfoExtractor.__init__(self, downloader)
1007
1008         def report_download_webpage(self, video_id):
1009                 """Report webpage download."""
1010                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1011
1012         def report_extraction(self, video_id):
1013                 """Report information extraction."""
1014                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1015
1016         def _real_extract(self, url, new_video=True):
1017                 # Extract ID from URL
1018                 mobj = re.match(self._VALID_URL, url)
1019                 if mobj is None:
1020                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1021                         return
1022
1023                 video_id = mobj.group(1)
1024
1025                 # Retrieve video webpage to extract further information
1026                 request = urllib2.Request(url, None, std_headers)
1027                 try:
1028                         self.report_download_webpage(video_id)
1029                         webpage = urllib2.urlopen(request).read()
1030                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1031                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1032                         return
1033
1034                 # Now we begin extracting as much information as we can from what we
1035                 # retrieved. First we extract the information common to all extractors,
1036                 # and latter we extract those that are Vimeo specific.
1037                 self.report_extraction(video_id)
1038
1039                 # Extract the config JSON
1040                 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1041                 try:
1042                         config = json.loads(config)
1043                 except:
1044                         self._downloader.trouble(u'ERROR: unable to extract info section')
1045                         return
1046                 
1047                 # Extract title
1048                 video_title = config["video"]["title"]
1049                 simple_title = simplify_title(video_title)
1050
1051                 # Extract uploader
1052                 video_uploader = config["video"]["owner"]["name"]
1053
1054                 # Extract video thumbnail
1055                 video_thumbnail = config["video"]["thumbnail"]
1056
1057                 # Extract video description
1058                 video_description = get_element_by_id("description", webpage)
1059                 if video_description: video_description = clean_html(video_description.decode('utf8'))
1060                 else: video_description = ''
1061
1062                 # Extract upload date
1063                 video_upload_date = u'NA'
1064                 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1065                 if mobj is not None:
1066                         video_upload_date = mobj.group(1)
1067
1068                 # Vimeo specific: extract request signature and timestamp
1069                 sig = config['request']['signature']
1070                 timestamp = config['request']['timestamp']
1071
1072                 # Vimeo specific: extract video codec and quality information
1073                 # TODO bind to format param
1074                 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1075                 for codec in codecs:
1076                         if codec[0] in config["video"]["files"]:
1077                                 video_codec = codec[0]
1078                                 video_extension = codec[1]
1079                                 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1080                                 else: quality = 'sd'
1081                                 break
1082                 else:
1083                         self._downloader.trouble(u'ERROR: no known codec found')
1084                         return
1085
1086                 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1087                                         %(video_id, sig, timestamp, quality, video_codec.upper())
1088
1089                 return [{
1090                         'id':           video_id,
1091                         'url':          video_url,
1092                         'uploader':     video_uploader,
1093                         'upload_date':  video_upload_date,
1094                         'title':        video_title,
1095                         'stitle':       simple_title,
1096                         'ext':          video_extension,
1097                         'thumbnail':    video_thumbnail,
1098                         'description':  video_description,
1099                         'player_url':   None,
1100                 }]
1101
1102
1103 class GenericIE(InfoExtractor):
1104         """Generic last-resort information extractor."""
1105
1106         _VALID_URL = r'.*'
1107         IE_NAME = u'generic'
1108
1109         def __init__(self, downloader=None):
1110                 InfoExtractor.__init__(self, downloader)
1111
1112         def report_download_webpage(self, video_id):
1113                 """Report webpage download."""
1114                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1115                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1116
1117         def report_extraction(self, video_id):
1118                 """Report information extraction."""
1119                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1120
1121         def report_following_redirect(self, new_url):
1122                 """Report information extraction."""
1123                 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1124                 
1125         def _test_redirect(self, url):
1126                 """Check if it is a redirect, like url shorteners, in case restart chain."""
1127                 class HeadRequest(urllib2.Request):
1128                         def get_method(self):
1129                                 return "HEAD"
1130
1131                 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1132                         """
1133                         Subclass the HTTPRedirectHandler to make it use our 
1134                         HeadRequest also on the redirected URL
1135                         """
1136                         def redirect_request(self, req, fp, code, msg, headers, newurl): 
1137                                 if code in (301, 302, 303, 307):
1138                                         newurl = newurl.replace(' ', '%20') 
1139                                         newheaders = dict((k,v) for k,v in req.headers.items()
1140                                                                           if k.lower() not in ("content-length", "content-type"))
1141                                         return HeadRequest(newurl, 
1142                                                                            headers=newheaders,
1143                                                                            origin_req_host=req.get_origin_req_host(), 
1144                                                                            unverifiable=True) 
1145                                 else: 
1146                                         raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp) 
1147
1148                 class HTTPMethodFallback(urllib2.BaseHandler):
1149                         """
1150                         Fallback to GET if HEAD is not allowed (405 HTTP error)
1151                         """
1152                         def http_error_405(self, req, fp, code, msg, headers): 
1153                                 fp.read()
1154                                 fp.close()
1155
1156                                 newheaders = dict((k,v) for k,v in req.headers.items()
1157                                                                   if k.lower() not in ("content-length", "content-type"))
1158                                 return self.parent.open(urllib2.Request(req.get_full_url(), 
1159                                                                                                  headers=newheaders, 
1160                                                                                                  origin_req_host=req.get_origin_req_host(), 
1161                                                                                                  unverifiable=True))
1162
1163                 # Build our opener
1164                 opener = urllib2.OpenerDirector() 
1165                 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1166                                                 HTTPMethodFallback, HEADRedirectHandler,
1167                                                 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1168                         opener.add_handler(handler())
1169
1170                 response = opener.open(HeadRequest(url))
1171                 new_url = response.geturl()
1172                 
1173                 if url == new_url: return False
1174                 
1175                 self.report_following_redirect(new_url)
1176                 self._downloader.download([new_url])
1177                 return True
1178
1179         def _real_extract(self, url):
1180                 if self._test_redirect(url): return
1181
1182                 video_id = url.split('/')[-1]
1183                 request = urllib2.Request(url)
1184                 try:
1185                         self.report_download_webpage(video_id)
1186                         webpage = urllib2.urlopen(request).read()
1187                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1188                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1189                         return
1190                 except ValueError, err:
1191                         # since this is the last-resort InfoExtractor, if
1192                         # this error is thrown, it'll be thrown here
1193                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1194                         return
1195
1196                 self.report_extraction(video_id)
1197                 # Start with something easy: JW Player in SWFObject
1198                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1199                 if mobj is None:
1200                         # Broaden the search a little bit
1201                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1202                 if mobj is None:
1203                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1204                         return
1205
1206                 # It's possible that one of the regexes
1207                 # matched, but returned an empty group:
1208                 if mobj.group(1) is None:
1209                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1210                         return
1211
1212                 video_url = urllib.unquote(mobj.group(1))
1213                 video_id = os.path.basename(video_url)
1214
1215                 # here's a fun little line of code for you:
1216                 video_extension = os.path.splitext(video_id)[1][1:]
1217                 video_id = os.path.splitext(video_id)[0]
1218
1219                 # it's tempting to parse this further, but you would
1220                 # have to take into account all the variations like
1221                 #   Video Title - Site Name
1222                 #   Site Name | Video Title
1223                 #   Video Title - Tagline | Site Name
1224                 # and so on and so forth; it's just not practical
1225                 mobj = re.search(r'<title>(.*)</title>', webpage)
1226                 if mobj is None:
1227                         self._downloader.trouble(u'ERROR: unable to extract title')
1228                         return
1229                 video_title = mobj.group(1).decode('utf-8')
1230                 video_title = sanitize_title(video_title)
1231                 simple_title = simplify_title(video_title)
1232
1233                 # video uploader is domain name
1234                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1235                 if mobj is None:
1236                         self._downloader.trouble(u'ERROR: unable to extract title')
1237                         return
1238                 video_uploader = mobj.group(1).decode('utf-8')
1239
1240                 return [{
1241                         'id':           video_id.decode('utf-8'),
1242                         'url':          video_url.decode('utf-8'),
1243                         'uploader':     video_uploader,
1244                         'upload_date':  u'NA',
1245                         'title':        video_title,
1246                         'stitle':       simple_title,
1247                         'ext':          video_extension.decode('utf-8'),
1248                         'format':       u'NA',
1249                         'player_url':   None,
1250                 }]
1251
1252
1253 class YoutubeSearchIE(InfoExtractor):
1254         """Information Extractor for YouTube search queries."""
1255         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1256         _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1257         _max_youtube_results = 1000
1258         IE_NAME = u'youtube:search'
1259
1260         def __init__(self, downloader=None):
1261                 InfoExtractor.__init__(self, downloader)
1262
1263         def report_download_page(self, query, pagenum):
1264                 """Report attempt to download playlist page with given number."""
1265                 query = query.decode(preferredencoding())
1266                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1267
1268         def _real_extract(self, query):
1269                 mobj = re.match(self._VALID_URL, query)
1270                 if mobj is None:
1271                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1272                         return
1273
1274                 prefix, query = query.split(':')
1275                 prefix = prefix[8:]
1276                 query = query.encode('utf-8')
1277                 if prefix == '':
1278                         self._download_n_results(query, 1)
1279                         return
1280                 elif prefix == 'all':
1281                         self._download_n_results(query, self._max_youtube_results)
1282                         return
1283                 else:
1284                         try:
1285                                 n = long(prefix)
1286                                 if n <= 0:
1287                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1288                                         return
1289                                 elif n > self._max_youtube_results:
1290                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1291                                         n = self._max_youtube_results
1292                                 self._download_n_results(query, n)
1293                                 return
1294                         except ValueError: # parsing prefix as integer fails
1295                                 self._download_n_results(query, 1)
1296                                 return
1297
1298         def _download_n_results(self, query, n):
1299                 """Downloads a specified number of results for a query"""
1300
1301                 video_ids = []
1302                 pagenum = 0
1303                 limit = n
1304
1305                 while (50 * pagenum) < limit:
1306                         self.report_download_page(query, pagenum+1)
1307                         result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1308                         request = urllib2.Request(result_url)
1309                         try:
1310                                 data = urllib2.urlopen(request).read()
1311                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1312                                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1313                                 return
1314                         api_response = json.loads(data)['data']
1315
1316                         new_ids = list(video['id'] for video in api_response['items'])
1317                         video_ids += new_ids
1318
1319                         limit = min(n, api_response['totalItems'])
1320                         pagenum += 1
1321
1322                 if len(video_ids) > n:
1323                         video_ids = video_ids[:n]
1324                 for id in video_ids:
1325                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1326                 return
1327
1328
1329 class GoogleSearchIE(InfoExtractor):
1330         """Information Extractor for Google Video search queries."""
1331         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1332         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1333         _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1334         _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1335         _max_google_results = 1000
1336         IE_NAME = u'video.google:search'
1337
1338         def __init__(self, downloader=None):
1339                 InfoExtractor.__init__(self, downloader)
1340
1341         def report_download_page(self, query, pagenum):
1342                 """Report attempt to download playlist page with given number."""
1343                 query = query.decode(preferredencoding())
1344                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1345
1346         def _real_extract(self, query):
1347                 mobj = re.match(self._VALID_URL, query)
1348                 if mobj is None:
1349                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1350                         return
1351
1352                 prefix, query = query.split(':')
1353                 prefix = prefix[8:]
1354                 query = query.encode('utf-8')
1355                 if prefix == '':
1356                         self._download_n_results(query, 1)
1357                         return
1358                 elif prefix == 'all':
1359                         self._download_n_results(query, self._max_google_results)
1360                         return
1361                 else:
1362                         try:
1363                                 n = long(prefix)
1364                                 if n <= 0:
1365                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1366                                         return
1367                                 elif n > self._max_google_results:
1368                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1369                                         n = self._max_google_results
1370                                 self._download_n_results(query, n)
1371                                 return
1372                         except ValueError: # parsing prefix as integer fails
1373                                 self._download_n_results(query, 1)
1374                                 return
1375
1376         def _download_n_results(self, query, n):
1377                 """Downloads a specified number of results for a query"""
1378
1379                 video_ids = []
1380                 pagenum = 0
1381
1382                 while True:
1383                         self.report_download_page(query, pagenum)
1384                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1385                         request = urllib2.Request(result_url)
1386                         try:
1387                                 page = urllib2.urlopen(request).read()
1388                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1389                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1390                                 return
1391
1392                         # Extract video identifiers
1393                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1394                                 video_id = mobj.group(1)
1395                                 if video_id not in video_ids:
1396                                         video_ids.append(video_id)
1397                                         if len(video_ids) == n:
1398                                                 # Specified n videos reached
1399                                                 for id in video_ids:
1400                                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1401                                                 return
1402
1403                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1404                                 for id in video_ids:
1405                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1406                                 return
1407
1408                         pagenum = pagenum + 1
1409
1410
1411 class YahooSearchIE(InfoExtractor):
1412         """Information Extractor for Yahoo! Video search queries."""
1413         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1414         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1415         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1416         _MORE_PAGES_INDICATOR = r'\s*Next'
1417         _max_yahoo_results = 1000
1418         IE_NAME = u'video.yahoo:search'
1419
1420         def __init__(self, downloader=None):
1421                 InfoExtractor.__init__(self, downloader)
1422
1423         def report_download_page(self, query, pagenum):
1424                 """Report attempt to download playlist page with given number."""
1425                 query = query.decode(preferredencoding())
1426                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1427
1428         def _real_extract(self, query):
1429                 mobj = re.match(self._VALID_URL, query)
1430                 if mobj is None:
1431                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1432                         return
1433
1434                 prefix, query = query.split(':')
1435                 prefix = prefix[8:]
1436                 query = query.encode('utf-8')
1437                 if prefix == '':
1438                         self._download_n_results(query, 1)
1439                         return
1440                 elif prefix == 'all':
1441                         self._download_n_results(query, self._max_yahoo_results)
1442                         return
1443                 else:
1444                         try:
1445                                 n = long(prefix)
1446                                 if n <= 0:
1447                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1448                                         return
1449                                 elif n > self._max_yahoo_results:
1450                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1451                                         n = self._max_yahoo_results
1452                                 self._download_n_results(query, n)
1453                                 return
1454                         except ValueError: # parsing prefix as integer fails
1455                                 self._download_n_results(query, 1)
1456                                 return
1457
1458         def _download_n_results(self, query, n):
1459                 """Downloads a specified number of results for a query"""
1460
1461                 video_ids = []
1462                 already_seen = set()
1463                 pagenum = 1
1464
1465                 while True:
1466                         self.report_download_page(query, pagenum)
1467                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1468                         request = urllib2.Request(result_url)
1469                         try:
1470                                 page = urllib2.urlopen(request).read()
1471                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1472                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1473                                 return
1474
1475                         # Extract video identifiers
1476                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1477                                 video_id = mobj.group(1)
1478                                 if video_id not in already_seen:
1479                                         video_ids.append(video_id)
1480                                         already_seen.add(video_id)
1481                                         if len(video_ids) == n:
1482                                                 # Specified n videos reached
1483                                                 for id in video_ids:
1484                                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1485                                                 return
1486
1487                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1488                                 for id in video_ids:
1489                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1490                                 return
1491
1492                         pagenum = pagenum + 1
1493
1494
1495 class YoutubePlaylistIE(InfoExtractor):
1496         """Information Extractor for YouTube playlists."""
1497
1498         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1499         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1500         _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;list=PL%s&'
1501         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1502         IE_NAME = u'youtube:playlist'
1503
1504         def __init__(self, downloader=None):
1505                 InfoExtractor.__init__(self, downloader)
1506
1507         def report_download_page(self, playlist_id, pagenum):
1508                 """Report attempt to download playlist page with given number."""
1509                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1510
1511         def _real_extract(self, url):
1512                 # Extract playlist id
1513                 mobj = re.match(self._VALID_URL, url)
1514                 if mobj is None:
1515                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1516                         return
1517
1518                 # Single video case
1519                 if mobj.group(3) is not None:
1520                         self._downloader.download([mobj.group(3)])
1521                         return
1522
1523                 # Download playlist pages
1524                 # prefix is 'p' as default for playlists but there are other types that need extra care
1525                 playlist_prefix = mobj.group(1)
1526                 if playlist_prefix == 'a':
1527                         playlist_access = 'artist'
1528                 else:
1529                         playlist_prefix = 'p'
1530                         playlist_access = 'view_play_list'
1531                 playlist_id = mobj.group(2)
1532                 video_ids = []
1533                 pagenum = 1
1534
1535                 while True:
1536                         self.report_download_page(playlist_id, pagenum)
1537                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1538                         request = urllib2.Request(url)
1539                         try:
1540                                 page = urllib2.urlopen(request).read()
1541                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1542                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1543                                 return
1544
1545                         # Extract video identifiers
1546                         ids_in_page = []
1547                         for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1548                                 if mobj.group(1) not in ids_in_page:
1549                                         ids_in_page.append(mobj.group(1))
1550                         video_ids.extend(ids_in_page)
1551
1552                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1553                                 break
1554                         pagenum = pagenum + 1
1555
1556                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1557                 playlistend = self._downloader.params.get('playlistend', -1)
1558                 if playlistend == -1:
1559                         video_ids = video_ids[playliststart:]
1560                 else:
1561                         video_ids = video_ids[playliststart:playlistend]
1562
1563                 for id in video_ids:
1564                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1565                 return
1566
1567
1568 class YoutubeUserIE(InfoExtractor):
1569         """Information Extractor for YouTube users."""
1570
1571         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1572         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1573         _GDATA_PAGE_SIZE = 50
1574         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1575         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1576         IE_NAME = u'youtube:user'
1577
1578         def __init__(self, downloader=None):
1579                 InfoExtractor.__init__(self, downloader)
1580
1581         def report_download_page(self, username, start_index):
1582                 """Report attempt to download user page."""
1583                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1584                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1585
1586         def _real_extract(self, url):
1587                 # Extract username
1588                 mobj = re.match(self._VALID_URL, url)
1589                 if mobj is None:
1590                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1591                         return
1592
1593                 username = mobj.group(1)
1594
1595                 # Download video ids using YouTube Data API. Result size per
1596                 # query is limited (currently to 50 videos) so we need to query
1597                 # page by page until there are no video ids - it means we got
1598                 # all of them.
1599
1600                 video_ids = []
1601                 pagenum = 0
1602
1603                 while True:
1604                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1605                         self.report_download_page(username, start_index)
1606
1607                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1608
1609                         try:
1610                                 page = urllib2.urlopen(request).read()
1611                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1612                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1613                                 return
1614
1615                         # Extract video identifiers
1616                         ids_in_page = []
1617
1618                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1619                                 if mobj.group(1) not in ids_in_page:
1620                                         ids_in_page.append(mobj.group(1))
1621
1622                         video_ids.extend(ids_in_page)
1623
1624                         # A little optimization - if current page is not
1625                         # "full", ie. does not contain PAGE_SIZE video ids then
1626                         # we can assume that this page is the last one - there
1627                         # are no more ids on further pages - no need to query
1628                         # again.
1629
1630                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1631                                 break
1632
1633                         pagenum += 1
1634
1635                 all_ids_count = len(video_ids)
1636                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1637                 playlistend = self._downloader.params.get('playlistend', -1)
1638
1639                 if playlistend == -1:
1640                         video_ids = video_ids[playliststart:]
1641                 else:
1642                         video_ids = video_ids[playliststart:playlistend]
1643
1644                 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1645                                 (username, all_ids_count, len(video_ids)))
1646
1647                 for video_id in video_ids:
1648                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1649
1650
1651 class DepositFilesIE(InfoExtractor):
1652         """Information extractor for depositfiles.com"""
1653
1654         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1655         IE_NAME = u'DepositFiles'
1656
1657         def __init__(self, downloader=None):
1658                 InfoExtractor.__init__(self, downloader)
1659
1660         def report_download_webpage(self, file_id):
1661                 """Report webpage download."""
1662                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1663
1664         def report_extraction(self, file_id):
1665                 """Report information extraction."""
1666                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1667
1668         def _real_extract(self, url):
1669                 file_id = url.split('/')[-1]
1670                 # Rebuild url in english locale
1671                 url = 'http://depositfiles.com/en/files/' + file_id
1672
1673                 # Retrieve file webpage with 'Free download' button pressed
1674                 free_download_indication = { 'gateway_result' : '1' }
1675                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1676                 try:
1677                         self.report_download_webpage(file_id)
1678                         webpage = urllib2.urlopen(request).read()
1679                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1680                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1681                         return
1682
1683                 # Search for the real file URL
1684                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1685                 if (mobj is None) or (mobj.group(1) is None):
1686                         # Try to figure out reason of the error.
1687                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1688                         if (mobj is not None) and (mobj.group(1) is not None):
1689                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1690                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1691                         else:
1692                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1693                         return
1694
1695                 file_url = mobj.group(1)
1696                 file_extension = os.path.splitext(file_url)[1][1:]
1697
1698                 # Search for file title
1699                 mobj = re.search(r'<b title="(.*?)">', webpage)
1700                 if mobj is None:
1701                         self._downloader.trouble(u'ERROR: unable to extract title')
1702                         return
1703                 file_title = mobj.group(1).decode('utf-8')
1704
1705                 return [{
1706                         'id':           file_id.decode('utf-8'),
1707                         'url':          file_url.decode('utf-8'),
1708                         'uploader':     u'NA',
1709                         'upload_date':  u'NA',
1710                         'title':        file_title,
1711                         'stitle':       file_title,
1712                         'ext':          file_extension.decode('utf-8'),
1713                         'format':       u'NA',
1714                         'player_url':   None,
1715                 }]
1716
1717
1718 class FacebookIE(InfoExtractor):
1719         """Information Extractor for Facebook"""
1720
1721         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1722         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1723         _NETRC_MACHINE = 'facebook'
1724         _available_formats = ['video', 'highqual', 'lowqual']
1725         _video_extensions = {
1726                 'video': 'mp4',
1727                 'highqual': 'mp4',
1728                 'lowqual': 'mp4',
1729         }
1730         IE_NAME = u'facebook'
1731
1732         def __init__(self, downloader=None):
1733                 InfoExtractor.__init__(self, downloader)
1734
1735         def _reporter(self, message):
1736                 """Add header and report message."""
1737                 self._downloader.to_screen(u'[facebook] %s' % message)
1738
1739         def report_login(self):
1740                 """Report attempt to log in."""
1741                 self._reporter(u'Logging in')
1742
1743         def report_video_webpage_download(self, video_id):
1744                 """Report attempt to download video webpage."""
1745                 self._reporter(u'%s: Downloading video webpage' % video_id)
1746
1747         def report_information_extraction(self, video_id):
1748                 """Report attempt to extract video information."""
1749                 self._reporter(u'%s: Extracting video information' % video_id)
1750
1751         def _parse_page(self, video_webpage):
1752                 """Extract video information from page"""
1753                 # General data
1754                 data = {'title': r'\("video_title", "(.*?)"\)',
1755                         'description': r'<div class="datawrap">(.*?)</div>',
1756                         'owner': r'\("video_owner_name", "(.*?)"\)',
1757                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1758                         }
1759                 video_info = {}
1760                 for piece in data.keys():
1761                         mobj = re.search(data[piece], video_webpage)
1762                         if mobj is not None:
1763                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1764
1765                 # Video urls
1766                 video_urls = {}
1767                 for fmt in self._available_formats:
1768                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1769                         if mobj is not None:
1770                                 # URL is in a Javascript segment inside an escaped Unicode format within
1771                                 # the generally utf-8 page
1772                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1773                 video_info['video_urls'] = video_urls
1774
1775                 return video_info
1776
1777         def _real_initialize(self):
1778                 if self._downloader is None:
1779                         return
1780
1781                 useremail = None
1782                 password = None
1783                 downloader_params = self._downloader.params
1784
1785                 # Attempt to use provided username and password or .netrc data
1786                 if downloader_params.get('username', None) is not None:
1787                         useremail = downloader_params['username']
1788                         password = downloader_params['password']
1789                 elif downloader_params.get('usenetrc', False):
1790                         try:
1791                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1792                                 if info is not None:
1793                                         useremail = info[0]
1794                                         password = info[2]
1795                                 else:
1796                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1797                         except (IOError, netrc.NetrcParseError), err:
1798                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1799                                 return
1800
1801                 if useremail is None:
1802                         return
1803
1804                 # Log in
1805                 login_form = {
1806                         'email': useremail,
1807                         'pass': password,
1808                         'login': 'Log+In'
1809                         }
1810                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1811                 try:
1812                         self.report_login()
1813                         login_results = urllib2.urlopen(request).read()
1814                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1815                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1816                                 return
1817                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1818                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1819                         return
1820
1821         def _real_extract(self, url):
1822                 mobj = re.match(self._VALID_URL, url)
1823                 if mobj is None:
1824                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1825                         return
1826                 video_id = mobj.group('ID')
1827
1828                 # Get video webpage
1829                 self.report_video_webpage_download(video_id)
1830                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
1831                 try:
1832                         page = urllib2.urlopen(request)
1833                         video_webpage = page.read()
1834                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1835                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1836                         return
1837
1838                 # Start extracting information
1839                 self.report_information_extraction(video_id)
1840
1841                 # Extract information
1842                 video_info = self._parse_page(video_webpage)
1843
1844                 # uploader
1845                 if 'owner' not in video_info:
1846                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1847                         return
1848                 video_uploader = video_info['owner']
1849
1850                 # title
1851                 if 'title' not in video_info:
1852                         self._downloader.trouble(u'ERROR: unable to extract video title')
1853                         return
1854                 video_title = video_info['title']
1855                 video_title = video_title.decode('utf-8')
1856                 video_title = sanitize_title(video_title)
1857
1858                 simple_title = simplify_title(video_title)
1859
1860                 # thumbnail image
1861                 if 'thumbnail' not in video_info:
1862                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1863                         video_thumbnail = ''
1864                 else:
1865                         video_thumbnail = video_info['thumbnail']
1866
1867                 # upload date
1868                 upload_date = u'NA'
1869                 if 'upload_date' in video_info:
1870                         upload_time = video_info['upload_date']
1871                         timetuple = email.utils.parsedate_tz(upload_time)
1872                         if timetuple is not None:
1873                                 try:
1874                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
1875                                 except:
1876                                         pass
1877
1878                 # description
1879                 video_description = video_info.get('description', 'No description available.')
1880
1881                 url_map = video_info['video_urls']
1882                 if len(url_map.keys()) > 0:
1883                         # Decide which formats to download
1884                         req_format = self._downloader.params.get('format', None)
1885                         format_limit = self._downloader.params.get('format_limit', None)
1886
1887                         if format_limit is not None and format_limit in self._available_formats:
1888                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1889                         else:
1890                                 format_list = self._available_formats
1891                         existing_formats = [x for x in format_list if x in url_map]
1892                         if len(existing_formats) == 0:
1893                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1894                                 return
1895                         if req_format is None:
1896                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1897                         elif req_format == 'worst':
1898                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1899                         elif req_format == '-1':
1900                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1901                         else:
1902                                 # Specific format
1903                                 if req_format not in url_map:
1904                                         self._downloader.trouble(u'ERROR: requested format not available')
1905                                         return
1906                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1907
1908                 results = []
1909                 for format_param, video_real_url in video_url_list:
1910                         # Extension
1911                         video_extension = self._video_extensions.get(format_param, 'mp4')
1912
1913                         results.append({
1914                                 'id':           video_id.decode('utf-8'),
1915                                 'url':          video_real_url.decode('utf-8'),
1916                                 'uploader':     video_uploader.decode('utf-8'),
1917                                 'upload_date':  upload_date,
1918                                 'title':        video_title,
1919                                 'stitle':       simple_title,
1920                                 'ext':          video_extension.decode('utf-8'),
1921                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1922                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1923                                 'description':  video_description.decode('utf-8'),
1924                                 'player_url':   None,
1925                         })
1926                 return results
1927
1928 class BlipTVIE(InfoExtractor):
1929         """Information extractor for blip.tv"""
1930
1931         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
1932         _URL_EXT = r'^.*\.([a-z0-9]+)$'
1933         IE_NAME = u'blip.tv'
1934
1935         def report_extraction(self, file_id):
1936                 """Report information extraction."""
1937                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
1938
1939         def report_direct_download(self, title):
1940                 """Report information extraction."""
1941                 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
1942
1943         def _real_extract(self, url):
1944                 mobj = re.match(self._VALID_URL, url)
1945                 if mobj is None:
1946                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1947                         return
1948
1949                 if '?' in url:
1950                         cchar = '&'
1951                 else:
1952                         cchar = '?'
1953                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
1954                 request = urllib2.Request(json_url)
1955                 self.report_extraction(mobj.group(1))
1956                 info = None
1957                 try:
1958                         urlh = urllib2.urlopen(request)
1959                         if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
1960                                 basename = url.split('/')[-1]
1961                                 title,ext = os.path.splitext(basename)
1962                                 title = title.decode('UTF-8')
1963                                 ext = ext.replace('.', '')
1964                                 self.report_direct_download(title)
1965                                 info = {
1966                                         'id': title,
1967                                         'url': url,
1968                                         'title': title,
1969                                         'stitle': simplify_title(title),
1970                                         'ext': ext,
1971                                         'urlhandle': urlh
1972                                 }
1973                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1974                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1975                         return
1976                 if info is None: # Regular URL
1977                         try:
1978                                 json_code = urlh.read()
1979                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1980                                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
1981                                 return
1982
1983                         try:
1984                                 json_data = json.loads(json_code)
1985                                 if 'Post' in json_data:
1986                                         data = json_data['Post']
1987                                 else:
1988                                         data = json_data
1989         
1990                                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
1991                                 video_url = data['media']['url']
1992                                 umobj = re.match(self._URL_EXT, video_url)
1993                                 if umobj is None:
1994                                         raise ValueError('Can not determine filename extension')
1995                                 ext = umobj.group(1)
1996         
1997                                 info = {
1998                                         'id': data['item_id'],
1999                                         'url': video_url,
2000                                         'uploader': data['display_name'],
2001                                         'upload_date': upload_date,
2002                                         'title': data['title'],
2003                                         'stitle': simplify_title(data['title']),
2004                                         'ext': ext,
2005                                         'format': data['media']['mimeType'],
2006                                         'thumbnail': data['thumbnailUrl'],
2007                                         'description': data['description'],
2008                                         'player_url': data['embedUrl']
2009                                 }
2010                         except (ValueError,KeyError), err:
2011                                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2012                                 return
2013
2014                 return [info]
2015
2016
2017 class MyVideoIE(InfoExtractor):
2018         """Information Extractor for myvideo.de."""
2019
2020         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2021         IE_NAME = u'myvideo'
2022
2023         def __init__(self, downloader=None):
2024                 InfoExtractor.__init__(self, downloader)
2025         
2026         def report_download_webpage(self, video_id):
2027                 """Report webpage download."""
2028                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2029
2030         def report_extraction(self, video_id):
2031                 """Report information extraction."""
2032                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2033
2034         def _real_extract(self,url):
2035                 mobj = re.match(self._VALID_URL, url)
2036                 if mobj is None:
2037                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2038                         return
2039
2040                 video_id = mobj.group(1)
2041
2042                 # Get video webpage
2043                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2044                 try:
2045                         self.report_download_webpage(video_id)
2046                         webpage = urllib2.urlopen(request).read()
2047                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2048                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2049                         return
2050
2051                 self.report_extraction(video_id)
2052                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2053                                  webpage)
2054                 if mobj is None:
2055                         self._downloader.trouble(u'ERROR: unable to extract media URL')
2056                         return
2057                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2058
2059                 mobj = re.search('<title>([^<]+)</title>', webpage)
2060                 if mobj is None:
2061                         self._downloader.trouble(u'ERROR: unable to extract title')
2062                         return
2063
2064                 video_title = mobj.group(1)
2065                 video_title = sanitize_title(video_title)
2066
2067                 simple_title = simplify_title(video_title)
2068
2069                 return [{
2070                         'id':           video_id,
2071                         'url':          video_url,
2072                         'uploader':     u'NA',
2073                         'upload_date':  u'NA',
2074                         'title':        video_title,
2075                         'stitle':       simple_title,
2076                         'ext':          u'flv',
2077                         'format':       u'NA',
2078                         'player_url':   None,
2079                 }]
2080
2081 class ComedyCentralIE(InfoExtractor):
2082         """Information extractor for The Daily Show and Colbert Report """
2083
2084         _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2085         IE_NAME = u'comedycentral'
2086
2087         def report_extraction(self, episode_id):
2088                 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2089         
2090         def report_config_download(self, episode_id):
2091                 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2092
2093         def report_index_download(self, episode_id):
2094                 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2095
2096         def report_player_url(self, episode_id):
2097                 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2098
2099         def _real_extract(self, url):
2100                 mobj = re.match(self._VALID_URL, url)
2101                 if mobj is None:
2102                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2103                         return
2104
2105                 if mobj.group('shortname'):
2106                         if mobj.group('shortname') in ('tds', 'thedailyshow'):
2107                                 url = u'http://www.thedailyshow.com/full-episodes/'
2108                         else:
2109                                 url = u'http://www.colbertnation.com/full-episodes/'
2110                         mobj = re.match(self._VALID_URL, url)
2111                         assert mobj is not None
2112
2113                 dlNewest = not mobj.group('episode')
2114                 if dlNewest:
2115                         epTitle = mobj.group('showname')
2116                 else:
2117                         epTitle = mobj.group('episode')
2118
2119                 req = urllib2.Request(url)
2120                 self.report_extraction(epTitle)
2121                 try:
2122                         htmlHandle = urllib2.urlopen(req)
2123                         html = htmlHandle.read()
2124                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2125                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2126                         return
2127                 if dlNewest:
2128                         url = htmlHandle.geturl()
2129                         mobj = re.match(self._VALID_URL, url)
2130                         if mobj is None:
2131                                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2132                                 return
2133                         if mobj.group('episode') == '':
2134                                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2135                                 return
2136                         epTitle = mobj.group('episode')
2137
2138                 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2139                 if len(mMovieParams) == 0:
2140                         self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2141                         return
2142
2143                 playerUrl_raw = mMovieParams[0][0]
2144                 self.report_player_url(epTitle)
2145                 try:
2146                         urlHandle = urllib2.urlopen(playerUrl_raw)
2147                         playerUrl = urlHandle.geturl()
2148                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2149                         self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
2150                         return
2151
2152                 uri = mMovieParams[0][1]
2153                 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2154                 self.report_index_download(epTitle)
2155                 try:
2156                         indexXml = urllib2.urlopen(indexUrl).read()
2157                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2158                         self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2159                         return
2160
2161                 results = []
2162
2163                 idoc = xml.etree.ElementTree.fromstring(indexXml)
2164                 itemEls = idoc.findall('.//item')
2165                 for itemEl in itemEls:
2166                         mediaId = itemEl.findall('./guid')[0].text
2167                         shortMediaId = mediaId.split(':')[-1]
2168                         showId = mediaId.split(':')[-2].replace('.com', '')
2169                         officialTitle = itemEl.findall('./title')[0].text
2170                         officialDate = itemEl.findall('./pubDate')[0].text
2171
2172                         configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2173                                                 urllib.urlencode({'uri': mediaId}))
2174                         configReq = urllib2.Request(configUrl)
2175                         self.report_config_download(epTitle)
2176                         try:
2177                                 configXml = urllib2.urlopen(configReq).read()
2178                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2179                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2180                                 return
2181
2182                         cdoc = xml.etree.ElementTree.fromstring(configXml)
2183                         turls = []
2184                         for rendition in cdoc.findall('.//rendition'):
2185                                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2186                                 turls.append(finfo)
2187
2188                         if len(turls) == 0:
2189                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2190                                 continue
2191
2192                         # For now, just pick the highest bitrate
2193                         format,video_url = turls[-1]
2194
2195                         effTitle = showId + u'-' + epTitle
2196                         info = {
2197                                 'id': shortMediaId,
2198                                 'url': video_url,
2199                                 'uploader': showId,
2200                                 'upload_date': officialDate,
2201                                 'title': effTitle,
2202                                 'stitle': simplify_title(effTitle),
2203                                 'ext': 'mp4',
2204                                 'format': format,
2205                                 'thumbnail': None,
2206                                 'description': officialTitle,
2207                                 'player_url': playerUrl
2208                         }
2209
2210                         results.append(info)
2211                         
2212                 return results
2213
2214
2215 class EscapistIE(InfoExtractor):
2216         """Information extractor for The Escapist """
2217
2218         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2219         IE_NAME = u'escapist'
2220
2221         def report_extraction(self, showName):
2222                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2223
2224         def report_config_download(self, showName):
2225                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2226
2227         def _real_extract(self, url):
2228                 mobj = re.match(self._VALID_URL, url)
2229                 if mobj is None:
2230                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2231                         return
2232                 showName = mobj.group('showname')
2233                 videoId = mobj.group('episode')
2234
2235                 self.report_extraction(showName)
2236                 try:
2237                         webPage = urllib2.urlopen(url).read()
2238                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2239                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2240                         return
2241
2242                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2243                 description = unescapeHTML(descMatch.group(1))
2244                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2245                 imgUrl = unescapeHTML(imgMatch.group(1))
2246                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2247                 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2248                 configUrlMatch = re.search('config=(.*)$', playerUrl)
2249                 configUrl = urllib2.unquote(configUrlMatch.group(1))
2250
2251                 self.report_config_download(showName)
2252                 try:
2253                         configJSON = urllib2.urlopen(configUrl).read()
2254                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2255                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2256                         return
2257
2258                 # Technically, it's JavaScript, not JSON
2259                 configJSON = configJSON.replace("'", '"')
2260
2261                 try:
2262                         config = json.loads(configJSON)
2263                 except (ValueError,), err:
2264                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2265                         return
2266
2267                 playlist = config['playlist']
2268                 videoUrl = playlist[1]['url']
2269
2270                 info = {
2271                         'id': videoId,
2272                         'url': videoUrl,
2273                         'uploader': showName,
2274                         'upload_date': None,
2275                         'title': showName,
2276                         'stitle': simplify_title(showName),
2277                         'ext': 'flv',
2278                         'format': 'flv',
2279                         'thumbnail': imgUrl,
2280                         'description': description,
2281                         'player_url': playerUrl,
2282                 }
2283
2284                 return [info]
2285
2286
2287 class CollegeHumorIE(InfoExtractor):
2288         """Information extractor for collegehumor.com"""
2289
2290         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2291         IE_NAME = u'collegehumor'
2292
2293         def report_webpage(self, video_id):
2294                 """Report information extraction."""
2295                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2296
2297         def report_extraction(self, video_id):
2298                 """Report information extraction."""
2299                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2300
2301         def _real_extract(self, url):
2302                 mobj = re.match(self._VALID_URL, url)
2303                 if mobj is None:
2304                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2305                         return
2306                 video_id = mobj.group('videoid')
2307
2308                 self.report_webpage(video_id)
2309                 request = urllib2.Request(url)
2310                 try:
2311                         webpage = urllib2.urlopen(request).read()
2312                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2313                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2314                         return
2315
2316                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2317                 if m is None:
2318                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2319                         return
2320                 internal_video_id = m.group('internalvideoid')
2321
2322                 info = {
2323                         'id': video_id,
2324                         'internal_id': internal_video_id,
2325                 }
2326
2327                 self.report_extraction(video_id)
2328                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2329                 try:
2330                         metaXml = urllib2.urlopen(xmlUrl).read()
2331                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2332                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2333                         return
2334
2335                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2336                 try:
2337                         videoNode = mdoc.findall('./video')[0]
2338                         info['description'] = videoNode.findall('./description')[0].text
2339                         info['title'] = videoNode.findall('./caption')[0].text
2340                         info['stitle'] = simplify_title(info['title'])
2341                         info['url'] = videoNode.findall('./file')[0].text
2342                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2343                         info['ext'] = info['url'].rpartition('.')[2]
2344                         info['format'] = info['ext']
2345                 except IndexError:
2346                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2347                         return
2348
2349                 return [info]
2350
2351
2352 class XVideosIE(InfoExtractor):
2353         """Information extractor for xvideos.com"""
2354
2355         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2356         IE_NAME = u'xvideos'
2357
2358         def report_webpage(self, video_id):
2359                 """Report information extraction."""
2360                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2361
2362         def report_extraction(self, video_id):
2363                 """Report information extraction."""
2364                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2365
2366         def _real_extract(self, url):
2367                 mobj = re.match(self._VALID_URL, url)
2368                 if mobj is None:
2369                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2370                         return
2371                 video_id = mobj.group(1).decode('utf-8')
2372
2373                 self.report_webpage(video_id)
2374
2375                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2376                 try:
2377                         webpage = urllib2.urlopen(request).read()
2378                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2379                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2380                         return
2381
2382                 self.report_extraction(video_id)
2383
2384
2385                 # Extract video URL
2386                 mobj = re.search(r'flv_url=(.+?)&', webpage)
2387                 if mobj is None:
2388                         self._downloader.trouble(u'ERROR: unable to extract video url')
2389                         return
2390                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2391
2392
2393                 # Extract title
2394                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2395                 if mobj is None:
2396                         self._downloader.trouble(u'ERROR: unable to extract video title')
2397                         return
2398                 video_title = mobj.group(1).decode('utf-8')
2399
2400
2401                 # Extract video thumbnail
2402                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
2403                 if mobj is None:
2404                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2405                         return
2406                 video_thumbnail = mobj.group(1).decode('utf-8')
2407
2408                 info = {
2409                         'id': video_id,
2410                         'url': video_url,
2411                         'uploader': None,
2412                         'upload_date': None,
2413                         'title': video_title,
2414                         'stitle': simplify_title(video_title),
2415                         'ext': 'flv',
2416                         'format': 'flv',
2417                         'thumbnail': video_thumbnail,
2418                         'description': None,
2419                         'player_url': None,
2420                 }
2421
2422                 return [info]
2423
2424
2425 class SoundcloudIE(InfoExtractor):
2426         """Information extractor for soundcloud.com
2427            To access the media, the uid of the song and a stream token
2428            must be extracted from the page source and the script must make
2429            a request to media.soundcloud.com/crossdomain.xml. Then
2430            the media can be grabbed by requesting from an url composed
2431            of the stream token and uid
2432          """
2433
2434         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2435         IE_NAME = u'soundcloud'
2436
2437         def __init__(self, downloader=None):
2438                 InfoExtractor.__init__(self, downloader)
2439
2440         def report_webpage(self, video_id):
2441                 """Report information extraction."""
2442                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2443
2444         def report_extraction(self, video_id):
2445                 """Report information extraction."""
2446                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2447
2448         def _real_extract(self, url):
2449                 mobj = re.match(self._VALID_URL, url)
2450                 if mobj is None:
2451                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2452                         return
2453
2454                 # extract uploader (which is in the url)
2455                 uploader = mobj.group(1).decode('utf-8')
2456                 # extract simple title (uploader + slug of song title)
2457                 slug_title =  mobj.group(2).decode('utf-8')
2458                 simple_title = uploader + '-' + slug_title
2459
2460                 self.report_webpage('%s/%s' % (uploader, slug_title))
2461
2462                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2463                 try:
2464                         webpage = urllib2.urlopen(request).read()
2465                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2466                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2467                         return
2468
2469                 self.report_extraction('%s/%s' % (uploader, slug_title))
2470
2471                 # extract uid and stream token that soundcloud hands out for access
2472                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2473                 if mobj:
2474                         video_id = mobj.group(1)
2475                         stream_token = mobj.group(2)
2476
2477                 # extract unsimplified title
2478                 mobj = re.search('"title":"(.*?)",', webpage)
2479                 if mobj:
2480                         title = mobj.group(1)
2481
2482                 # construct media url (with uid/token)
2483                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2484                 mediaURL = mediaURL % (video_id, stream_token)
2485
2486                 # description
2487                 description = u'No description available'
2488                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2489                 if mobj:
2490                         description = mobj.group(1)
2491                 
2492                 # upload date
2493                 upload_date = None
2494                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2495                 if mobj:
2496                         try:
2497                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2498                         except Exception, e:
2499                                 print str(e)
2500
2501                 # for soundcloud, a request to a cross domain is required for cookies
2502                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2503
2504                 return [{
2505                         'id':           video_id.decode('utf-8'),
2506                         'url':          mediaURL,
2507                         'uploader':     uploader.decode('utf-8'),
2508                         'upload_date':  upload_date,
2509                         'title':        simple_title.decode('utf-8'),
2510                         'stitle':       simple_title.decode('utf-8'),
2511                         'ext':          u'mp3',
2512                         'format':       u'NA',
2513                         'player_url':   None,
2514                         'description': description.decode('utf-8')
2515                 }]
2516
2517
2518 class InfoQIE(InfoExtractor):
2519         """Information extractor for infoq.com"""
2520
2521         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2522         IE_NAME = u'infoq'
2523
2524         def report_webpage(self, video_id):
2525                 """Report information extraction."""
2526                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2527
2528         def report_extraction(self, video_id):
2529                 """Report information extraction."""
2530                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2531
2532         def _real_extract(self, url):
2533                 mobj = re.match(self._VALID_URL, url)
2534                 if mobj is None:
2535                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2536                         return
2537
2538                 self.report_webpage(url)
2539
2540                 request = urllib2.Request(url)
2541                 try:
2542                         webpage = urllib2.urlopen(request).read()
2543                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2544                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2545                         return
2546
2547                 self.report_extraction(url)
2548
2549
2550                 # Extract video URL
2551                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2552                 if mobj is None:
2553                         self._downloader.trouble(u'ERROR: unable to extract video url')
2554                         return
2555                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2556
2557
2558                 # Extract title
2559                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2560                 if mobj is None:
2561                         self._downloader.trouble(u'ERROR: unable to extract video title')
2562                         return
2563                 video_title = mobj.group(1).decode('utf-8')
2564
2565                 # Extract description
2566                 video_description = u'No description available.'
2567                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2568                 if mobj is not None:
2569                         video_description = mobj.group(1).decode('utf-8')
2570
2571                 video_filename = video_url.split('/')[-1]
2572                 video_id, extension = video_filename.split('.')
2573
2574                 info = {
2575                         'id': video_id,
2576                         'url': video_url,
2577                         'uploader': None,
2578                         'upload_date': None,
2579                         'title': video_title,
2580                         'stitle': simplify_title(video_title),
2581                         'ext': extension,
2582                         'format': extension, # Extension is always(?) mp4, but seems to be flv
2583                         'thumbnail': None,
2584                         'description': video_description,
2585                         'player_url': None,
2586                 }
2587
2588                 return [info]
2589
2590 class MixcloudIE(InfoExtractor):
2591         """Information extractor for www.mixcloud.com"""
2592         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2593         IE_NAME = u'mixcloud'
2594
2595         def __init__(self, downloader=None):
2596                 InfoExtractor.__init__(self, downloader)
2597
2598         def report_download_json(self, file_id):
2599                 """Report JSON download."""
2600                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2601
2602         def report_extraction(self, file_id):
2603                 """Report information extraction."""
2604                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2605
2606         def get_urls(self, jsonData, fmt, bitrate='best'):
2607                 """Get urls from 'audio_formats' section in json"""
2608                 file_url = None
2609                 try:
2610                         bitrate_list = jsonData[fmt]
2611                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2612                                 bitrate = max(bitrate_list) # select highest
2613
2614                         url_list = jsonData[fmt][bitrate]
2615                 except TypeError: # we have no bitrate info.
2616                         url_list = jsonData[fmt]
2617                                 
2618                 return url_list
2619
2620         def check_urls(self, url_list):
2621                 """Returns 1st active url from list"""
2622                 for url in url_list:
2623                         try:
2624                                 urllib2.urlopen(url)
2625                                 return url
2626                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2627                                 url = None
2628
2629                 return None
2630
2631         def _print_formats(self, formats):
2632                 print 'Available formats:'
2633                 for fmt in formats.keys():
2634                         for b in formats[fmt]:
2635                                 try:
2636                                         ext = formats[fmt][b][0]
2637                                         print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2638                                 except TypeError: # we have no bitrate info
2639                                         ext = formats[fmt][0]
2640                                         print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2641                                         break
2642
2643         def _real_extract(self, url):
2644                 mobj = re.match(self._VALID_URL, url)
2645                 if mobj is None:
2646                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2647                         return
2648                 # extract uploader & filename from url
2649                 uploader = mobj.group(1).decode('utf-8')
2650                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2651
2652                 # construct API request
2653                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2654                 # retrieve .json file with links to files
2655                 request = urllib2.Request(file_url)
2656                 try:
2657                         self.report_download_json(file_url)
2658                         jsonData = urllib2.urlopen(request).read()
2659                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2660                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2661                         return
2662
2663                 # parse JSON
2664                 json_data = json.loads(jsonData)
2665                 player_url = json_data['player_swf_url']
2666                 formats = dict(json_data['audio_formats'])
2667
2668                 req_format = self._downloader.params.get('format', None)
2669                 bitrate = None
2670
2671                 if self._downloader.params.get('listformats', None):
2672                         self._print_formats(formats)
2673                         return
2674
2675                 if req_format is None or req_format == 'best':
2676                         for format_param in formats.keys():
2677                                 url_list = self.get_urls(formats, format_param)
2678                                 # check urls
2679                                 file_url = self.check_urls(url_list)
2680                                 if file_url is not None:
2681                                         break # got it!
2682                 else:
2683                         if req_format not in formats.keys():
2684                                 self._downloader.trouble(u'ERROR: format is not available')
2685                                 return
2686
2687                         url_list = self.get_urls(formats, req_format)
2688                         file_url = self.check_urls(url_list)
2689                         format_param = req_format
2690
2691                 return [{
2692                         'id': file_id.decode('utf-8'),
2693                         'url': file_url.decode('utf-8'),
2694                         'uploader':     uploader.decode('utf-8'),
2695                         'upload_date': u'NA',
2696                         'title': json_data['name'],
2697                         'stitle': simplify_title(json_data['name']),
2698                         'ext': file_url.split('.')[-1].decode('utf-8'),
2699                         'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2700                         'thumbnail': json_data['thumbnail_url'],
2701                         'description': json_data['description'],
2702                         'player_url': player_url.decode('utf-8'),
2703                 }]
2704
2705 class StanfordOpenClassroomIE(InfoExtractor):
2706         """Information extractor for Stanford's Open ClassRoom"""
2707
2708         _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2709         IE_NAME = u'stanfordoc'
2710
2711         def report_download_webpage(self, objid):
2712                 """Report information extraction."""
2713                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2714
2715         def report_extraction(self, video_id):
2716                 """Report information extraction."""
2717                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2718
2719         def _real_extract(self, url):
2720                 mobj = re.match(self._VALID_URL, url)
2721                 if mobj is None:
2722                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2723                         return
2724
2725                 if mobj.group('course') and mobj.group('video'): # A specific video
2726                         course = mobj.group('course')
2727                         video = mobj.group('video')
2728                         info = {
2729                                 'id': simplify_title(course + '_' + video),
2730                         }
2731         
2732                         self.report_extraction(info['id'])
2733                         baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2734                         xmlUrl = baseUrl + video + '.xml'
2735                         try:
2736                                 metaXml = urllib2.urlopen(xmlUrl).read()
2737                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2738                                 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2739                                 return
2740                         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2741                         try:
2742                                 info['title'] = mdoc.findall('./title')[0].text
2743                                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2744                         except IndexError:
2745                                 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2746                                 return
2747                         info['stitle'] = simplify_title(info['title'])
2748                         info['ext'] = info['url'].rpartition('.')[2]
2749                         info['format'] = info['ext']
2750                         return [info]
2751                 elif mobj.group('course'): # A course page
2752                         course = mobj.group('course')
2753                         info = {
2754                                 'id': simplify_title(course),
2755                                 'type': 'playlist',
2756                         }
2757
2758                         self.report_download_webpage(info['id'])
2759                         try:
2760                                 coursepage = urllib2.urlopen(url).read()
2761                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2762                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2763                                 return
2764
2765                         m = re.search('<h1>([^<]+)</h1>', coursepage)
2766                         if m:
2767                                 info['title'] = unescapeHTML(m.group(1))
2768                         else:
2769                                 info['title'] = info['id']
2770                         info['stitle'] = simplify_title(info['title'])
2771
2772                         m = re.search('<description>([^<]+)</description>', coursepage)
2773                         if m:
2774                                 info['description'] = unescapeHTML(m.group(1))
2775
2776                         links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2777                         info['list'] = [
2778                                 {
2779                                         'type': 'reference',
2780                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2781                                 }
2782                                         for vpage in links]
2783                         results = []
2784                         for entry in info['list']:
2785                                 assert entry['type'] == 'reference'
2786                                 results += self.extract(entry['url'])
2787                         return results
2788                         
2789                 else: # Root page
2790                         info = {
2791                                 'id': 'Stanford OpenClassroom',
2792                                 'type': 'playlist',
2793                         }
2794
2795                         self.report_download_webpage(info['id'])
2796                         rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2797                         try:
2798                                 rootpage = urllib2.urlopen(rootURL).read()
2799                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2800                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2801                                 return
2802
2803                         info['title'] = info['id']
2804                         info['stitle'] = simplify_title(info['title'])
2805
2806                         links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2807                         info['list'] = [
2808                                 {
2809                                         'type': 'reference',
2810                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2811                                 }
2812                                         for cpage in links]
2813
2814                         results = []
2815                         for entry in info['list']:
2816                                 assert entry['type'] == 'reference'
2817                                 results += self.extract(entry['url'])
2818                         return results
2819
2820 class MTVIE(InfoExtractor):
2821         """Information extractor for MTV.com"""
2822
2823         _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2824         IE_NAME = u'mtv'
2825
2826         def report_webpage(self, video_id):
2827                 """Report information extraction."""
2828                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2829
2830         def report_extraction(self, video_id):
2831                 """Report information extraction."""
2832                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2833
2834         def _real_extract(self, url):
2835                 mobj = re.match(self._VALID_URL, url)
2836                 if mobj is None:
2837                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2838                         return
2839                 if not mobj.group('proto'):
2840                         url = 'http://' + url
2841                 video_id = mobj.group('videoid')
2842                 self.report_webpage(video_id)
2843
2844                 request = urllib2.Request(url)
2845                 try:
2846                         webpage = urllib2.urlopen(request).read()
2847                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2848                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2849                         return
2850
2851                 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2852                 if mobj is None:
2853                         self._downloader.trouble(u'ERROR: unable to extract song name')
2854                         return
2855                 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2856                 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2857                 if mobj is None:
2858                         self._downloader.trouble(u'ERROR: unable to extract performer')
2859                         return
2860                 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2861                 video_title = performer + ' - ' + song_name 
2862
2863                 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2864                 if mobj is None:
2865                         self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2866                         return
2867                 mtvn_uri = mobj.group(1)
2868
2869                 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2870                 if mobj is None:
2871                         self._downloader.trouble(u'ERROR: unable to extract content id')
2872                         return
2873                 content_id = mobj.group(1)
2874
2875                 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2876                 self.report_extraction(video_id)
2877                 request = urllib2.Request(videogen_url)
2878                 try:
2879                         metadataXml = urllib2.urlopen(request).read()
2880                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2881                         self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2882                         return
2883
2884                 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2885                 renditions = mdoc.findall('.//rendition')
2886
2887                 # For now, always pick the highest quality.
2888                 rendition = renditions[-1]
2889
2890                 try:
2891                         _,_,ext = rendition.attrib['type'].partition('/')
2892                         format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2893                         video_url = rendition.find('./src').text
2894                 except KeyError:
2895                         self._downloader.trouble('Invalid rendition field.')
2896                         return
2897
2898                 info = {
2899                         'id': video_id,
2900                         'url': video_url,
2901                         'uploader': performer,
2902                         'title': video_title,
2903                         'stitle': simplify_title(video_title),
2904                         'ext': ext,
2905                         'format': format,
2906                 }
2907
2908                 return [info]