Reorganized title sanitization: 'title' now holds the untouched (unsanitized) title.
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import datetime
5 import HTMLParser
6 import httplib
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import urllib
13 import urllib2
14 import email.utils
15 import xml.etree.ElementTree
16 from urlparse import parse_qs
17
18 try:
19         import cStringIO as StringIO
20 except ImportError:
21         import StringIO
22
23 from utils import *
24
25
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor takes a URL and pulls out the data needed to
	download the video (or videos) it points to: the real media URL, the
	title, the uploader nickname and so on.  The result is a dictionary
	handed to the FileDownloader, which may then download the video, print
	metadata, or take other actions.  Mandatory dictionary fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	Optional fields, used mainly when youtube-dl serves as the backend of a
	video search frontend (such as youtube2mp3); they are printed only by
	their respective forced-printing options:

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses must define a _VALID_URL regexp and redefine the
	_real_initialize() and _real_extract() methods; they should normally
	also be added to the list of extractors.
	"""

	# Class-level defaults; instances overwrite these via __init__/initialize().
	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Create the extractor, optionally attaching a downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def set_downloader(self, downloader):
		"""Attach the FileDownloader this extractor reports to."""
		self._downloader = downloader

	def suitable(self, url):
		"""Return True if this extractor can handle the given URL."""
		return bool(re.match(self._VALID_URL, url))

	def initialize(self):
		"""Perform one-time setup (authentication, etc.); idempotent."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Initialize if needed, then return the extracted list of info dicts."""
		self.initialize()
		return self._real_extract(url)

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
93
94
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
	_available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
	# itag -> file extension used for the downloaded file
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'44': 'webm',
		'45': 'webm',
		'46': 'webm',
	}
	# itag -> dimensions shown by _print_formats (--list-formats)
	_video_dimensions = {
		'5': '240x400',
		'6': '???',
		'13': '???',
		'17': '144x176',
		'18': '360x640',
		'22': '720x1280',
		'34': '360x640',
		'35': '480x854',
		'37': '1080x1920',
		'38': '3072x4096',
		'43': '360x640',
		'44': '480x854',
		'45': '720x1280',
		'46': '1080x1920',
	}
	IE_NAME = u'youtube'

	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_screen(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_screen(u'[youtube] Logging in')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[youtube] Confirming age')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

	def report_video_subtitles_download(self, video_id):
		"""Report attempt to download video subtitles."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

	def report_unavailable_format(self, video_id, format):
		"""Report that the requested format is not available."""
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_screen(u'[youtube] RTMP download detected')

	def _closed_captions_xml_to_srt(self, xml_string):
		"""Convert YouTube's timedtext XML into SubRip (.srt) formatted text."""
		srt = ''
		texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
		# TODO parse xml instead of regex
		for n, (start, dur_tag, dur, caption) in enumerate(texts):
			if not dur: dur = '4'  # fallback duration (seconds) when the tag has no dur attribute
			start = float(start)
			end = start + float(dur)
			# hh:mm:ss,mmm timestamps as required by the SRT format
			start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
			end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
			caption = unescapeHTML(caption)
			caption = unescapeHTML(caption) # double cycle, intentional -- presumably entities arrive double-escaped; TODO confirm
			# NOTE(review): SRT cue numbering conventionally starts at 1; this emits 0-based indices
			srt += str(n) + '\n'
			srt += start + ' --> ' + end + '\n'
			srt += caption + '\n\n'
		return srt

	def _print_formats(self, formats):
		"""Print each available itag with its extension and dimensions."""
		print 'Available formats:'
		for x in formats:
			print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

	def _real_initialize(self):
		"""Set language on youtube.com, optionally log in, and confirm age."""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present, the credentials were rejected
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract info for a youtube.com URL; returns a list of info dicts,
		one per selected format (see class docstring of InfoExtractor)."""
		# Extract original video URL from URL with redirection, like age verification, using next_url parameter
		mobj = re.search(self._NEXT_URL_RE, url)
		if mobj:
			url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
		try:
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Attempt to extract SWF player URL
		mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			# Undo the JavaScript escaping (\/ -> /) in the matched URL
			player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
		else:
			player_url = None

		# Get video info
		self.report_video_info_webpage_download(video_id)
		# Try several 'el' contexts in turn; some succeed where others are rejected
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					% (video_id, el_type))
			request = urllib2.Request(video_info_url)
			try:
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
					break
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
		if 'token' not in video_info:
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
			else:
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')

		# thumbnail image
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# upload date
		upload_date = u'NA'
		mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
		if mobj is not None:
			# Normalize date separators to spaces before trying known formats
			upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
			format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
			for expression in format_expressions:
				try:
					upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
				except:
					# NOTE(review): bare except and no break on success -- after a
					# format matches, the remaining iterations re-parse the already
					# reformatted value and silently fail; works, but would be
					# clearer as "except ValueError" plus a break
					pass

		# description
		video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
		if video_description: video_description = clean_html(video_description)
		else: video_description = ''

		# closed captions
		video_subtitles = None
		if self._downloader.params.get('writesubtitles', False):
			try:
				self.report_video_subtitles_download(video_id)
				request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
				try:
					srt_list = urllib2.urlopen(request).read()
				except (urllib2.URLError, httplib.HTTPException, socket.error), err:
					raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
				srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
				if not srt_lang_list:
					raise Trouble(u'WARNING: video has no closed captions')
				# Pick the requested language, else English, else the first listed
				if self._downloader.params.get('subtitleslang', False):
					srt_lang = self._downloader.params.get('subtitleslang')
				elif 'en' in srt_lang_list:
					srt_lang = 'en'
				else:
					srt_lang = srt_lang_list[0]
				if not srt_lang in srt_lang_list:
					raise Trouble(u'WARNING: no closed captions found in the specified language')
				request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
				try:
					srt_xml = urllib2.urlopen(request).read()
				except (urllib2.URLError, httplib.HTTPException, socket.error), err:
					raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
				video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
			except Trouble as trouble:
				# Subtitle failures are reported as warnings; extraction continues
				self._downloader.trouble(trouble[0])

		# token
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		req_format = self._downloader.params.get('format', None)

		if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]
		elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
			url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
			url_data = [parse_qs(uds) for uds in url_data_strs]
			url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
			url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

			format_limit = self._downloader.params.get('format_limit', None)
			available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
			if format_limit is not None and format_limit in available_formats:
				format_list = available_formats[available_formats.index(format_limit):]
			else:
				format_list = available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if self._downloader.params.get('listformats', None):
				self._print_formats(existing_formats)
				return
			if req_format is None or req_format == 'best':
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == 'worst':
				video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
			elif req_format in ('-1', 'all'):
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			else:
				# Specific formats. We pick the first in a slash-delimited sequence.
				# For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
				req_formats = req_format.split('/')
				video_url_list = None
				for rf in req_formats:
					if rf in url_map:
						video_url_list = [(rf, url_map[rf])]
						break
				if video_url_list is None:
					self._downloader.trouble(u'ERROR: requested format not available')
					return
		else:
			self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
			return

		results = []
		for format_param, video_real_url in video_url_list:
			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			results.append({
				'id':		video_id.decode('utf-8'),
				'url':		video_real_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	upload_date,
				'title':	video_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
				'thumbnail':	video_thumbnail.decode('utf-8'),
				'description':	video_description,
				'player_url':	player_url,
				'subtitles':	video_subtitles
			})
		return results
453
454
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	IE_NAME = u'metacafe'

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		InfoExtractor.__init__(self, downloader)

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""Fetch the family-filter disclaimer page and POST an age confirmation."""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER)
		try:
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
			return

		# Confirm age
		disclaimer_form = {
			'filters': '0',
			'submit': "Continue - I'm over 18",
			}
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
		try:
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract info for a metacafe.com URL; returns a one-element list of info dicts."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Check if video comes from YouTube
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			# Delegate YouTube-hosted videos back to the downloader (and thus YoutubeIE)
			self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
			return

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			mediaURL = urllib.unquote(mobj.group(1))
			video_extension = mediaURL[-3:]

			# Extract gdaKey if available
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
			if mobj is None:
				video_url = mediaURL
			else:
				gdaKey = mobj.group(1)
				video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
		else:
			# Fallback: the media URL lives inside the flashvars query string
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mediaURL = mobj.group(1).replace('\\/', '/')
			video_extension = mediaURL[-3:]
			video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		return [{
			'id':		video_id.decode('utf-8'),
			'url':		video_url.decode('utf-8'),
			'uploader':	video_uploader.decode('utf-8'),
			'upload_date':	u'NA',
			'title':	video_title,
			'ext':		video_extension.decode('utf-8'),
			'format':	u'NA',
			'player_url':	None,
		}]
582
583
584 class DailymotionIE(InfoExtractor):
585         """Information Extractor for Dailymotion"""
586
587         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
588         IE_NAME = u'dailymotion'
589
590         def __init__(self, downloader=None):
591                 InfoExtractor.__init__(self, downloader)
592
593         def report_download_webpage(self, video_id):
594                 """Report webpage download."""
595                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
596
597         def report_extraction(self, video_id):
598                 """Report information extraction."""
599                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
600
601         def _real_extract(self, url):
602                 # Extract id and simplified title from URL
603                 mobj = re.match(self._VALID_URL, url)
604                 if mobj is None:
605                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
606                         return
607
608                 video_id = mobj.group(1)
609
610                 video_extension = 'flv'
611
612                 # Retrieve video webpage to extract further information
613                 request = urllib2.Request(url)
614                 request.add_header('Cookie', 'family_filter=off')
615                 try:
616                         self.report_download_webpage(video_id)
617                         webpage = urllib2.urlopen(request).read()
618                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
619                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
620                         return
621
622                 # Extract URL, uploader and title from webpage
623                 self.report_extraction(video_id)
624                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
625                 if mobj is None:
626                         self._downloader.trouble(u'ERROR: unable to extract media URL')
627                         return
628                 sequence = urllib.unquote(mobj.group(1))
629                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
630                 if mobj is None:
631                         self._downloader.trouble(u'ERROR: unable to extract media URL')
632                         return
633                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
634
635                 # if needed add http://www.dailymotion.com/ if relative URL
636
637                 video_url = mediaURL
638
639                 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
640                 if mobj is None:
641                         self._downloader.trouble(u'ERROR: unable to extract title')
642                         return
643                 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
644
645                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
646                 if mobj is None:
647                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
648                         return
649                 video_uploader = mobj.group(1)
650
651                 return [{
652                         'id':           video_id.decode('utf-8'),
653                         'url':          video_url.decode('utf-8'),
654                         'uploader':     video_uploader.decode('utf-8'),
655                         'upload_date':  u'NA',
656                         'title':        video_title,
657                         'ext':          video_extension.decode('utf-8'),
658                         'format':       u'NA',
659                         'player_url':   None,
660                 }]
661
662
663 class GoogleIE(InfoExtractor):
664         """Information extractor for video.google.com."""
665
666         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
667         IE_NAME = u'video.google'
668
669         def __init__(self, downloader=None):
670                 InfoExtractor.__init__(self, downloader)
671
672         def report_download_webpage(self, video_id):
673                 """Report webpage download."""
674                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
675
676         def report_extraction(self, video_id):
677                 """Report information extraction."""
678                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
679
680         def _real_extract(self, url):
681                 # Extract id from URL
682                 mobj = re.match(self._VALID_URL, url)
683                 if mobj is None:
684                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
685                         return
686
687                 video_id = mobj.group(1)
688
689                 video_extension = 'mp4'
690
691                 # Retrieve video webpage to extract further information
692                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
693                 try:
694                         self.report_download_webpage(video_id)
695                         webpage = urllib2.urlopen(request).read()
696                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
697                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
698                         return
699
700                 # Extract URL, uploader, and title from webpage
701                 self.report_extraction(video_id)
702                 mobj = re.search(r"download_url:'([^']+)'", webpage)
703                 if mobj is None:
704                         video_extension = 'flv'
705                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
706                 if mobj is None:
707                         self._downloader.trouble(u'ERROR: unable to extract media URL')
708                         return
709                 mediaURL = urllib.unquote(mobj.group(1))
710                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
711                 mediaURL = mediaURL.replace('\\x26', '\x26')
712
713                 video_url = mediaURL
714
715                 mobj = re.search(r'<title>(.*)</title>', webpage)
716                 if mobj is None:
717                         self._downloader.trouble(u'ERROR: unable to extract title')
718                         return
719                 video_title = mobj.group(1).decode('utf-8')
720
721                 # Extract video description
722                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
723                 if mobj is None:
724                         self._downloader.trouble(u'ERROR: unable to extract video description')
725                         return
726                 video_description = mobj.group(1).decode('utf-8')
727                 if not video_description:
728                         video_description = 'No description available.'
729
730                 # Extract video thumbnail
731                 if self._downloader.params.get('forcethumbnail', False):
732                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
733                         try:
734                                 webpage = urllib2.urlopen(request).read()
735                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
736                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
737                                 return
738                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
739                         if mobj is None:
740                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
741                                 return
742                         video_thumbnail = mobj.group(1)
743                 else:   # we need something to pass to process_info
744                         video_thumbnail = ''
745
746                 return [{
747                         'id':           video_id.decode('utf-8'),
748                         'url':          video_url.decode('utf-8'),
749                         'uploader':     u'NA',
750                         'upload_date':  u'NA',
751                         'title':        video_title,
752                         'ext':          video_extension.decode('utf-8'),
753                         'format':       u'NA',
754                         'player_url':   None,
755                 }]
756
757
758 class PhotobucketIE(InfoExtractor):
759         """Information extractor for photobucket.com."""
760
761         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
762         IE_NAME = u'photobucket'
763
764         def __init__(self, downloader=None):
765                 InfoExtractor.__init__(self, downloader)
766
767         def report_download_webpage(self, video_id):
768                 """Report webpage download."""
769                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
770
771         def report_extraction(self, video_id):
772                 """Report information extraction."""
773                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
774
775         def _real_extract(self, url):
776                 # Extract id from URL
777                 mobj = re.match(self._VALID_URL, url)
778                 if mobj is None:
779                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
780                         return
781
782                 video_id = mobj.group(1)
783
784                 video_extension = 'flv'
785
786                 # Retrieve video webpage to extract further information
787                 request = urllib2.Request(url)
788                 try:
789                         self.report_download_webpage(video_id)
790                         webpage = urllib2.urlopen(request).read()
791                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
792                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
793                         return
794
795                 # Extract URL, uploader, and title from webpage
796                 self.report_extraction(video_id)
797                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
798                 if mobj is None:
799                         self._downloader.trouble(u'ERROR: unable to extract media URL')
800                         return
801                 mediaURL = urllib.unquote(mobj.group(1))
802
803                 video_url = mediaURL
804
805                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
806                 if mobj is None:
807                         self._downloader.trouble(u'ERROR: unable to extract title')
808                         return
809                 video_title = mobj.group(1).decode('utf-8')
810
811                 video_uploader = mobj.group(2).decode('utf-8')
812
813                 return [{
814                         'id':           video_id.decode('utf-8'),
815                         'url':          video_url.decode('utf-8'),
816                         'uploader':     video_uploader,
817                         'upload_date':  u'NA',
818                         'title':        video_title,
819                         'ext':          video_extension.decode('utf-8'),
820                         'format':       u'NA',
821                         'player_url':   None,
822                 }]
823
824
825 class YahooIE(InfoExtractor):
826         """Information extractor for video.yahoo.com."""
827
828         # _VALID_URL matches all Yahoo! Video URLs
829         # _VPAGE_URL matches only the extractable '/watch/' URLs
830         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
831         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
832         IE_NAME = u'video.yahoo'
833
834         def __init__(self, downloader=None):
835                 InfoExtractor.__init__(self, downloader)
836
837         def report_download_webpage(self, video_id):
838                 """Report webpage download."""
839                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
840
841         def report_extraction(self, video_id):
842                 """Report information extraction."""
843                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
844
845         def _real_extract(self, url, new_video=True):
846                 # Extract ID from URL
847                 mobj = re.match(self._VALID_URL, url)
848                 if mobj is None:
849                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
850                         return
851
852                 video_id = mobj.group(2)
853                 video_extension = 'flv'
854
855                 # Rewrite valid but non-extractable URLs as
856                 # extractable English language /watch/ URLs
857                 if re.match(self._VPAGE_URL, url) is None:
858                         request = urllib2.Request(url)
859                         try:
860                                 webpage = urllib2.urlopen(request).read()
861                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
862                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
863                                 return
864
865                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
866                         if mobj is None:
867                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
868                                 return
869                         yahoo_id = mobj.group(1)
870
871                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
872                         if mobj is None:
873                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
874                                 return
875                         yahoo_vid = mobj.group(1)
876
877                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
878                         return self._real_extract(url, new_video=False)
879
880                 # Retrieve video webpage to extract further information
881                 request = urllib2.Request(url)
882                 try:
883                         self.report_download_webpage(video_id)
884                         webpage = urllib2.urlopen(request).read()
885                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
886                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
887                         return
888
889                 # Extract uploader and title from webpage
890                 self.report_extraction(video_id)
891                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
892                 if mobj is None:
893                         self._downloader.trouble(u'ERROR: unable to extract video title')
894                         return
895                 video_title = mobj.group(1).decode('utf-8')
896
897                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
898                 if mobj is None:
899                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
900                         return
901                 video_uploader = mobj.group(1).decode('utf-8')
902
903                 # Extract video thumbnail
904                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
905                 if mobj is None:
906                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
907                         return
908                 video_thumbnail = mobj.group(1).decode('utf-8')
909
910                 # Extract video description
911                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
912                 if mobj is None:
913                         self._downloader.trouble(u'ERROR: unable to extract video description')
914                         return
915                 video_description = mobj.group(1).decode('utf-8')
916                 if not video_description:
917                         video_description = 'No description available.'
918
919                 # Extract video height and width
920                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
921                 if mobj is None:
922                         self._downloader.trouble(u'ERROR: unable to extract video height')
923                         return
924                 yv_video_height = mobj.group(1)
925
926                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
927                 if mobj is None:
928                         self._downloader.trouble(u'ERROR: unable to extract video width')
929                         return
930                 yv_video_width = mobj.group(1)
931
932                 # Retrieve video playlist to extract media URL
933                 # I'm not completely sure what all these options are, but we
934                 # seem to need most of them, otherwise the server sends a 401.
935                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
936                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
937                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
938                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
939                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
940                 try:
941                         self.report_download_webpage(video_id)
942                         webpage = urllib2.urlopen(request).read()
943                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
944                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
945                         return
946
947                 # Extract media URL from playlist XML
948                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
949                 if mobj is None:
950                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
951                         return
952                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
953                 video_url = unescapeHTML(video_url)
954
955                 return [{
956                         'id':           video_id.decode('utf-8'),
957                         'url':          video_url,
958                         'uploader':     video_uploader,
959                         'upload_date':  u'NA',
960                         'title':        video_title,
961                         'ext':          video_extension.decode('utf-8'),
962                         'thumbnail':    video_thumbnail.decode('utf-8'),
963                         'description':  video_description,
964                         'thumbnail':    video_thumbnail,
965                         'player_url':   None,
966                 }]
967
968
969 class VimeoIE(InfoExtractor):
970         """Information extractor for vimeo.com."""
971
972         # _VALID_URL matches Vimeo URLs
973         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
974         IE_NAME = u'vimeo'
975
976         def __init__(self, downloader=None):
977                 InfoExtractor.__init__(self, downloader)
978
979         def report_download_webpage(self, video_id):
980                 """Report webpage download."""
981                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
982
983         def report_extraction(self, video_id):
984                 """Report information extraction."""
985                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
986
987         def _real_extract(self, url, new_video=True):
988                 # Extract ID from URL
989                 mobj = re.match(self._VALID_URL, url)
990                 if mobj is None:
991                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
992                         return
993
994                 video_id = mobj.group(1)
995
996                 # Retrieve video webpage to extract further information
997                 request = urllib2.Request(url, None, std_headers)
998                 try:
999                         self.report_download_webpage(video_id)
1000                         webpage = urllib2.urlopen(request).read()
1001                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1002                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1003                         return
1004
1005                 # Now we begin extracting as much information as we can from what we
1006                 # retrieved. First we extract the information common to all extractors,
1007                 # and latter we extract those that are Vimeo specific.
1008                 self.report_extraction(video_id)
1009
1010                 # Extract the config JSON
1011                 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1012                 try:
1013                         config = json.loads(config)
1014                 except:
1015                         self._downloader.trouble(u'ERROR: unable to extract info section')
1016                         return
1017                 
1018                 # Extract title
1019                 video_title = config["video"]["title"]
1020
1021                 # Extract uploader
1022                 video_uploader = config["video"]["owner"]["name"]
1023
1024                 # Extract video thumbnail
1025                 video_thumbnail = config["video"]["thumbnail"]
1026
1027                 # Extract video description
1028                 video_description = get_element_by_id("description", webpage.decode('utf8'))
1029                 if video_description: video_description = clean_html(video_description)
1030                 else: video_description = ''
1031
1032                 # Extract upload date
1033                 video_upload_date = u'NA'
1034                 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1035                 if mobj is not None:
1036                         video_upload_date = mobj.group(1)
1037
1038                 # Vimeo specific: extract request signature and timestamp
1039                 sig = config['request']['signature']
1040                 timestamp = config['request']['timestamp']
1041
1042                 # Vimeo specific: extract video codec and quality information
1043                 # TODO bind to format param
1044                 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1045                 for codec in codecs:
1046                         if codec[0] in config["video"]["files"]:
1047                                 video_codec = codec[0]
1048                                 video_extension = codec[1]
1049                                 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1050                                 else: quality = 'sd'
1051                                 break
1052                 else:
1053                         self._downloader.trouble(u'ERROR: no known codec found')
1054                         return
1055
1056                 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1057                                         %(video_id, sig, timestamp, quality, video_codec.upper())
1058
1059                 return [{
1060                         'id':           video_id,
1061                         'url':          video_url,
1062                         'uploader':     video_uploader,
1063                         'upload_date':  video_upload_date,
1064                         'title':        video_title,
1065                         'ext':          video_extension,
1066                         'thumbnail':    video_thumbnail,
1067                         'description':  video_description,
1068                         'player_url':   None,
1069                 }]
1070
1071
1072 class GenericIE(InfoExtractor):
1073         """Generic last-resort information extractor."""
1074
1075         _VALID_URL = r'.*'
1076         IE_NAME = u'generic'
1077
	def __init__(self, downloader=None):
		# Plain delegation to the shared InfoExtractor constructor.
		InfoExtractor.__init__(self, downloader)
1080
1081         def report_download_webpage(self, video_id):
1082                 """Report webpage download."""
1083                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1084                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1085
1086         def report_extraction(self, video_id):
1087                 """Report information extraction."""
1088                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1089
1090         def report_following_redirect(self, new_url):
1091                 """Report information extraction."""
1092                 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1093                 
	def _test_redirect(self, url):
		"""Check if it is a redirect, like url shorteners, in case restart chain.

		Issues a HEAD request (following redirects with HEAD as well) and,
		if the final URL differs from the input, hands the new URL back to
		the downloader and returns True; otherwise returns False.
		"""
		class HeadRequest(urllib2.Request):
			# Force urllib2 to send HEAD instead of GET: only the final
			# URL matters here, not the body.
			def get_method(self):
				return "HEAD"

		class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
			"""
			Subclass the HTTPRedirectHandler to make it use our
			HeadRequest also on the redirected URL.
			"""
			def redirect_request(self, req, fp, code, msg, headers, newurl):
				if code in (301, 302, 303, 307):
					# Spaces are not legal in a request target
					newurl = newurl.replace(' ', '%20')
					# Body-describing headers make no sense on a HEAD
					newheaders = dict((k,v) for k,v in req.headers.items()
									  if k.lower() not in ("content-length", "content-type"))
					return HeadRequest(newurl,
									   headers=newheaders,
									   origin_req_host=req.get_origin_req_host(),
									   unverifiable=True)
				else:
					# Any other status aborts the redirect chase
					raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)

		class HTTPMethodFallback(urllib2.BaseHandler):
			"""
			Fallback to GET if HEAD is not allowed (405 HTTP error).
			"""
			def http_error_405(self, req, fp, code, msg, headers):
				# Drain and close the 405 response before retrying
				fp.read()
				fp.close()

				newheaders = dict((k,v) for k,v in req.headers.items()
								  if k.lower() not in ("content-length", "content-type"))
				# Re-issue as a plain (GET) request through the same opener
				return self.parent.open(urllib2.Request(req.get_full_url(),
												 headers=newheaders,
												 origin_req_host=req.get_origin_req_host(),
												 unverifiable=True))

		# Build our opener with only the handlers we need, so our
		# redirect/fallback handlers are the ones in effect
		opener = urllib2.OpenerDirector()
		for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
						HTTPMethodFallback, HEADRedirectHandler,
						urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
			opener.add_handler(handler())

		response = opener.open(HeadRequest(url))
		new_url = response.geturl()

		# Same URL back means there was no redirect to follow
		if url == new_url: return False

		self.report_following_redirect(new_url)
		self._downloader.download([new_url])
		return True
1147
1148         def _real_extract(self, url):
1149                 if self._test_redirect(url): return
1150
1151                 video_id = url.split('/')[-1]
1152                 request = urllib2.Request(url)
1153                 try:
1154                         self.report_download_webpage(video_id)
1155                         webpage = urllib2.urlopen(request).read()
1156                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1157                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1158                         return
1159                 except ValueError, err:
1160                         # since this is the last-resort InfoExtractor, if
1161                         # this error is thrown, it'll be thrown here
1162                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1163                         return
1164
1165                 self.report_extraction(video_id)
1166                 # Start with something easy: JW Player in SWFObject
1167                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1168                 if mobj is None:
1169                         # Broaden the search a little bit
1170                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1171                 if mobj is None:
1172                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1173                         return
1174
1175                 # It's possible that one of the regexes
1176                 # matched, but returned an empty group:
1177                 if mobj.group(1) is None:
1178                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1179                         return
1180
1181                 video_url = urllib.unquote(mobj.group(1))
1182                 video_id = os.path.basename(video_url)
1183
1184                 # here's a fun little line of code for you:
1185                 video_extension = os.path.splitext(video_id)[1][1:]
1186                 video_id = os.path.splitext(video_id)[0]
1187
1188                 # it's tempting to parse this further, but you would
1189                 # have to take into account all the variations like
1190                 #   Video Title - Site Name
1191                 #   Site Name | Video Title
1192                 #   Video Title - Tagline | Site Name
1193                 # and so on and so forth; it's just not practical
1194                 mobj = re.search(r'<title>(.*)</title>', webpage)
1195                 if mobj is None:
1196                         self._downloader.trouble(u'ERROR: unable to extract title')
1197                         return
1198                 video_title = mobj.group(1).decode('utf-8')
1199
1200                 # video uploader is domain name
1201                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1202                 if mobj is None:
1203                         self._downloader.trouble(u'ERROR: unable to extract title')
1204                         return
1205                 video_uploader = mobj.group(1).decode('utf-8')
1206
1207                 return [{
1208                         'id':           video_id.decode('utf-8'),
1209                         'url':          video_url.decode('utf-8'),
1210                         'uploader':     video_uploader,
1211                         'upload_date':  u'NA',
1212                         'title':        video_title,
1213                         'ext':          video_extension.decode('utf-8'),
1214                         'format':       u'NA',
1215                         'player_url':   None,
1216                 }]
1217
1218
1219 class YoutubeSearchIE(InfoExtractor):
1220         """Information Extractor for YouTube search queries."""
1221         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1222         _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1223         _max_youtube_results = 1000
1224         IE_NAME = u'youtube:search'
1225
1226         def __init__(self, downloader=None):
1227                 InfoExtractor.__init__(self, downloader)
1228
1229         def report_download_page(self, query, pagenum):
1230                 """Report attempt to download playlist page with given number."""
1231                 query = query.decode(preferredencoding())
1232                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1233
1234         def _real_extract(self, query):
1235                 mobj = re.match(self._VALID_URL, query)
1236                 if mobj is None:
1237                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1238                         return
1239
1240                 prefix, query = query.split(':')
1241                 prefix = prefix[8:]
1242                 query = query.encode('utf-8')
1243                 if prefix == '':
1244                         self._download_n_results(query, 1)
1245                         return
1246                 elif prefix == 'all':
1247                         self._download_n_results(query, self._max_youtube_results)
1248                         return
1249                 else:
1250                         try:
1251                                 n = long(prefix)
1252                                 if n <= 0:
1253                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1254                                         return
1255                                 elif n > self._max_youtube_results:
1256                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1257                                         n = self._max_youtube_results
1258                                 self._download_n_results(query, n)
1259                                 return
1260                         except ValueError: # parsing prefix as integer fails
1261                                 self._download_n_results(query, 1)
1262                                 return
1263
1264         def _download_n_results(self, query, n):
1265                 """Downloads a specified number of results for a query"""
1266
1267                 video_ids = []
1268                 pagenum = 0
1269                 limit = n
1270
1271                 while (50 * pagenum) < limit:
1272                         self.report_download_page(query, pagenum+1)
1273                         result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1274                         request = urllib2.Request(result_url)
1275                         try:
1276                                 data = urllib2.urlopen(request).read()
1277                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1278                                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1279                                 return
1280                         api_response = json.loads(data)['data']
1281
1282                         new_ids = list(video['id'] for video in api_response['items'])
1283                         video_ids += new_ids
1284
1285                         limit = min(n, api_response['totalItems'])
1286                         pagenum += 1
1287
1288                 if len(video_ids) > n:
1289                         video_ids = video_ids[:n]
1290                 for id in video_ids:
1291                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1292                 return
1293
1294
1295 class GoogleSearchIE(InfoExtractor):
1296         """Information Extractor for Google Video search queries."""
1297         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1298         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1299         _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1300         _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1301         _max_google_results = 1000
1302         IE_NAME = u'video.google:search'
1303
1304         def __init__(self, downloader=None):
1305                 InfoExtractor.__init__(self, downloader)
1306
1307         def report_download_page(self, query, pagenum):
1308                 """Report attempt to download playlist page with given number."""
1309                 query = query.decode(preferredencoding())
1310                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1311
1312         def _real_extract(self, query):
1313                 mobj = re.match(self._VALID_URL, query)
1314                 if mobj is None:
1315                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1316                         return
1317
1318                 prefix, query = query.split(':')
1319                 prefix = prefix[8:]
1320                 query = query.encode('utf-8')
1321                 if prefix == '':
1322                         self._download_n_results(query, 1)
1323                         return
1324                 elif prefix == 'all':
1325                         self._download_n_results(query, self._max_google_results)
1326                         return
1327                 else:
1328                         try:
1329                                 n = long(prefix)
1330                                 if n <= 0:
1331                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1332                                         return
1333                                 elif n > self._max_google_results:
1334                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1335                                         n = self._max_google_results
1336                                 self._download_n_results(query, n)
1337                                 return
1338                         except ValueError: # parsing prefix as integer fails
1339                                 self._download_n_results(query, 1)
1340                                 return
1341
1342         def _download_n_results(self, query, n):
1343                 """Downloads a specified number of results for a query"""
1344
1345                 video_ids = []
1346                 pagenum = 0
1347
1348                 while True:
1349                         self.report_download_page(query, pagenum)
1350                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1351                         request = urllib2.Request(result_url)
1352                         try:
1353                                 page = urllib2.urlopen(request).read()
1354                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1355                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1356                                 return
1357
1358                         # Extract video identifiers
1359                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1360                                 video_id = mobj.group(1)
1361                                 if video_id not in video_ids:
1362                                         video_ids.append(video_id)
1363                                         if len(video_ids) == n:
1364                                                 # Specified n videos reached
1365                                                 for id in video_ids:
1366                                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1367                                                 return
1368
1369                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1370                                 for id in video_ids:
1371                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1372                                 return
1373
1374                         pagenum = pagenum + 1
1375
1376
1377 class YahooSearchIE(InfoExtractor):
1378         """Information Extractor for Yahoo! Video search queries."""
1379         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1380         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1381         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1382         _MORE_PAGES_INDICATOR = r'\s*Next'
1383         _max_yahoo_results = 1000
1384         IE_NAME = u'video.yahoo:search'
1385
1386         def __init__(self, downloader=None):
1387                 InfoExtractor.__init__(self, downloader)
1388
1389         def report_download_page(self, query, pagenum):
1390                 """Report attempt to download playlist page with given number."""
1391                 query = query.decode(preferredencoding())
1392                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1393
1394         def _real_extract(self, query):
1395                 mobj = re.match(self._VALID_URL, query)
1396                 if mobj is None:
1397                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1398                         return
1399
1400                 prefix, query = query.split(':')
1401                 prefix = prefix[8:]
1402                 query = query.encode('utf-8')
1403                 if prefix == '':
1404                         self._download_n_results(query, 1)
1405                         return
1406                 elif prefix == 'all':
1407                         self._download_n_results(query, self._max_yahoo_results)
1408                         return
1409                 else:
1410                         try:
1411                                 n = long(prefix)
1412                                 if n <= 0:
1413                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1414                                         return
1415                                 elif n > self._max_yahoo_results:
1416                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1417                                         n = self._max_yahoo_results
1418                                 self._download_n_results(query, n)
1419                                 return
1420                         except ValueError: # parsing prefix as integer fails
1421                                 self._download_n_results(query, 1)
1422                                 return
1423
1424         def _download_n_results(self, query, n):
1425                 """Downloads a specified number of results for a query"""
1426
1427                 video_ids = []
1428                 already_seen = set()
1429                 pagenum = 1
1430
1431                 while True:
1432                         self.report_download_page(query, pagenum)
1433                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1434                         request = urllib2.Request(result_url)
1435                         try:
1436                                 page = urllib2.urlopen(request).read()
1437                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1438                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1439                                 return
1440
1441                         # Extract video identifiers
1442                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1443                                 video_id = mobj.group(1)
1444                                 if video_id not in already_seen:
1445                                         video_ids.append(video_id)
1446                                         already_seen.add(video_id)
1447                                         if len(video_ids) == n:
1448                                                 # Specified n videos reached
1449                                                 for id in video_ids:
1450                                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1451                                                 return
1452
1453                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1454                                 for id in video_ids:
1455                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1456                                 return
1457
1458                         pagenum = pagenum + 1
1459
1460
class YoutubePlaylistIE(InfoExtractor):
	"""Information Extractor for YouTube playlists.

	Walks a playlist (or course/artist/user list) page by page, collects
	every video id and queues the corresponding watch URLs with the
	downloader.
	"""

	# Group 1: list-type prefix ('p', 'a' or 'list'); group 2: the list id
	# (a leading 'PL' is stripped); group 3 (optional): a single video id.
	_VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
	# Filled with (access page, prefix parameter, playlist id, page number).
	_TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
	_VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;list=PL%s&'
	# Present in the HTML while a "Next" page link exists.
	_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
	IE_NAME = u'youtube:playlist'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_page(self, playlist_id, pagenum):
		"""Report attempt to download playlist page with given number."""
		self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

	def _real_extract(self, url):
		"""Collect all video ids in the playlist and queue them for download."""
		# Extract playlist id
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
			return

		# Single video case
		if mobj.group(3) is not None:
			self._downloader.download([mobj.group(3)])
			return

		# Download playlist pages
		# prefix is 'p' as default for playlists but there are other types that need extra care
		playlist_prefix = mobj.group(1)
		if playlist_prefix == 'a':
			playlist_access = 'artist'
		else:
			# Anything that is not an artist list is fetched as a plain playlist.
			playlist_prefix = 'p'
			playlist_access = 'view_play_list'
		playlist_id = mobj.group(2)
		video_ids = []
		pagenum = 1

		while True:
			self.report_download_page(playlist_id, pagenum)
			url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
			request = urllib2.Request(url)
			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers (de-duplicated within the page)
			ids_in_page = []
			for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(mobj.group(1))
			video_ids.extend(ids_in_page)

			# Stop once the page no longer advertises a "Next" link.
			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				break
			pagenum = pagenum + 1

		# Apply the user-requested --playlist-start/--playlist-end window
		# (playliststart is 1-based in params, hence the -1).
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)
		if playlistend == -1:
			video_ids = video_ids[playliststart:]
		else:
			video_ids = video_ids[playliststart:playlistend]

		for id in video_ids:
			self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
		return
1532
1533
class YoutubeUserIE(InfoExtractor):
	"""Information Extractor for YouTube users.

	Resolves a user page (or "ytuser:name") to the user's uploads via the
	GData API and queues every video with the downloader.
	"""

	_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
	_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
	# GData caps each uploads query at this many results, so we page.
	_GDATA_PAGE_SIZE = 50
	# Filled with (username, page size, 1-based start index).
	_GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
	_VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
	IE_NAME = u'youtube:user'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_page(self, username, start_index):
		"""Report attempt to download user page."""
		self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
				(username, start_index, start_index + self._GDATA_PAGE_SIZE))

	def _real_extract(self, url):
		"""Collect all upload ids for the user and queue them for download."""
		# Extract username
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
			return

		username = mobj.group(1)

		# Download video ids using YouTube Data API. Result size per
		# query is limited (currently to 50 videos) so we need to query
		# page by page until there are no video ids - it means we got
		# all of them.

		video_ids = []
		pagenum = 0

		while True:
			start_index = pagenum * self._GDATA_PAGE_SIZE + 1
			self.report_download_page(username, start_index)

			request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers (de-duplicated within the page)
			ids_in_page = []

			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(mobj.group(1))

			video_ids.extend(ids_in_page)

			# A little optimization - if current page is not
			# "full", ie. does not contain PAGE_SIZE video ids then
			# we can assume that this page is the last one - there
			# are no more ids on further pages - no need to query
			# again.

			if len(ids_in_page) < self._GDATA_PAGE_SIZE:
				break

			pagenum += 1

		# Apply the user-requested --playlist-start/--playlist-end window
		# (playliststart is 1-based in params, hence the -1).
		all_ids_count = len(video_ids)
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)

		if playlistend == -1:
			video_ids = video_ids[playliststart:]
		else:
			video_ids = video_ids[playliststart:playlistend]

		self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
				(username, all_ids_count, len(video_ids)))

		for video_id in video_ids:
			self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1615
1616
1617 class DepositFilesIE(InfoExtractor):
1618         """Information extractor for depositfiles.com"""
1619
1620         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1621         IE_NAME = u'DepositFiles'
1622
1623         def __init__(self, downloader=None):
1624                 InfoExtractor.__init__(self, downloader)
1625
1626         def report_download_webpage(self, file_id):
1627                 """Report webpage download."""
1628                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1629
1630         def report_extraction(self, file_id):
1631                 """Report information extraction."""
1632                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1633
1634         def _real_extract(self, url):
1635                 file_id = url.split('/')[-1]
1636                 # Rebuild url in english locale
1637                 url = 'http://depositfiles.com/en/files/' + file_id
1638
1639                 # Retrieve file webpage with 'Free download' button pressed
1640                 free_download_indication = { 'gateway_result' : '1' }
1641                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1642                 try:
1643                         self.report_download_webpage(file_id)
1644                         webpage = urllib2.urlopen(request).read()
1645                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1646                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1647                         return
1648
1649                 # Search for the real file URL
1650                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1651                 if (mobj is None) or (mobj.group(1) is None):
1652                         # Try to figure out reason of the error.
1653                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1654                         if (mobj is not None) and (mobj.group(1) is not None):
1655                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1656                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1657                         else:
1658                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1659                         return
1660
1661                 file_url = mobj.group(1)
1662                 file_extension = os.path.splitext(file_url)[1][1:]
1663
1664                 # Search for file title
1665                 mobj = re.search(r'<b title="(.*?)">', webpage)
1666                 if mobj is None:
1667                         self._downloader.trouble(u'ERROR: unable to extract title')
1668                         return
1669                 file_title = mobj.group(1).decode('utf-8')
1670
1671                 return [{
1672                         'id':           file_id.decode('utf-8'),
1673                         'url':          file_url.decode('utf-8'),
1674                         'uploader':     u'NA',
1675                         'upload_date':  u'NA',
1676                         'title':        file_title,
1677                         'ext':          file_extension.decode('utf-8'),
1678                         'format':       u'NA',
1679                         'player_url':   None,
1680                 }]
1681
1682
1683 class FacebookIE(InfoExtractor):
1684         """Information Extractor for Facebook"""
1685
1686         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1687         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1688         _NETRC_MACHINE = 'facebook'
1689         _available_formats = ['video', 'highqual', 'lowqual']
1690         _video_extensions = {
1691                 'video': 'mp4',
1692                 'highqual': 'mp4',
1693                 'lowqual': 'mp4',
1694         }
1695         IE_NAME = u'facebook'
1696
	def __init__(self, downloader=None):
		# No Facebook-specific state; delegate straight to the base class.
		InfoExtractor.__init__(self, downloader)
1699
1700         def _reporter(self, message):
1701                 """Add header and report message."""
1702                 self._downloader.to_screen(u'[facebook] %s' % message)
1703
1704         def report_login(self):
1705                 """Report attempt to log in."""
1706                 self._reporter(u'Logging in')
1707
1708         def report_video_webpage_download(self, video_id):
1709                 """Report attempt to download video webpage."""
1710                 self._reporter(u'%s: Downloading video webpage' % video_id)
1711
1712         def report_information_extraction(self, video_id):
1713                 """Report attempt to extract video information."""
1714                 self._reporter(u'%s: Extracting video information' % video_id)
1715
1716         def _parse_page(self, video_webpage):
1717                 """Extract video information from page"""
1718                 # General data
1719                 data = {'title': r'\("video_title", "(.*?)"\)',
1720                         'description': r'<div class="datawrap">(.*?)</div>',
1721                         'owner': r'\("video_owner_name", "(.*?)"\)',
1722                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1723                         }
1724                 video_info = {}
1725                 for piece in data.keys():
1726                         mobj = re.search(data[piece], video_webpage)
1727                         if mobj is not None:
1728                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1729
1730                 # Video urls
1731                 video_urls = {}
1732                 for fmt in self._available_formats:
1733                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1734                         if mobj is not None:
1735                                 # URL is in a Javascript segment inside an escaped Unicode format within
1736                                 # the generally utf-8 page
1737                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1738                 video_info['video_urls'] = video_urls
1739
1740                 return video_info
1741
1742         def _real_initialize(self):
1743                 if self._downloader is None:
1744                         return
1745
1746                 useremail = None
1747                 password = None
1748                 downloader_params = self._downloader.params
1749
1750                 # Attempt to use provided username and password or .netrc data
1751                 if downloader_params.get('username', None) is not None:
1752                         useremail = downloader_params['username']
1753                         password = downloader_params['password']
1754                 elif downloader_params.get('usenetrc', False):
1755                         try:
1756                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1757                                 if info is not None:
1758                                         useremail = info[0]
1759                                         password = info[2]
1760                                 else:
1761                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1762                         except (IOError, netrc.NetrcParseError), err:
1763                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1764                                 return
1765
1766                 if useremail is None:
1767                         return
1768
1769                 # Log in
1770                 login_form = {
1771                         'email': useremail,
1772                         'pass': password,
1773                         'login': 'Log+In'
1774                         }
1775                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1776                 try:
1777                         self.report_login()
1778                         login_results = urllib2.urlopen(request).read()
1779                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1780                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1781                                 return
1782                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1783                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1784                         return
1785
1786         def _real_extract(self, url):
1787                 mobj = re.match(self._VALID_URL, url)
1788                 if mobj is None:
1789                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1790                         return
1791                 video_id = mobj.group('ID')
1792
1793                 # Get video webpage
1794                 self.report_video_webpage_download(video_id)
1795                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
1796                 try:
1797                         page = urllib2.urlopen(request)
1798                         video_webpage = page.read()
1799                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1800                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1801                         return
1802
1803                 # Start extracting information
1804                 self.report_information_extraction(video_id)
1805
1806                 # Extract information
1807                 video_info = self._parse_page(video_webpage)
1808
1809                 # uploader
1810                 if 'owner' not in video_info:
1811                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1812                         return
1813                 video_uploader = video_info['owner']
1814
1815                 # title
1816                 if 'title' not in video_info:
1817                         self._downloader.trouble(u'ERROR: unable to extract video title')
1818                         return
1819                 video_title = video_info['title']
1820                 video_title = video_title.decode('utf-8')
1821
1822                 # thumbnail image
1823                 if 'thumbnail' not in video_info:
1824                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1825                         video_thumbnail = ''
1826                 else:
1827                         video_thumbnail = video_info['thumbnail']
1828
1829                 # upload date
1830                 upload_date = u'NA'
1831                 if 'upload_date' in video_info:
1832                         upload_time = video_info['upload_date']
1833                         timetuple = email.utils.parsedate_tz(upload_time)
1834                         if timetuple is not None:
1835                                 try:
1836                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
1837                                 except:
1838                                         pass
1839
1840                 # description
1841                 video_description = video_info.get('description', 'No description available.')
1842
1843                 url_map = video_info['video_urls']
1844                 if len(url_map.keys()) > 0:
1845                         # Decide which formats to download
1846                         req_format = self._downloader.params.get('format', None)
1847                         format_limit = self._downloader.params.get('format_limit', None)
1848
1849                         if format_limit is not None and format_limit in self._available_formats:
1850                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1851                         else:
1852                                 format_list = self._available_formats
1853                         existing_formats = [x for x in format_list if x in url_map]
1854                         if len(existing_formats) == 0:
1855                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1856                                 return
1857                         if req_format is None:
1858                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1859                         elif req_format == 'worst':
1860                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1861                         elif req_format == '-1':
1862                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1863                         else:
1864                                 # Specific format
1865                                 if req_format not in url_map:
1866                                         self._downloader.trouble(u'ERROR: requested format not available')
1867                                         return
1868                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1869
1870                 results = []
1871                 for format_param, video_real_url in video_url_list:
1872                         # Extension
1873                         video_extension = self._video_extensions.get(format_param, 'mp4')
1874
1875                         results.append({
1876                                 'id':           video_id.decode('utf-8'),
1877                                 'url':          video_real_url.decode('utf-8'),
1878                                 'uploader':     video_uploader.decode('utf-8'),
1879                                 'upload_date':  upload_date,
1880                                 'title':        video_title,
1881                                 'ext':          video_extension.decode('utf-8'),
1882                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1883                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1884                                 'description':  video_description.decode('utf-8'),
1885                                 'player_url':   None,
1886                         })
1887                 return results
1888
class BlipTVIE(InfoExtractor):
	"""Information extractor for blip.tv"""

	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
	# Pulls the filename extension off a media URL.
	_URL_EXT = r'^.*\.([a-z0-9]+)$'
	IE_NAME = u'blip.tv'

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

	def report_direct_download(self, title):
		"""Report that the URL points directly at a media file."""
		self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

	def _real_extract(self, url):
		"""Extract video information via blip.tv's JSON API.

		Appends skin=json parameters to the page URL. If the server
		answers with a video/* Content-Type instead of JSON, the URL is
		treated as a direct download. Returns a single-element list with
		the info dictionary, or None (after reporting trouble) on error.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Ask the same URL for its JSON representation.
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url)
		self.report_extraction(mobj.group(1))
		info = None
		try:
			urlh = urllib2.urlopen(request)
			if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
				basename = url.split('/')[-1]
				title,ext = os.path.splitext(basename)
				title = title.decode('UTF-8')
				ext = ext.replace('.', '')
				self.report_direct_download(title)
				info = {
					'id': title,
					'url': url,
					'title': title,
					'ext': ext,
					'urlhandle': urlh  # already-open response handle
				}
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
			return
		if info is None: # Regular URL
			try:
				# urlh is still open from the probe above; read the JSON body.
				json_code = urlh.read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
				return

			try:
				# NOTE(review): json is not imported here -- presumably
				# provided by "from utils import *"; confirm against utils.
				json_data = json.loads(json_code)
				if 'Post' in json_data:
					data = json_data['Post']
				else:
					data = json_data

				upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
				video_url = data['media']['url']
				umobj = re.match(self._URL_EXT, video_url)
				if umobj is None:
					raise ValueError('Can not determine filename extension')
				ext = umobj.group(1)

				info = {
					'id': data['item_id'],
					'url': video_url,
					'uploader': data['display_name'],
					'upload_date': upload_date,
					'title': data['title'],
					'ext': ext,
					'format': data['media']['mimeType'],
					'thumbnail': data['thumbnailUrl'],
					'description': data['description'],
					'player_url': data['embedUrl']
				}
			except (ValueError,KeyError), err:
				self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
				return

		return [info]
1974
1975
1976 class MyVideoIE(InfoExtractor):
1977         """Information Extractor for myvideo.de."""
1978
1979         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
1980         IE_NAME = u'myvideo'
1981
1982         def __init__(self, downloader=None):
1983                 InfoExtractor.__init__(self, downloader)
1984         
1985         def report_download_webpage(self, video_id):
1986                 """Report webpage download."""
1987                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
1988
1989         def report_extraction(self, video_id):
1990                 """Report information extraction."""
1991                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
1992
1993         def _real_extract(self,url):
1994                 mobj = re.match(self._VALID_URL, url)
1995                 if mobj is None:
1996                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
1997                         return
1998
1999                 video_id = mobj.group(1)
2000
2001                 # Get video webpage
2002                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2003                 try:
2004                         self.report_download_webpage(video_id)
2005                         webpage = urllib2.urlopen(request).read()
2006                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2007                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2008                         return
2009
2010                 self.report_extraction(video_id)
2011                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2012                                  webpage)
2013                 if mobj is None:
2014                         self._downloader.trouble(u'ERROR: unable to extract media URL')
2015                         return
2016                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2017
2018                 mobj = re.search('<title>([^<]+)</title>', webpage)
2019                 if mobj is None:
2020                         self._downloader.trouble(u'ERROR: unable to extract title')
2021                         return
2022
2023                 video_title = mobj.group(1)
2024
2025                 return [{
2026                         'id':           video_id,
2027                         'url':          video_url,
2028                         'uploader':     u'NA',
2029                         'upload_date':  u'NA',
2030                         'title':        video_title,
2031                         'ext':          u'flv',
2032                         'format':       u'NA',
2033                         'player_url':   None,
2034                 }]
2035
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Accepts either a shortname form (":tds", ":colbert", ...) or a full
	# episode URL under /full-episodes/ on the two show sites.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report that information extraction has started."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report that a per-video media configuration is downloading."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report that the episode's MRSS index is downloading."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report that the SWF player URL is being resolved."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _real_extract(self, url):
		"""Extract one info dictionary per video (act) of an episode.

		Returns a list of info dictionaries, or None (after reporting
		trouble) on any error.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# A shortname selects the newest full episode of the given show.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = u'http://www.thedailyshow.com/full-episodes/'
			else:
				url = u'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# No explicit episode: rely on the site redirecting the index
		# page to the newest episode (checked after the request below).
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# Recover the concrete episode title from the redirected URL.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# The embedded Flash player URL carries the mtvn "uri" that
		# identifies the episode.
		mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		# Resolve the player URL through its redirect chain; the final
		# URL is later stored as player_url for every act.
		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		# Fetch the MRSS index listing every video (act) of the episode.
		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		results = []

		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			# The <guid> is colon-separated; the last component is the
			# short media id, the one before it names the show site.
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			# The per-act configuration XML lists available renditions.
			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			# NOTE(review): taking turls[-1] assumes renditions are listed
			# in ascending bitrate order -- not verified here.
			format,video_url = turls[-1]

			effTitle = showId + u'-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			results.append(info)

		return results
2167
2168
2169 class EscapistIE(InfoExtractor):
2170         """Information extractor for The Escapist """
2171
2172         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2173         IE_NAME = u'escapist'
2174
2175         def report_extraction(self, showName):
2176                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2177
2178         def report_config_download(self, showName):
2179                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2180
2181         def _real_extract(self, url):
2182                 mobj = re.match(self._VALID_URL, url)
2183                 if mobj is None:
2184                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2185                         return
2186                 showName = mobj.group('showname')
2187                 videoId = mobj.group('episode')
2188
2189                 self.report_extraction(showName)
2190                 try:
2191                         webPage = urllib2.urlopen(url).read()
2192                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2193                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2194                         return
2195
2196                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2197                 description = unescapeHTML(descMatch.group(1))
2198                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2199                 imgUrl = unescapeHTML(imgMatch.group(1))
2200                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2201                 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2202                 configUrlMatch = re.search('config=(.*)$', playerUrl)
2203                 configUrl = urllib2.unquote(configUrlMatch.group(1))
2204
2205                 self.report_config_download(showName)
2206                 try:
2207                         configJSON = urllib2.urlopen(configUrl).read()
2208                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2209                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2210                         return
2211
2212                 # Technically, it's JavaScript, not JSON
2213                 configJSON = configJSON.replace("'", '"')
2214
2215                 try:
2216                         config = json.loads(configJSON)
2217                 except (ValueError,), err:
2218                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2219                         return
2220
2221                 playlist = config['playlist']
2222                 videoUrl = playlist[1]['url']
2223
2224                 info = {
2225                         'id': videoId,
2226                         'url': videoUrl,
2227                         'uploader': showName,
2228                         'upload_date': None,
2229                         'title': showName,
2230                         'ext': 'flv',
2231                         'format': 'flv',
2232                         'thumbnail': imgUrl,
2233                         'description': description,
2234                         'player_url': playerUrl,
2235                 }
2236
2237                 return [info]
2238
2239
2240 class CollegeHumorIE(InfoExtractor):
2241         """Information extractor for collegehumor.com"""
2242
2243         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2244         IE_NAME = u'collegehumor'
2245
2246         def report_webpage(self, video_id):
2247                 """Report information extraction."""
2248                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2249
2250         def report_extraction(self, video_id):
2251                 """Report information extraction."""
2252                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2253
2254         def _real_extract(self, url):
2255                 mobj = re.match(self._VALID_URL, url)
2256                 if mobj is None:
2257                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2258                         return
2259                 video_id = mobj.group('videoid')
2260
2261                 self.report_webpage(video_id)
2262                 request = urllib2.Request(url)
2263                 try:
2264                         webpage = urllib2.urlopen(request).read()
2265                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2266                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2267                         return
2268
2269                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2270                 if m is None:
2271                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2272                         return
2273                 internal_video_id = m.group('internalvideoid')
2274
2275                 info = {
2276                         'id': video_id,
2277                         'internal_id': internal_video_id,
2278                 }
2279
2280                 self.report_extraction(video_id)
2281                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2282                 try:
2283                         metaXml = urllib2.urlopen(xmlUrl).read()
2284                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2285                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2286                         return
2287
2288                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2289                 try:
2290                         videoNode = mdoc.findall('./video')[0]
2291                         info['description'] = videoNode.findall('./description')[0].text
2292                         info['title'] = videoNode.findall('./caption')[0].text
2293                         info['url'] = videoNode.findall('./file')[0].text
2294                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2295                         info['ext'] = info['url'].rpartition('.')[2]
2296                         info['format'] = info['ext']
2297                 except IndexError:
2298                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2299                         return
2300
2301                 return [info]
2302
2303
2304 class XVideosIE(InfoExtractor):
2305         """Information extractor for xvideos.com"""
2306
2307         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2308         IE_NAME = u'xvideos'
2309
2310         def report_webpage(self, video_id):
2311                 """Report information extraction."""
2312                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2313
2314         def report_extraction(self, video_id):
2315                 """Report information extraction."""
2316                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2317
2318         def _real_extract(self, url):
2319                 mobj = re.match(self._VALID_URL, url)
2320                 if mobj is None:
2321                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2322                         return
2323                 video_id = mobj.group(1).decode('utf-8')
2324
2325                 self.report_webpage(video_id)
2326
2327                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2328                 try:
2329                         webpage = urllib2.urlopen(request).read()
2330                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2331                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2332                         return
2333
2334                 self.report_extraction(video_id)
2335
2336
2337                 # Extract video URL
2338                 mobj = re.search(r'flv_url=(.+?)&', webpage)
2339                 if mobj is None:
2340                         self._downloader.trouble(u'ERROR: unable to extract video url')
2341                         return
2342                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2343
2344
2345                 # Extract title
2346                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2347                 if mobj is None:
2348                         self._downloader.trouble(u'ERROR: unable to extract video title')
2349                         return
2350                 video_title = mobj.group(1).decode('utf-8')
2351
2352
2353                 # Extract video thumbnail
2354                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
2355                 if mobj is None:
2356                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2357                         return
2358                 video_thumbnail = mobj.group(1).decode('utf-8')
2359
2360                 info = {
2361                         'id': video_id,
2362                         'url': video_url,
2363                         'uploader': None,
2364                         'upload_date': None,
2365                         'title': video_title,
2366                         'ext': 'flv',
2367                         'format': 'flv',
2368                         'thumbnail': video_thumbnail,
2369                         'description': None,
2370                         'player_url': None,
2371                 }
2372
2373                 return [info]
2374
2375
2376 class SoundcloudIE(InfoExtractor):
2377         """Information extractor for soundcloud.com
2378            To access the media, the uid of the song and a stream token
2379            must be extracted from the page source and the script must make
2380            a request to media.soundcloud.com/crossdomain.xml. Then
2381            the media can be grabbed by requesting from an url composed
2382            of the stream token and uid
2383          """
2384
2385         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2386         IE_NAME = u'soundcloud'
2387
2388         def __init__(self, downloader=None):
2389                 InfoExtractor.__init__(self, downloader)
2390
2391         def report_webpage(self, video_id):
2392                 """Report information extraction."""
2393                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2394
2395         def report_extraction(self, video_id):
2396                 """Report information extraction."""
2397                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2398
2399         def _real_extract(self, url):
2400                 mobj = re.match(self._VALID_URL, url)
2401                 if mobj is None:
2402                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2403                         return
2404
2405                 # extract uploader (which is in the url)
2406                 uploader = mobj.group(1).decode('utf-8')
2407                 # extract simple title (uploader + slug of song title)
2408                 slug_title =  mobj.group(2).decode('utf-8')
2409                 simple_title = uploader + u'-' + slug_title
2410
2411                 self.report_webpage('%s/%s' % (uploader, slug_title))
2412
2413                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2414                 try:
2415                         webpage = urllib2.urlopen(request).read()
2416                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2417                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2418                         return
2419
2420                 self.report_extraction('%s/%s' % (uploader, slug_title))
2421
2422                 # extract uid and stream token that soundcloud hands out for access
2423                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2424                 if mobj:
2425                         video_id = mobj.group(1)
2426                         stream_token = mobj.group(2)
2427
2428                 # extract unsimplified title
2429                 mobj = re.search('"title":"(.*?)",', webpage)
2430                 if mobj:
2431                         title = mobj.group(1).decode('utf-8')
2432                 else:
2433                         title = simple_title
2434
2435                 # construct media url (with uid/token)
2436                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2437                 mediaURL = mediaURL % (video_id, stream_token)
2438
2439                 # description
2440                 description = u'No description available'
2441                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2442                 if mobj:
2443                         description = mobj.group(1)
2444                 
2445                 # upload date
2446                 upload_date = None
2447                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2448                 if mobj:
2449                         try:
2450                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2451                         except Exception, e:
2452                                 print str(e)
2453
2454                 # for soundcloud, a request to a cross domain is required for cookies
2455                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2456
2457                 return [{
2458                         'id':           video_id.decode('utf-8'),
2459                         'url':          mediaURL,
2460                         'uploader':     uploader.decode('utf-8'),
2461                         'upload_date':  upload_date,
2462                         'title':        title,
2463                         'ext':          u'mp3',
2464                         'format':       u'NA',
2465                         'player_url':   None,
2466                         'description': description.decode('utf-8')
2467                 }]
2468
2469
2470 class InfoQIE(InfoExtractor):
2471         """Information extractor for infoq.com"""
2472
2473         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2474         IE_NAME = u'infoq'
2475
2476         def report_webpage(self, video_id):
2477                 """Report information extraction."""
2478                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2479
2480         def report_extraction(self, video_id):
2481                 """Report information extraction."""
2482                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2483
2484         def _real_extract(self, url):
2485                 mobj = re.match(self._VALID_URL, url)
2486                 if mobj is None:
2487                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2488                         return
2489
2490                 self.report_webpage(url)
2491
2492                 request = urllib2.Request(url)
2493                 try:
2494                         webpage = urllib2.urlopen(request).read()
2495                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2496                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2497                         return
2498
2499                 self.report_extraction(url)
2500
2501
2502                 # Extract video URL
2503                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2504                 if mobj is None:
2505                         self._downloader.trouble(u'ERROR: unable to extract video url')
2506                         return
2507                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2508
2509
2510                 # Extract title
2511                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2512                 if mobj is None:
2513                         self._downloader.trouble(u'ERROR: unable to extract video title')
2514                         return
2515                 video_title = mobj.group(1).decode('utf-8')
2516
2517                 # Extract description
2518                 video_description = u'No description available.'
2519                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2520                 if mobj is not None:
2521                         video_description = mobj.group(1).decode('utf-8')
2522
2523                 video_filename = video_url.split('/')[-1]
2524                 video_id, extension = video_filename.split('.')
2525
2526                 info = {
2527                         'id': video_id,
2528                         'url': video_url,
2529                         'uploader': None,
2530                         'upload_date': None,
2531                         'title': video_title,
2532                         'ext': extension,
2533                         'format': extension, # Extension is always(?) mp4, but seems to be flv
2534                         'thumbnail': None,
2535                         'description': video_description,
2536                         'player_url': None,
2537                 }
2538
2539                 return [info]
2540
2541 class MixcloudIE(InfoExtractor):
2542         """Information extractor for www.mixcloud.com"""
2543         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2544         IE_NAME = u'mixcloud'
2545
2546         def __init__(self, downloader=None):
2547                 InfoExtractor.__init__(self, downloader)
2548
2549         def report_download_json(self, file_id):
2550                 """Report JSON download."""
2551                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2552
2553         def report_extraction(self, file_id):
2554                 """Report information extraction."""
2555                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2556
2557         def get_urls(self, jsonData, fmt, bitrate='best'):
2558                 """Get urls from 'audio_formats' section in json"""
2559                 file_url = None
2560                 try:
2561                         bitrate_list = jsonData[fmt]
2562                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2563                                 bitrate = max(bitrate_list) # select highest
2564
2565                         url_list = jsonData[fmt][bitrate]
2566                 except TypeError: # we have no bitrate info.
2567                         url_list = jsonData[fmt]
2568                 return url_list
2569
2570         def check_urls(self, url_list):
2571                 """Returns 1st active url from list"""
2572                 for url in url_list:
2573                         try:
2574                                 urllib2.urlopen(url)
2575                                 return url
2576                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2577                                 url = None
2578
2579                 return None
2580
2581         def _print_formats(self, formats):
2582                 print 'Available formats:'
2583                 for fmt in formats.keys():
2584                         for b in formats[fmt]:
2585                                 try:
2586                                         ext = formats[fmt][b][0]
2587                                         print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2588                                 except TypeError: # we have no bitrate info
2589                                         ext = formats[fmt][0]
2590                                         print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2591                                         break
2592
2593         def _real_extract(self, url):
2594                 mobj = re.match(self._VALID_URL, url)
2595                 if mobj is None:
2596                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2597                         return
2598                 # extract uploader & filename from url
2599                 uploader = mobj.group(1).decode('utf-8')
2600                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2601
2602                 # construct API request
2603                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2604                 # retrieve .json file with links to files
2605                 request = urllib2.Request(file_url)
2606                 try:
2607                         self.report_download_json(file_url)
2608                         jsonData = urllib2.urlopen(request).read()
2609                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2610                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2611                         return
2612
2613                 # parse JSON
2614                 json_data = json.loads(jsonData)
2615                 player_url = json_data['player_swf_url']
2616                 formats = dict(json_data['audio_formats'])
2617
2618                 req_format = self._downloader.params.get('format', None)
2619                 bitrate = None
2620
2621                 if self._downloader.params.get('listformats', None):
2622                         self._print_formats(formats)
2623                         return
2624
2625                 if req_format is None or req_format == 'best':
2626                         for format_param in formats.keys():
2627                                 url_list = self.get_urls(formats, format_param)
2628                                 # check urls
2629                                 file_url = self.check_urls(url_list)
2630                                 if file_url is not None:
2631                                         break # got it!
2632                 else:
2633                         if req_format not in formats.keys():
2634                                 self._downloader.trouble(u'ERROR: format is not available')
2635                                 return
2636
2637                         url_list = self.get_urls(formats, req_format)
2638                         file_url = self.check_urls(url_list)
2639                         format_param = req_format
2640
2641                 return [{
2642                         'id': file_id.decode('utf-8'),
2643                         'url': file_url.decode('utf-8'),
2644                         'uploader':     uploader.decode('utf-8'),
2645                         'upload_date': u'NA',
2646                         'title': json_data['name'],
2647                         'ext': file_url.split('.')[-1].decode('utf-8'),
2648                         'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2649                         'thumbnail': json_data['thumbnail_url'],
2650                         'description': json_data['description'],
2651                         'player_url': player_url.decode('utf-8'),
2652                 }]
2653
class StanfordOpenClassroomIE(InfoExtractor):
	"""Information extractor for Stanford's Open ClassRoom"""

	# The URL shape selects one of three handlers below: a specific video
	# (course+video groups), a whole course page (course only), or the root.
	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
	IE_NAME = u'stanfordoc'

	def report_download_webpage(self, objid):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Extract info from a video, course, or root OpenClassroom URL.

		A video page yields a single info dict (metadata comes from the
		course's per-video XML file). Course and root pages scrape their
		links and recurse into each one via self.extract, concatenating
		the resulting info dicts into one flat list.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('course') and mobj.group('video'): # A specific video
			course = mobj.group('course')
			video = mobj.group('video')
			info = {
				'id': course + '_' + video,
			}

			self.report_extraction(info['id'])
			# Each video has a sibling XML file with its title and file name.
			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
			xmlUrl = baseUrl + video + '.xml'
			try:
				metaXml = urllib2.urlopen(xmlUrl).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
				return
			mdoc = xml.etree.ElementTree.fromstring(metaXml)
			try:
				info['title'] = mdoc.findall('./title')[0].text
				info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
			except IndexError:
				self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
				return
			# File extension doubles as the format identifier.
			info['ext'] = info['url'].rpartition('.')[2]
			info['format'] = info['ext']
			return [info]
		elif mobj.group('course'): # A course page
			course = mobj.group('course')
			info = {
				'id': course,
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			try:
				coursepage = urllib2.urlopen(url).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			m = re.search('<h1>([^<]+)</h1>', coursepage)
			if m:
				info['title'] = unescapeHTML(m.group(1))
			else:
				# Fall back to the course id when the page has no <h1> title.
				info['title'] = info['id']

			m = re.search('<description>([^<]+)</description>', coursepage)
			if m:
				info['description'] = unescapeHTML(m.group(1))

			# Collect every video page linked from the course, de-duplicated
			# but in original order, then recurse into each link.
			links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
				}
					for vpage in links]
			results = []
			for entry in info['list']:
				assert entry['type'] == 'reference'
				results += self.extract(entry['url'])
			return results

		else: # Root page
			info = {
				'id': 'Stanford OpenClassroom',
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
			try:
				rootpage = urllib2.urlopen(rootURL).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			info['title'] = info['id']

			# Recurse into every course page listed on the home page.
			links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
				}
					for cpage in links]

			results = []
			for entry in info['list']:
				assert entry['type'] == 'reference'
				results += self.extract(entry['url'])
			return results
2765
2766 class MTVIE(InfoExtractor):
2767         """Information extractor for MTV.com"""
2768
2769         _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2770         IE_NAME = u'mtv'
2771
2772         def report_webpage(self, video_id):
2773                 """Report information extraction."""
2774                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2775
2776         def report_extraction(self, video_id):
2777                 """Report information extraction."""
2778                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2779
2780         def _real_extract(self, url):
2781                 mobj = re.match(self._VALID_URL, url)
2782                 if mobj is None:
2783                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2784                         return
2785                 if not mobj.group('proto'):
2786                         url = 'http://' + url
2787                 video_id = mobj.group('videoid')
2788                 self.report_webpage(video_id)
2789
2790                 request = urllib2.Request(url)
2791                 try:
2792                         webpage = urllib2.urlopen(request).read()
2793                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2794                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2795                         return
2796
2797                 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2798                 if mobj is None:
2799                         self._downloader.trouble(u'ERROR: unable to extract song name')
2800                         return
2801                 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2802                 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2803                 if mobj is None:
2804                         self._downloader.trouble(u'ERROR: unable to extract performer')
2805                         return
2806                 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2807                 video_title = performer + ' - ' + song_name 
2808
2809                 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2810                 if mobj is None:
2811                         self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2812                         return
2813                 mtvn_uri = mobj.group(1)
2814
2815                 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2816                 if mobj is None:
2817                         self._downloader.trouble(u'ERROR: unable to extract content id')
2818                         return
2819                 content_id = mobj.group(1)
2820
2821                 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2822                 self.report_extraction(video_id)
2823                 request = urllib2.Request(videogen_url)
2824                 try:
2825                         metadataXml = urllib2.urlopen(request).read()
2826                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2827                         self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2828                         return
2829
2830                 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2831                 renditions = mdoc.findall('.//rendition')
2832
2833                 # For now, always pick the highest quality.
2834                 rendition = renditions[-1]
2835
2836                 try:
2837                         _,_,ext = rendition.attrib['type'].partition('/')
2838                         format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2839                         video_url = rendition.find('./src').text
2840                 except KeyError:
2841                         self._downloader.trouble('Invalid rendition field.')
2842                         return
2843
2844                 info = {
2845                         'id': video_id,
2846                         'url': video_url,
2847                         'uploader': performer,
2848                         'title': video_title,
2849                         'ext': ext,
2850                         'format': format,
2851                 }
2852
2853                 return [info]