Simplified preferredencoding()
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import datetime
5 import HTMLParser
6 import httplib
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import urllib
13 import urllib2
14 import email.utils
15 import xml.etree.ElementTree
16 from urlparse import parse_qs
17
18 try:
19         import cStringIO as StringIO
20 except ImportError:
21         import StringIO
22
23 from utils import *
24
25
class InfoExtractor(object):
	"""Information Extractor class.

	An information extractor turns a URL into one or more dictionaries
	describing the video(s) behind it: the real media URL, the title,
	the uploader and so on.  The FileDownloader consumes these
	dictionaries and decides what to do with them (typically it
	downloads the video to the file system).  Each dictionary must
	carry the following fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	The following fields are optional; they are only consulted by the
	forced-printing helpers, which exist so youtube-dl can serve as the
	backend of a video search function such as the one in youtube2mp3:

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Concrete extractors override _real_initialize() and _real_extract()
	and provide a _VALID_URL regexp.  They should normally also be
	added to the list of extractors.
	"""

	# Class-level defaults; instances overwrite both in __init__/set_downloader.
	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Receives a URL and returns True if suitable for this IE."""
		match = re.match(self._VALID_URL, url)
		return match is not None

	def initialize(self):
		"""Initializes an instance (authentication, etc)."""
		# Idempotent: the real work runs at most once per instance.
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
93
94
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Matches bare video ids, youtu.be short links, /v|embed|e/ paths and
	# watch URLs (including #! fragments), while excluding playlist and
	# artist pages.  Group 2 captures the video id.
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
	# Fetched once to force English pages so the regexes below match.
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	# NOTE(review): this is the signup page URL; the login form is POSTed to it — verify it still accepts logins.
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
	# Same codes, but free (WebM) formats ranked above comparable MP4 ones.
	_available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
	# Map of itag format code -> filename extension ('flv' is the fallback).
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'44': 'webm',
		'45': 'webm',
		'46': 'webm',
	}
	# Display dimensions per format code, used only by _print_formats.
	_video_dimensions = {
		'5': '240x400',
		'6': '???',
		'13': '???',
		'17': '144x176',
		'18': '360x640',
		'22': '720x1280',
		'34': '360x640',
		'35': '480x854',
		'37': '1080x1920',
		'38': '3072x4096',
		'43': '360x640',
		'44': '480x854',
		'45': '720x1280',
		'46': '1080x1920',
	}
	IE_NAME = u'youtube'

	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_screen(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_screen(u'[youtube] Logging in')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[youtube] Confirming age')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

	def report_video_subtitles_download(self, video_id):
		"""Report attempt to download video subtitles."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

	def report_unavailable_format(self, video_id, format):
		"""Report that a requested format is not available."""
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_screen(u'[youtube] RTMP download detected')

	def _closed_captions_xml_to_srt(self, xml_string):
		"""Convert YouTube's closed-captions XML into SubRip (SRT) text."""
		srt = ''
		texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
		# TODO parse xml instead of regex
		for n, (start, dur_tag, dur, caption) in enumerate(texts):
			# Captions without an explicit duration default to 4 seconds.
			if not dur: dur = '4'
			start = float(start)
			end = start + float(dur)
			# Format seconds as the SRT timestamp "HH:MM:SS,mmm".
			start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
			end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
			caption = unescapeHTML(caption)
			caption = unescapeHTML(caption) # double cycle, intentional
			srt += str(n+1) + '\n'
			srt += start + ' --> ' + end + '\n'
			srt += caption + '\n\n'
		return srt

	def _print_formats(self, formats):
		"""Print the given format codes with extension and dimensions."""
		print 'Available formats:'
		for x in formats:
			print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

	def _real_initialize(self):
		"""Set the site language and, if credentials are available, log in and confirm age."""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			# NOTE(review): 'password' is read unconditionally here — assumes
			# the downloader always sets both keys together; verify.
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language (best-effort: failure only warns and aborts init)
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present, the credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract video information for a YouTube URL; returns a list of info dicts (one per selected format)."""
		# Extract original video URL from URL with redirection, like age verification, using next_url parameter
		mobj = re.search(self._NEXT_URL_RE, url)
		if mobj:
			url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
		try:
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Attempt to extract SWF player URL (the regex matches a JS-escaped
		# URL, so the backslash-escapes are stripped afterwards)
		mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
		else:
			player_url = None

		# Get video info: try several 'el' contexts until one yields a token
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					% (video_id, el_type))
			request = urllib2.Request(video_info_url)
			try:
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
					break
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
		# video_info here holds the last response fetched above
		if 'token' not in video_info:
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
			else:
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
			return

		# Check for "rental" videos
		if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
			self._downloader.trouble(u'ERROR: "rental" videos not supported')
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')

		# thumbnail image
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# upload date: scraped from the watch page and normalized to YYYYMMDD
		upload_date = u'NA'
		mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
		if mobj is not None:
			upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
			format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
			for expression in format_expressions:
				try:
					upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
				except:
					# Bare except swallows parse failures; note that after one
					# expression succeeds, upload_date is already reformatted,
					# so the remaining expressions fail here harmlessly.
					pass

		# description
		video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
		if video_description: video_description = clean_html(video_description)
		else: video_description = ''

		# closed captions (best-effort: Trouble, from utils, is raised and
		# caught locally so any failure only produces a warning)
		video_subtitles = None
		if self._downloader.params.get('writesubtitles', False):
			try:
				self.report_video_subtitles_download(video_id)
				request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
				try:
					srt_list = urllib2.urlopen(request).read()
				except (urllib2.URLError, httplib.HTTPException, socket.error), err:
					raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
				srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
				if not srt_lang_list:
					raise Trouble(u'WARNING: video has no closed captions')
				# Prefer the user-requested language, then English, then
				# whatever is listed first.
				if self._downloader.params.get('subtitleslang', False):
					srt_lang = self._downloader.params.get('subtitleslang')
				elif 'en' in srt_lang_list:
					srt_lang = 'en'
				else:
					srt_lang = srt_lang_list[0]
				if not srt_lang in srt_lang_list:
					raise Trouble(u'WARNING: no closed captions found in the specified language')
				request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
				try:
					srt_xml = urllib2.urlopen(request).read()
				except (urllib2.URLError, httplib.HTTPException, socket.error), err:
					raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
				video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
			except Trouble as trouble:
				# trouble[0] is the message string (Python 2 exception indexing)
				self._downloader.trouble(trouble[0])

		# token
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		req_format = self._downloader.params.get('format', None)

		if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			# RTMP streams carry a single connection URL and no format code
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]
		elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
			# Build itag -> URL map from the comma-separated stream entries
			url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
			url_data = [parse_qs(uds) for uds in url_data_strs]
			url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
			url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

			format_limit = self._downloader.params.get('format_limit', None)
			available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
			if format_limit is not None and format_limit in available_formats:
				# Cap quality at format_limit by slicing the preference list
				format_list = available_formats[available_formats.index(format_limit):]
			else:
				format_list = available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if self._downloader.params.get('listformats', None):
				self._print_formats(existing_formats)
				return
			if req_format is None or req_format == 'best':
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == 'worst':
				video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
			elif req_format in ('-1', 'all'):
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			else:
				# Specific formats. We pick the first in a slash-delimeted sequence.
				# For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
				req_formats = req_format.split('/')
				video_url_list = None
				for rf in req_formats:
					if rf in url_map:
						video_url_list = [(rf, url_map[rf])]
						break
				if video_url_list is None:
					self._downloader.trouble(u'ERROR: requested format not available')
					return
		else:
			self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
			return

		results = []
		for format_param, video_real_url in video_url_list:
			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			results.append({
				'id':		video_id.decode('utf-8'),
				'url':		video_real_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	upload_date,
				'title':	video_title,
				'ext':		video_extension.decode('utf-8'),
				# and/or idiom: u'NA' for format-less (RTMP) downloads
				'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
				'thumbnail':	video_thumbnail.decode('utf-8'),
				'description':	video_description,
				'player_url':	player_url,
				'subtitles':	video_subtitles
			})
		return results
458
459
460 class MetacafeIE(InfoExtractor):
461         """Information Extractor for metacafe.com."""
462
463         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
464         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
465         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
466         IE_NAME = u'metacafe'
467
468         def __init__(self, downloader=None):
469                 InfoExtractor.__init__(self, downloader)
470
471         def report_disclaimer(self):
472                 """Report disclaimer retrieval."""
473                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
474
475         def report_age_confirmation(self):
476                 """Report attempt to confirm age."""
477                 self._downloader.to_screen(u'[metacafe] Confirming age')
478
479         def report_download_webpage(self, video_id):
480                 """Report webpage download."""
481                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
482
483         def report_extraction(self, video_id):
484                 """Report information extraction."""
485                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
486
487         def _real_initialize(self):
488                 # Retrieve disclaimer
489                 request = urllib2.Request(self._DISCLAIMER)
490                 try:
491                         self.report_disclaimer()
492                         disclaimer = urllib2.urlopen(request).read()
493                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
494                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
495                         return
496
497                 # Confirm age
498                 disclaimer_form = {
499                         'filters': '0',
500                         'submit': "Continue - I'm over 18",
501                         }
502                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
503                 try:
504                         self.report_age_confirmation()
505                         disclaimer = urllib2.urlopen(request).read()
506                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
507                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
508                         return
509
510         def _real_extract(self, url):
511                 # Extract id and simplified title from URL
512                 mobj = re.match(self._VALID_URL, url)
513                 if mobj is None:
514                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
515                         return
516
517                 video_id = mobj.group(1)
518
519                 # Check if video comes from YouTube
520                 mobj2 = re.match(r'^yt-(.*)$', video_id)
521                 if mobj2 is not None:
522                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
523                         return
524
525                 # Retrieve video webpage to extract further information
526                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
527                 try:
528                         self.report_download_webpage(video_id)
529                         webpage = urllib2.urlopen(request).read()
530                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
531                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
532                         return
533
534                 # Extract URL, uploader and title from webpage
535                 self.report_extraction(video_id)
536                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
537                 if mobj is not None:
538                         mediaURL = urllib.unquote(mobj.group(1))
539                         video_extension = mediaURL[-3:]
540
541                         # Extract gdaKey if available
542                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
543                         if mobj is None:
544                                 video_url = mediaURL
545                         else:
546                                 gdaKey = mobj.group(1)
547                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
548                 else:
549                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
550                         if mobj is None:
551                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
552                                 return
553                         vardict = parse_qs(mobj.group(1))
554                         if 'mediaData' not in vardict:
555                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
556                                 return
557                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
558                         if mobj is None:
559                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
560                                 return
561                         mediaURL = mobj.group(1).replace('\\/', '/')
562                         video_extension = mediaURL[-3:]
563                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
564
565                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
566                 if mobj is None:
567                         self._downloader.trouble(u'ERROR: unable to extract title')
568                         return
569                 video_title = mobj.group(1).decode('utf-8')
570
571                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
572                 if mobj is None:
573                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
574                         return
575                 video_uploader = mobj.group(1)
576
577                 return [{
578                         'id':           video_id.decode('utf-8'),
579                         'url':          video_url.decode('utf-8'),
580                         'uploader':     video_uploader.decode('utf-8'),
581                         'upload_date':  u'NA',
582                         'title':        video_title,
583                         'ext':          video_extension.decode('utf-8'),
584                         'format':       u'NA',
585                         'player_url':   None,
586                 }]
587
588
589 class DailymotionIE(InfoExtractor):
590         """Information Extractor for Dailymotion"""
591
592         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
593         IE_NAME = u'dailymotion'
594
595         def __init__(self, downloader=None):
596                 InfoExtractor.__init__(self, downloader)
597
598         def report_download_webpage(self, video_id):
599                 """Report webpage download."""
600                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
601
602         def report_extraction(self, video_id):
603                 """Report information extraction."""
604                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
605
606         def _real_extract(self, url):
607                 # Extract id and simplified title from URL
608                 mobj = re.match(self._VALID_URL, url)
609                 if mobj is None:
610                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
611                         return
612
613                 video_id = mobj.group(1)
614
615                 video_extension = 'flv'
616
617                 # Retrieve video webpage to extract further information
618                 request = urllib2.Request(url)
619                 request.add_header('Cookie', 'family_filter=off')
620                 try:
621                         self.report_download_webpage(video_id)
622                         webpage = urllib2.urlopen(request).read()
623                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
624                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
625                         return
626
627                 # Extract URL, uploader and title from webpage
628                 self.report_extraction(video_id)
629                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
630                 if mobj is None:
631                         self._downloader.trouble(u'ERROR: unable to extract media URL')
632                         return
633                 sequence = urllib.unquote(mobj.group(1))
634                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
635                 if mobj is None:
636                         self._downloader.trouble(u'ERROR: unable to extract media URL')
637                         return
638                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
639
640                 # if needed add http://www.dailymotion.com/ if relative URL
641
642                 video_url = mediaURL
643
644                 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
645                 if mobj is None:
646                         self._downloader.trouble(u'ERROR: unable to extract title')
647                         return
648                 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
649
650                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
651                 if mobj is None:
652                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
653                         return
654                 video_uploader = mobj.group(1)
655
656                 return [{
657                         'id':           video_id.decode('utf-8'),
658                         'url':          video_url.decode('utf-8'),
659                         'uploader':     video_uploader.decode('utf-8'),
660                         'upload_date':  u'NA',
661                         'title':        video_title,
662                         'ext':          video_extension.decode('utf-8'),
663                         'format':       u'NA',
664                         'player_url':   None,
665                 }]
666
667
668 class GoogleIE(InfoExtractor):
669         """Information extractor for video.google.com."""
670
671         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
672         IE_NAME = u'video.google'
673
674         def __init__(self, downloader=None):
675                 InfoExtractor.__init__(self, downloader)
676
677         def report_download_webpage(self, video_id):
678                 """Report webpage download."""
679                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
680
681         def report_extraction(self, video_id):
682                 """Report information extraction."""
683                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
684
685         def _real_extract(self, url):
686                 # Extract id from URL
687                 mobj = re.match(self._VALID_URL, url)
688                 if mobj is None:
689                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
690                         return
691
692                 video_id = mobj.group(1)
693
694                 video_extension = 'mp4'
695
696                 # Retrieve video webpage to extract further information
697                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
698                 try:
699                         self.report_download_webpage(video_id)
700                         webpage = urllib2.urlopen(request).read()
701                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
702                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
703                         return
704
705                 # Extract URL, uploader, and title from webpage
706                 self.report_extraction(video_id)
707                 mobj = re.search(r"download_url:'([^']+)'", webpage)
708                 if mobj is None:
709                         video_extension = 'flv'
710                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
711                 if mobj is None:
712                         self._downloader.trouble(u'ERROR: unable to extract media URL')
713                         return
714                 mediaURL = urllib.unquote(mobj.group(1))
715                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
716                 mediaURL = mediaURL.replace('\\x26', '\x26')
717
718                 video_url = mediaURL
719
720                 mobj = re.search(r'<title>(.*)</title>', webpage)
721                 if mobj is None:
722                         self._downloader.trouble(u'ERROR: unable to extract title')
723                         return
724                 video_title = mobj.group(1).decode('utf-8')
725
726                 # Extract video description
727                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
728                 if mobj is None:
729                         self._downloader.trouble(u'ERROR: unable to extract video description')
730                         return
731                 video_description = mobj.group(1).decode('utf-8')
732                 if not video_description:
733                         video_description = 'No description available.'
734
735                 # Extract video thumbnail
736                 if self._downloader.params.get('forcethumbnail', False):
737                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
738                         try:
739                                 webpage = urllib2.urlopen(request).read()
740                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
741                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
742                                 return
743                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
744                         if mobj is None:
745                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
746                                 return
747                         video_thumbnail = mobj.group(1)
748                 else:   # we need something to pass to process_info
749                         video_thumbnail = ''
750
751                 return [{
752                         'id':           video_id.decode('utf-8'),
753                         'url':          video_url.decode('utf-8'),
754                         'uploader':     u'NA',
755                         'upload_date':  u'NA',
756                         'title':        video_title,
757                         'ext':          video_extension.decode('utf-8'),
758                         'format':       u'NA',
759                         'player_url':   None,
760                 }]
761
762
763 class PhotobucketIE(InfoExtractor):
764         """Information extractor for photobucket.com."""
765
766         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
767         IE_NAME = u'photobucket'
768
769         def __init__(self, downloader=None):
770                 InfoExtractor.__init__(self, downloader)
771
772         def report_download_webpage(self, video_id):
773                 """Report webpage download."""
774                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
775
776         def report_extraction(self, video_id):
777                 """Report information extraction."""
778                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
779
780         def _real_extract(self, url):
781                 # Extract id from URL
782                 mobj = re.match(self._VALID_URL, url)
783                 if mobj is None:
784                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
785                         return
786
787                 video_id = mobj.group(1)
788
789                 video_extension = 'flv'
790
791                 # Retrieve video webpage to extract further information
792                 request = urllib2.Request(url)
793                 try:
794                         self.report_download_webpage(video_id)
795                         webpage = urllib2.urlopen(request).read()
796                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
797                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
798                         return
799
800                 # Extract URL, uploader, and title from webpage
801                 self.report_extraction(video_id)
802                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
803                 if mobj is None:
804                         self._downloader.trouble(u'ERROR: unable to extract media URL')
805                         return
806                 mediaURL = urllib.unquote(mobj.group(1))
807
808                 video_url = mediaURL
809
810                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
811                 if mobj is None:
812                         self._downloader.trouble(u'ERROR: unable to extract title')
813                         return
814                 video_title = mobj.group(1).decode('utf-8')
815
816                 video_uploader = mobj.group(2).decode('utf-8')
817
818                 return [{
819                         'id':           video_id.decode('utf-8'),
820                         'url':          video_url.decode('utf-8'),
821                         'uploader':     video_uploader,
822                         'upload_date':  u'NA',
823                         'title':        video_title,
824                         'ext':          video_extension.decode('utf-8'),
825                         'format':       u'NA',
826                         'player_url':   None,
827                 }]
828
829
830 class YahooIE(InfoExtractor):
831         """Information extractor for video.yahoo.com."""
832
833         # _VALID_URL matches all Yahoo! Video URLs
834         # _VPAGE_URL matches only the extractable '/watch/' URLs
835         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
836         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
837         IE_NAME = u'video.yahoo'
838
839         def __init__(self, downloader=None):
840                 InfoExtractor.__init__(self, downloader)
841
842         def report_download_webpage(self, video_id):
843                 """Report webpage download."""
844                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
845
846         def report_extraction(self, video_id):
847                 """Report information extraction."""
848                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
849
850         def _real_extract(self, url, new_video=True):
851                 # Extract ID from URL
852                 mobj = re.match(self._VALID_URL, url)
853                 if mobj is None:
854                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
855                         return
856
857                 video_id = mobj.group(2)
858                 video_extension = 'flv'
859
860                 # Rewrite valid but non-extractable URLs as
861                 # extractable English language /watch/ URLs
862                 if re.match(self._VPAGE_URL, url) is None:
863                         request = urllib2.Request(url)
864                         try:
865                                 webpage = urllib2.urlopen(request).read()
866                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
867                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
868                                 return
869
870                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
871                         if mobj is None:
872                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
873                                 return
874                         yahoo_id = mobj.group(1)
875
876                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
877                         if mobj is None:
878                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
879                                 return
880                         yahoo_vid = mobj.group(1)
881
882                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
883                         return self._real_extract(url, new_video=False)
884
885                 # Retrieve video webpage to extract further information
886                 request = urllib2.Request(url)
887                 try:
888                         self.report_download_webpage(video_id)
889                         webpage = urllib2.urlopen(request).read()
890                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
891                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
892                         return
893
894                 # Extract uploader and title from webpage
895                 self.report_extraction(video_id)
896                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
897                 if mobj is None:
898                         self._downloader.trouble(u'ERROR: unable to extract video title')
899                         return
900                 video_title = mobj.group(1).decode('utf-8')
901
902                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
903                 if mobj is None:
904                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
905                         return
906                 video_uploader = mobj.group(1).decode('utf-8')
907
908                 # Extract video thumbnail
909                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
910                 if mobj is None:
911                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
912                         return
913                 video_thumbnail = mobj.group(1).decode('utf-8')
914
915                 # Extract video description
916                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
917                 if mobj is None:
918                         self._downloader.trouble(u'ERROR: unable to extract video description')
919                         return
920                 video_description = mobj.group(1).decode('utf-8')
921                 if not video_description:
922                         video_description = 'No description available.'
923
924                 # Extract video height and width
925                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
926                 if mobj is None:
927                         self._downloader.trouble(u'ERROR: unable to extract video height')
928                         return
929                 yv_video_height = mobj.group(1)
930
931                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
932                 if mobj is None:
933                         self._downloader.trouble(u'ERROR: unable to extract video width')
934                         return
935                 yv_video_width = mobj.group(1)
936
937                 # Retrieve video playlist to extract media URL
938                 # I'm not completely sure what all these options are, but we
939                 # seem to need most of them, otherwise the server sends a 401.
940                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
941                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
942                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
943                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
944                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
945                 try:
946                         self.report_download_webpage(video_id)
947                         webpage = urllib2.urlopen(request).read()
948                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
949                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
950                         return
951
952                 # Extract media URL from playlist XML
953                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
954                 if mobj is None:
955                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
956                         return
957                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
958                 video_url = unescapeHTML(video_url)
959
960                 return [{
961                         'id':           video_id.decode('utf-8'),
962                         'url':          video_url,
963                         'uploader':     video_uploader,
964                         'upload_date':  u'NA',
965                         'title':        video_title,
966                         'ext':          video_extension.decode('utf-8'),
967                         'thumbnail':    video_thumbnail.decode('utf-8'),
968                         'description':  video_description,
969                         'thumbnail':    video_thumbnail,
970                         'player_url':   None,
971                 }]
972
973
974 class VimeoIE(InfoExtractor):
975         """Information extractor for vimeo.com."""
976
977         # _VALID_URL matches Vimeo URLs
978         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
979         IE_NAME = u'vimeo'
980
981         def __init__(self, downloader=None):
982                 InfoExtractor.__init__(self, downloader)
983
984         def report_download_webpage(self, video_id):
985                 """Report webpage download."""
986                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
987
988         def report_extraction(self, video_id):
989                 """Report information extraction."""
990                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
991
992         def _real_extract(self, url, new_video=True):
993                 # Extract ID from URL
994                 mobj = re.match(self._VALID_URL, url)
995                 if mobj is None:
996                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
997                         return
998
999                 video_id = mobj.group(1)
1000
1001                 # Retrieve video webpage to extract further information
1002                 request = urllib2.Request(url, None, std_headers)
1003                 try:
1004                         self.report_download_webpage(video_id)
1005                         webpage = urllib2.urlopen(request).read()
1006                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1007                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1008                         return
1009
1010                 # Now we begin extracting as much information as we can from what we
1011                 # retrieved. First we extract the information common to all extractors,
1012                 # and latter we extract those that are Vimeo specific.
1013                 self.report_extraction(video_id)
1014
1015                 # Extract the config JSON
1016                 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1017                 try:
1018                         config = json.loads(config)
1019                 except:
1020                         self._downloader.trouble(u'ERROR: unable to extract info section')
1021                         return
1022                 
1023                 # Extract title
1024                 video_title = config["video"]["title"]
1025
1026                 # Extract uploader
1027                 video_uploader = config["video"]["owner"]["name"]
1028
1029                 # Extract video thumbnail
1030                 video_thumbnail = config["video"]["thumbnail"]
1031
1032                 # Extract video description
1033                 video_description = get_element_by_id("description", webpage.decode('utf8'))
1034                 if video_description: video_description = clean_html(video_description)
1035                 else: video_description = ''
1036
1037                 # Extract upload date
1038                 video_upload_date = u'NA'
1039                 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1040                 if mobj is not None:
1041                         video_upload_date = mobj.group(1)
1042
1043                 # Vimeo specific: extract request signature and timestamp
1044                 sig = config['request']['signature']
1045                 timestamp = config['request']['timestamp']
1046
1047                 # Vimeo specific: extract video codec and quality information
1048                 # TODO bind to format param
1049                 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1050                 for codec in codecs:
1051                         if codec[0] in config["video"]["files"]:
1052                                 video_codec = codec[0]
1053                                 video_extension = codec[1]
1054                                 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1055                                 else: quality = 'sd'
1056                                 break
1057                 else:
1058                         self._downloader.trouble(u'ERROR: no known codec found')
1059                         return
1060
1061                 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1062                                         %(video_id, sig, timestamp, quality, video_codec.upper())
1063
1064                 return [{
1065                         'id':           video_id,
1066                         'url':          video_url,
1067                         'uploader':     video_uploader,
1068                         'upload_date':  video_upload_date,
1069                         'title':        video_title,
1070                         'ext':          video_extension,
1071                         'thumbnail':    video_thumbnail,
1072                         'description':  video_description,
1073                         'player_url':   None,
1074                 }]
1075
1076
1077 class GenericIE(InfoExtractor):
1078         """Generic last-resort information extractor."""
1079
1080         _VALID_URL = r'.*'
1081         IE_NAME = u'generic'
1082
	def __init__(self, downloader=None):
		# Plain pass-through to the base class, kept for symmetry with the
		# other extractors in this file.
		InfoExtractor.__init__(self, downloader)
1085
	def report_download_webpage(self, video_id):
		"""Report webpage download, with a warning: reaching this extractor
		means no site-specific extractor matched the URL."""
		self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
		self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1090
	def report_extraction(self, video_id):
		"""Report the start of information extraction for video_id."""
		self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1094
	def report_following_redirect(self, new_url):
		"""Report that a redirect to new_url is being followed."""
		self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1098                 
	def _test_redirect(self, url):
		"""Check if it is a redirect, like url shorteners, in case restart chain."""
		# HEAD avoids downloading the response body just to learn the
		# final URL.
		class HeadRequest(urllib2.Request):
			def get_method(self):
				return "HEAD"

		class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
			"""
			Subclass the HTTPRedirectHandler to make it use our
			HeadRequest also on the redirected URL
			"""
			def redirect_request(self, req, fp, code, msg, headers, newurl):
				if code in (301, 302, 303, 307):
					newurl = newurl.replace(' ', '%20')
					# Drop entity headers: a HEAD request carries no body.
					newheaders = dict((k,v) for k,v in req.headers.items()
									  if k.lower() not in ("content-length", "content-type"))
					return HeadRequest(newurl,
									   headers=newheaders,
									   origin_req_host=req.get_origin_req_host(),
									   unverifiable=True)
				else:
					# Any other redirect status is surfaced as an error.
					raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)

		class HTTPMethodFallback(urllib2.BaseHandler):
			"""
			Fallback to GET if HEAD is not allowed (405 HTTP error)
			"""
			def http_error_405(self, req, fp, code, msg, headers):
				# Drain and close the 405 response before retrying as GET.
				fp.read()
				fp.close()

				newheaders = dict((k,v) for k,v in req.headers.items()
								  if k.lower() not in ("content-length", "content-type"))
				return self.parent.open(urllib2.Request(req.get_full_url(),
												 headers=newheaders,
												 origin_req_host=req.get_origin_req_host(),
												 unverifiable=True))

		# Build our opener
		# A bare OpenerDirector is assembled by hand (instead of
		# urllib2.build_opener) so that exactly these handlers -- and no
		# default redirect handler -- take part in the request.
		opener = urllib2.OpenerDirector()
		for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
						HTTPMethodFallback, HEADRedirectHandler,
						urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
			opener.add_handler(handler())

		response = opener.open(HeadRequest(url))
		new_url = response.geturl()

		# No redirect happened: let the normal generic extraction continue.
		if url == new_url: return False

		# Restart the download chain on the resolved URL.
		self.report_following_redirect(new_url)
		self._downloader.download([new_url])
		return True
1152
1153         def _real_extract(self, url):
1154                 if self._test_redirect(url): return
1155
1156                 video_id = url.split('/')[-1]
1157                 request = urllib2.Request(url)
1158                 try:
1159                         self.report_download_webpage(video_id)
1160                         webpage = urllib2.urlopen(request).read()
1161                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1162                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1163                         return
1164                 except ValueError, err:
1165                         # since this is the last-resort InfoExtractor, if
1166                         # this error is thrown, it'll be thrown here
1167                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1168                         return
1169
1170                 self.report_extraction(video_id)
1171                 # Start with something easy: JW Player in SWFObject
1172                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1173                 if mobj is None:
1174                         # Broaden the search a little bit
1175                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1176                 if mobj is None:
1177                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1178                         return
1179
1180                 # It's possible that one of the regexes
1181                 # matched, but returned an empty group:
1182                 if mobj.group(1) is None:
1183                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1184                         return
1185
1186                 video_url = urllib.unquote(mobj.group(1))
1187                 video_id = os.path.basename(video_url)
1188
1189                 # here's a fun little line of code for you:
1190                 video_extension = os.path.splitext(video_id)[1][1:]
1191                 video_id = os.path.splitext(video_id)[0]
1192
1193                 # it's tempting to parse this further, but you would
1194                 # have to take into account all the variations like
1195                 #   Video Title - Site Name
1196                 #   Site Name | Video Title
1197                 #   Video Title - Tagline | Site Name
1198                 # and so on and so forth; it's just not practical
1199                 mobj = re.search(r'<title>(.*)</title>', webpage)
1200                 if mobj is None:
1201                         self._downloader.trouble(u'ERROR: unable to extract title')
1202                         return
1203                 video_title = mobj.group(1).decode('utf-8')
1204
1205                 # video uploader is domain name
1206                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1207                 if mobj is None:
1208                         self._downloader.trouble(u'ERROR: unable to extract title')
1209                         return
1210                 video_uploader = mobj.group(1).decode('utf-8')
1211
1212                 return [{
1213                         'id':           video_id.decode('utf-8'),
1214                         'url':          video_url.decode('utf-8'),
1215                         'uploader':     video_uploader,
1216                         'upload_date':  u'NA',
1217                         'title':        video_title,
1218                         'ext':          video_extension.decode('utf-8'),
1219                         'format':       u'NA',
1220                         'player_url':   None,
1221                 }]
1222
1223
1224 class YoutubeSearchIE(InfoExtractor):
1225         """Information Extractor for YouTube search queries."""
1226         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1227         _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1228         _max_youtube_results = 1000
1229         IE_NAME = u'youtube:search'
1230
1231         def __init__(self, downloader=None):
1232                 InfoExtractor.__init__(self, downloader)
1233
1234         def report_download_page(self, query, pagenum):
1235                 """Report attempt to download search page with given number."""
1236                 query = query.decode(preferredencoding())
1237                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1238
1239         def _real_extract(self, query):
1240                 mobj = re.match(self._VALID_URL, query)
1241                 if mobj is None:
1242                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1243                         return
1244
1245                 prefix, query = query.split(':')
1246                 prefix = prefix[8:]
1247                 query = query.encode('utf-8')
1248                 if prefix == '':
1249                         self._download_n_results(query, 1)
1250                         return
1251                 elif prefix == 'all':
1252                         self._download_n_results(query, self._max_youtube_results)
1253                         return
1254                 else:
1255                         try:
1256                                 n = long(prefix)
1257                                 if n <= 0:
1258                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1259                                         return
1260                                 elif n > self._max_youtube_results:
1261                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1262                                         n = self._max_youtube_results
1263                                 self._download_n_results(query, n)
1264                                 return
1265                         except ValueError: # parsing prefix as integer fails
1266                                 self._download_n_results(query, 1)
1267                                 return
1268
1269         def _download_n_results(self, query, n):
1270                 """Downloads a specified number of results for a query"""
1271
1272                 video_ids = []
1273                 pagenum = 0
1274                 limit = n
1275
1276                 while (50 * pagenum) < limit:
1277                         self.report_download_page(query, pagenum+1)
1278                         result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1279                         request = urllib2.Request(result_url)
1280                         try:
1281                                 data = urllib2.urlopen(request).read()
1282                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1283                                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1284                                 return
1285                         api_response = json.loads(data)['data']
1286
1287                         new_ids = list(video['id'] for video in api_response['items'])
1288                         video_ids += new_ids
1289
1290                         limit = min(n, api_response['totalItems'])
1291                         pagenum += 1
1292
1293                 if len(video_ids) > n:
1294                         video_ids = video_ids[:n]
1295                 for id in video_ids:
1296                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1297                 return
1298
1299
1300 class GoogleSearchIE(InfoExtractor):
1301         """Information Extractor for Google Video search queries."""
1302         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1303         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1304         _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1305         _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1306         _max_google_results = 1000
1307         IE_NAME = u'video.google:search'
1308
1309         def __init__(self, downloader=None):
1310                 InfoExtractor.__init__(self, downloader)
1311
1312         def report_download_page(self, query, pagenum):
1313                 """Report attempt to download playlist page with given number."""
1314                 query = query.decode(preferredencoding())
1315                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1316
1317         def _real_extract(self, query):
1318                 mobj = re.match(self._VALID_URL, query)
1319                 if mobj is None:
1320                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1321                         return
1322
1323                 prefix, query = query.split(':')
1324                 prefix = prefix[8:]
1325                 query = query.encode('utf-8')
1326                 if prefix == '':
1327                         self._download_n_results(query, 1)
1328                         return
1329                 elif prefix == 'all':
1330                         self._download_n_results(query, self._max_google_results)
1331                         return
1332                 else:
1333                         try:
1334                                 n = long(prefix)
1335                                 if n <= 0:
1336                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1337                                         return
1338                                 elif n > self._max_google_results:
1339                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1340                                         n = self._max_google_results
1341                                 self._download_n_results(query, n)
1342                                 return
1343                         except ValueError: # parsing prefix as integer fails
1344                                 self._download_n_results(query, 1)
1345                                 return
1346
1347         def _download_n_results(self, query, n):
1348                 """Downloads a specified number of results for a query"""
1349
1350                 video_ids = []
1351                 pagenum = 0
1352
1353                 while True:
1354                         self.report_download_page(query, pagenum)
1355                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1356                         request = urllib2.Request(result_url)
1357                         try:
1358                                 page = urllib2.urlopen(request).read()
1359                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1360                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1361                                 return
1362
1363                         # Extract video identifiers
1364                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1365                                 video_id = mobj.group(1)
1366                                 if video_id not in video_ids:
1367                                         video_ids.append(video_id)
1368                                         if len(video_ids) == n:
1369                                                 # Specified n videos reached
1370                                                 for id in video_ids:
1371                                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1372                                                 return
1373
1374                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1375                                 for id in video_ids:
1376                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1377                                 return
1378
1379                         pagenum = pagenum + 1
1380
1381
1382 class YahooSearchIE(InfoExtractor):
1383         """Information Extractor for Yahoo! Video search queries."""
1384         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1385         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1386         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1387         _MORE_PAGES_INDICATOR = r'\s*Next'
1388         _max_yahoo_results = 1000
1389         IE_NAME = u'video.yahoo:search'
1390
1391         def __init__(self, downloader=None):
1392                 InfoExtractor.__init__(self, downloader)
1393
1394         def report_download_page(self, query, pagenum):
1395                 """Report attempt to download playlist page with given number."""
1396                 query = query.decode(preferredencoding())
1397                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1398
1399         def _real_extract(self, query):
1400                 mobj = re.match(self._VALID_URL, query)
1401                 if mobj is None:
1402                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1403                         return
1404
1405                 prefix, query = query.split(':')
1406                 prefix = prefix[8:]
1407                 query = query.encode('utf-8')
1408                 if prefix == '':
1409                         self._download_n_results(query, 1)
1410                         return
1411                 elif prefix == 'all':
1412                         self._download_n_results(query, self._max_yahoo_results)
1413                         return
1414                 else:
1415                         try:
1416                                 n = long(prefix)
1417                                 if n <= 0:
1418                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1419                                         return
1420                                 elif n > self._max_yahoo_results:
1421                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1422                                         n = self._max_yahoo_results
1423                                 self._download_n_results(query, n)
1424                                 return
1425                         except ValueError: # parsing prefix as integer fails
1426                                 self._download_n_results(query, 1)
1427                                 return
1428
1429         def _download_n_results(self, query, n):
1430                 """Downloads a specified number of results for a query"""
1431
1432                 video_ids = []
1433                 already_seen = set()
1434                 pagenum = 1
1435
1436                 while True:
1437                         self.report_download_page(query, pagenum)
1438                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1439                         request = urllib2.Request(result_url)
1440                         try:
1441                                 page = urllib2.urlopen(request).read()
1442                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1443                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1444                                 return
1445
1446                         # Extract video identifiers
1447                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1448                                 video_id = mobj.group(1)
1449                                 if video_id not in already_seen:
1450                                         video_ids.append(video_id)
1451                                         already_seen.add(video_id)
1452                                         if len(video_ids) == n:
1453                                                 # Specified n videos reached
1454                                                 for id in video_ids:
1455                                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1456                                                 return
1457
1458                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1459                                 for id in video_ids:
1460                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1461                                 return
1462
1463                         pagenum = pagenum + 1
1464
1465
1466 class YoutubePlaylistIE(InfoExtractor):
1467         """Information Extractor for YouTube playlists."""
1468
1469         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1470         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1471         _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;list=(PL)?%s&'
1472         _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1473         IE_NAME = u'youtube:playlist'
1474
1475         def __init__(self, downloader=None):
1476                 InfoExtractor.__init__(self, downloader)
1477
1478         def report_download_page(self, playlist_id, pagenum):
1479                 """Report attempt to download playlist page with given number."""
1480                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1481
1482         def _real_extract(self, url):
1483                 # Extract playlist id
1484                 mobj = re.match(self._VALID_URL, url)
1485                 if mobj is None:
1486                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1487                         return
1488
1489                 # Single video case
1490                 if mobj.group(3) is not None:
1491                         self._downloader.download([mobj.group(3)])
1492                         return
1493
1494                 # Download playlist pages
1495                 # prefix is 'p' as default for playlists but there are other types that need extra care
1496                 playlist_prefix = mobj.group(1)
1497                 if playlist_prefix == 'a':
1498                         playlist_access = 'artist'
1499                 else:
1500                         playlist_prefix = 'p'
1501                         playlist_access = 'view_play_list'
1502                 playlist_id = mobj.group(2)
1503                 video_ids = []
1504                 pagenum = 1
1505
1506                 while True:
1507                         self.report_download_page(playlist_id, pagenum)
1508                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1509                         request = urllib2.Request(url)
1510                         try:
1511                                 page = urllib2.urlopen(request).read()
1512                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1513                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1514                                 return
1515
1516                         # Extract video identifiers
1517                         ids_in_page = []
1518                         for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1519                                 if mobj.group(1) not in ids_in_page:
1520                                         ids_in_page.append(mobj.group(1))
1521                         video_ids.extend(ids_in_page)
1522
1523                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1524                                 break
1525                         pagenum = pagenum + 1
1526
1527                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1528                 playlistend = self._downloader.params.get('playlistend', -1)
1529                 if playlistend == -1:
1530                         video_ids = video_ids[playliststart:]
1531                 else:
1532                         video_ids = video_ids[playliststart:playlistend]
1533
1534                 for id in video_ids:
1535                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1536                 return
1537
1538
1539 class YoutubeUserIE(InfoExtractor):
1540         """Information Extractor for YouTube users."""
1541
1542         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1543         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1544         _GDATA_PAGE_SIZE = 50
1545         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1546         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1547         IE_NAME = u'youtube:user'
1548
1549         def __init__(self, downloader=None):
1550                 InfoExtractor.__init__(self, downloader)
1551
1552         def report_download_page(self, username, start_index):
1553                 """Report attempt to download user page."""
1554                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1555                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1556
1557         def _real_extract(self, url):
1558                 # Extract username
1559                 mobj = re.match(self._VALID_URL, url)
1560                 if mobj is None:
1561                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1562                         return
1563
1564                 username = mobj.group(1)
1565
1566                 # Download video ids using YouTube Data API. Result size per
1567                 # query is limited (currently to 50 videos) so we need to query
1568                 # page by page until there are no video ids - it means we got
1569                 # all of them.
1570
1571                 video_ids = []
1572                 pagenum = 0
1573
1574                 while True:
1575                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1576                         self.report_download_page(username, start_index)
1577
1578                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1579
1580                         try:
1581                                 page = urllib2.urlopen(request).read()
1582                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1583                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1584                                 return
1585
1586                         # Extract video identifiers
1587                         ids_in_page = []
1588
1589                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1590                                 if mobj.group(1) not in ids_in_page:
1591                                         ids_in_page.append(mobj.group(1))
1592
1593                         video_ids.extend(ids_in_page)
1594
1595                         # A little optimization - if current page is not
1596                         # "full", ie. does not contain PAGE_SIZE video ids then
1597                         # we can assume that this page is the last one - there
1598                         # are no more ids on further pages - no need to query
1599                         # again.
1600
1601                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1602                                 break
1603
1604                         pagenum += 1
1605
1606                 all_ids_count = len(video_ids)
1607                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1608                 playlistend = self._downloader.params.get('playlistend', -1)
1609
1610                 if playlistend == -1:
1611                         video_ids = video_ids[playliststart:]
1612                 else:
1613                         video_ids = video_ids[playliststart:playlistend]
1614
1615                 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1616                                 (username, all_ids_count, len(video_ids)))
1617
1618                 for video_id in video_ids:
1619                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1620
1621
1622 class DepositFilesIE(InfoExtractor):
1623         """Information extractor for depositfiles.com"""
1624
1625         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1626         IE_NAME = u'DepositFiles'
1627
1628         def __init__(self, downloader=None):
1629                 InfoExtractor.__init__(self, downloader)
1630
1631         def report_download_webpage(self, file_id):
1632                 """Report webpage download."""
1633                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1634
1635         def report_extraction(self, file_id):
1636                 """Report information extraction."""
1637                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1638
1639         def _real_extract(self, url):
1640                 file_id = url.split('/')[-1]
1641                 # Rebuild url in english locale
1642                 url = 'http://depositfiles.com/en/files/' + file_id
1643
1644                 # Retrieve file webpage with 'Free download' button pressed
1645                 free_download_indication = { 'gateway_result' : '1' }
1646                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1647                 try:
1648                         self.report_download_webpage(file_id)
1649                         webpage = urllib2.urlopen(request).read()
1650                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1651                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1652                         return
1653
1654                 # Search for the real file URL
1655                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1656                 if (mobj is None) or (mobj.group(1) is None):
1657                         # Try to figure out reason of the error.
1658                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1659                         if (mobj is not None) and (mobj.group(1) is not None):
1660                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1661                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1662                         else:
1663                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1664                         return
1665
1666                 file_url = mobj.group(1)
1667                 file_extension = os.path.splitext(file_url)[1][1:]
1668
1669                 # Search for file title
1670                 mobj = re.search(r'<b title="(.*?)">', webpage)
1671                 if mobj is None:
1672                         self._downloader.trouble(u'ERROR: unable to extract title')
1673                         return
1674                 file_title = mobj.group(1).decode('utf-8')
1675
1676                 return [{
1677                         'id':           file_id.decode('utf-8'),
1678                         'url':          file_url.decode('utf-8'),
1679                         'uploader':     u'NA',
1680                         'upload_date':  u'NA',
1681                         'title':        file_title,
1682                         'ext':          file_extension.decode('utf-8'),
1683                         'format':       u'NA',
1684                         'player_url':   None,
1685                 }]
1686
1687
1688 class FacebookIE(InfoExtractor):
1689         """Information Extractor for Facebook"""
1690
1691         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1692         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1693         _NETRC_MACHINE = 'facebook'
1694         _available_formats = ['video', 'highqual', 'lowqual']
1695         _video_extensions = {
1696                 'video': 'mp4',
1697                 'highqual': 'mp4',
1698                 'lowqual': 'mp4',
1699         }
1700         IE_NAME = u'facebook'
1701
	def __init__(self, downloader=None):
		# Plain delegation to the InfoExtractor base-class constructor.
		InfoExtractor.__init__(self, downloader)
1704
	def _reporter(self, message):
		"""Add header and report message."""
		# All FacebookIE status output funnels through here so every line
		# carries the '[facebook]' tag.
		self._downloader.to_screen(u'[facebook] %s' % message)
1708
	def report_login(self):
		"""Report attempt to log in."""
		self._reporter(u'Logging in')
1712
	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._reporter(u'%s: Downloading video webpage' % video_id)
1716
	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._reporter(u'%s: Extracting video information' % video_id)
1720
1721         def _parse_page(self, video_webpage):
1722                 """Extract video information from page"""
1723                 # General data
1724                 data = {'title': r'\("video_title", "(.*?)"\)',
1725                         'description': r'<div class="datawrap">(.*?)</div>',
1726                         'owner': r'\("video_owner_name", "(.*?)"\)',
1727                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1728                         }
1729                 video_info = {}
1730                 for piece in data.keys():
1731                         mobj = re.search(data[piece], video_webpage)
1732                         if mobj is not None:
1733                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1734
1735                 # Video urls
1736                 video_urls = {}
1737                 for fmt in self._available_formats:
1738                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1739                         if mobj is not None:
1740                                 # URL is in a Javascript segment inside an escaped Unicode format within
1741                                 # the generally utf-8 page
1742                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1743                 video_info['video_urls'] = video_urls
1744
1745                 return video_info
1746
	def _real_initialize(self):
		"""Log in to Facebook before extraction, using credentials from the
		command line options or from ~/.netrc; does nothing when no
		credentials are available."""
		if self._downloader is None:
			return

		useremail = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			useremail = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					useremail = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				# Best-effort: a missing or broken .netrc only warns, it
				# does not abort the run.
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		if useremail is None:
			# No credentials available; proceed without logging in.
			return

		# Log in
		login_form = {
			'email': useremail,
			'pass': password,
			'login': 'Log+In'
			}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# A login form in the response means Facebook rejected the
			# credentials (or rate-limited the attempt).
			if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return
1790
1791         def _real_extract(self, url):
1792                 mobj = re.match(self._VALID_URL, url)
1793                 if mobj is None:
1794                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1795                         return
1796                 video_id = mobj.group('ID')
1797
1798                 # Get video webpage
1799                 self.report_video_webpage_download(video_id)
1800                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
1801                 try:
1802                         page = urllib2.urlopen(request)
1803                         video_webpage = page.read()
1804                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1805                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1806                         return
1807
1808                 # Start extracting information
1809                 self.report_information_extraction(video_id)
1810
1811                 # Extract information
1812                 video_info = self._parse_page(video_webpage)
1813
1814                 # uploader
1815                 if 'owner' not in video_info:
1816                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1817                         return
1818                 video_uploader = video_info['owner']
1819
1820                 # title
1821                 if 'title' not in video_info:
1822                         self._downloader.trouble(u'ERROR: unable to extract video title')
1823                         return
1824                 video_title = video_info['title']
1825                 video_title = video_title.decode('utf-8')
1826
1827                 # thumbnail image
1828                 if 'thumbnail' not in video_info:
1829                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1830                         video_thumbnail = ''
1831                 else:
1832                         video_thumbnail = video_info['thumbnail']
1833
1834                 # upload date
1835                 upload_date = u'NA'
1836                 if 'upload_date' in video_info:
1837                         upload_time = video_info['upload_date']
1838                         timetuple = email.utils.parsedate_tz(upload_time)
1839                         if timetuple is not None:
1840                                 try:
1841                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
1842                                 except:
1843                                         pass
1844
1845                 # description
1846                 video_description = video_info.get('description', 'No description available.')
1847
1848                 url_map = video_info['video_urls']
1849                 if len(url_map.keys()) > 0:
1850                         # Decide which formats to download
1851                         req_format = self._downloader.params.get('format', None)
1852                         format_limit = self._downloader.params.get('format_limit', None)
1853
1854                         if format_limit is not None and format_limit in self._available_formats:
1855                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1856                         else:
1857                                 format_list = self._available_formats
1858                         existing_formats = [x for x in format_list if x in url_map]
1859                         if len(existing_formats) == 0:
1860                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1861                                 return
1862                         if req_format is None:
1863                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1864                         elif req_format == 'worst':
1865                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1866                         elif req_format == '-1':
1867                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1868                         else:
1869                                 # Specific format
1870                                 if req_format not in url_map:
1871                                         self._downloader.trouble(u'ERROR: requested format not available')
1872                                         return
1873                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1874
1875                 results = []
1876                 for format_param, video_real_url in video_url_list:
1877                         # Extension
1878                         video_extension = self._video_extensions.get(format_param, 'mp4')
1879
1880                         results.append({
1881                                 'id':           video_id.decode('utf-8'),
1882                                 'url':          video_real_url.decode('utf-8'),
1883                                 'uploader':     video_uploader.decode('utf-8'),
1884                                 'upload_date':  upload_date,
1885                                 'title':        video_title,
1886                                 'ext':          video_extension.decode('utf-8'),
1887                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1888                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1889                                 'description':  video_description.decode('utf-8'),
1890                                 'player_url':   None,
1891                         })
1892                 return results
1893
class BlipTVIE(InfoExtractor):
	"""Information extractor for blip.tv"""

	# Any path on blip.tv is accepted; the JSON API resolves it below.
	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
	# Captures the filename extension of a media URL.
	_URL_EXT = r'^.*\.([a-z0-9]+)$'
	IE_NAME = u'blip.tv'

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

	def report_direct_download(self, title):
		"""Report that the URL points directly at a media file."""
		self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

	def _real_extract(self, url):
		"""Extract video information for a blip.tv URL.

		Two cases are handled: the URL serves the media file directly
		(detected via the Content-Type header), or it is a regular page
		whose metadata is fetched through the site's JSON skin.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Re-request the same URL with the JSON skin parameters to get
		# machine-readable metadata instead of the HTML page.
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url)
		self.report_extraction(mobj.group(1))
		info = None
		try:
			urlh = urllib2.urlopen(request)
			if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
				basename = url.split('/')[-1]
				title,ext = os.path.splitext(basename)
				title = title.decode('UTF-8')
				ext = ext.replace('.', '')
				self.report_direct_download(title)
				# 'urlhandle' hands the already-open connection to the
				# downloader so the file is not requested twice.
				info = {
					'id': title,
					'url': url,
					'title': title,
					'ext': ext,
					'urlhandle': urlh
				}
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
			return
		if info is None: # Regular URL
			try:
				json_code = urlh.read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
				return

			try:
				# NOTE(review): `json` is not imported in this module
				# directly; presumably it comes from `from utils import *`
				# — verify.
				json_data = json.loads(json_code)
				if 'Post' in json_data:
					data = json_data['Post']
				else:
					data = json_data

				# Convert e.g. '10-31-11 09:45PM' to 'YYYYMMDD'.
				upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
				video_url = data['media']['url']
				umobj = re.match(self._URL_EXT, video_url)
				if umobj is None:
					raise ValueError('Can not determine filename extension')
				ext = umobj.group(1)

				info = {
					'id': data['item_id'],
					'url': video_url,
					'uploader': data['display_name'],
					'upload_date': upload_date,
					'title': data['title'],
					'ext': ext,
					'format': data['media']['mimeType'],
					'thumbnail': data['thumbnailUrl'],
					'description': data['description'],
					'player_url': data['embedUrl']
				}
			except (ValueError,KeyError), err:
				self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
				return

		return [info]
1979
1980
1981 class MyVideoIE(InfoExtractor):
1982         """Information Extractor for myvideo.de."""
1983
1984         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
1985         IE_NAME = u'myvideo'
1986
1987         def __init__(self, downloader=None):
1988                 InfoExtractor.__init__(self, downloader)
1989         
1990         def report_download_webpage(self, video_id):
1991                 """Report webpage download."""
1992                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
1993
1994         def report_extraction(self, video_id):
1995                 """Report information extraction."""
1996                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
1997
1998         def _real_extract(self,url):
1999                 mobj = re.match(self._VALID_URL, url)
2000                 if mobj is None:
2001                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2002                         return
2003
2004                 video_id = mobj.group(1)
2005
2006                 # Get video webpage
2007                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2008                 try:
2009                         self.report_download_webpage(video_id)
2010                         webpage = urllib2.urlopen(request).read()
2011                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2012                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2013                         return
2014
2015                 self.report_extraction(video_id)
2016                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2017                                  webpage)
2018                 if mobj is None:
2019                         self._downloader.trouble(u'ERROR: unable to extract media URL')
2020                         return
2021                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2022
2023                 mobj = re.search('<title>([^<]+)</title>', webpage)
2024                 if mobj is None:
2025                         self._downloader.trouble(u'ERROR: unable to extract title')
2026                         return
2027
2028                 video_title = mobj.group(1)
2029
2030                 return [{
2031                         'id':           video_id,
2032                         'url':          video_url,
2033                         'uploader':     u'NA',
2034                         'upload_date':  u'NA',
2035                         'title':        video_title,
2036                         'ext':          u'flv',
2037                         'format':       u'NA',
2038                         'player_url':   None,
2039                 }]
2040
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Accepts either a shortcut such as ":tds" / ":colbertreport", or a
	# full episode URL on thedailyshow.com / colbertnation.com.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report media configuration download."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report show index (MRSS feed) download."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report determination of the SWF player URL."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _real_extract(self, url):
		"""Extract one info dict per video part of the requested episode."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Map the ":shortcut" forms onto the show's full-episodes page.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = u'http://www.thedailyshow.com/full-episodes/'
			else:
				url = u'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# With no explicit episode, the site redirects to the newest one;
		# the final episode title is recovered from the redirect below.
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# Re-match the redirected URL to learn which episode we got.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# The page embeds a Flash player whose URL carries an
		# mtvnservices URI identifying the episode; group 0 is the full
		# player URL, group 1 the embedded URI.
		mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		# Follow redirects to obtain the canonical player URL.
		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		# The MRSS feed lists the individual video parts of the episode.
		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		results = []

		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			# The guid is colon-separated; its last component is the media
			# id and the one before it the show domain (".com" stripped).
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			# Each part has its own configuration XML listing renditions.
			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			# Collect (bitrate, url) pairs for the available renditions.
			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			effTitle = showId + u'-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			results.append(info)

		return results
2172
2173
2174 class EscapistIE(InfoExtractor):
2175         """Information extractor for The Escapist """
2176
2177         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2178         IE_NAME = u'escapist'
2179
2180         def report_extraction(self, showName):
2181                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2182
2183         def report_config_download(self, showName):
2184                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2185
2186         def _real_extract(self, url):
2187                 mobj = re.match(self._VALID_URL, url)
2188                 if mobj is None:
2189                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2190                         return
2191                 showName = mobj.group('showname')
2192                 videoId = mobj.group('episode')
2193
2194                 self.report_extraction(showName)
2195                 try:
2196                         webPageBytes = urllib2.urlopen(url).read()
2197                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2198                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2199                         return
2200
2201                 webPage = webPageBytes.decode('utf-8')
2202                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2203                 description = unescapeHTML(descMatch.group(1))
2204                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2205                 imgUrl = unescapeHTML(imgMatch.group(1))
2206                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2207                 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2208                 configUrlMatch = re.search('config=(.*)$', playerUrl)
2209                 configUrl = urllib2.unquote(configUrlMatch.group(1))
2210
2211                 self.report_config_download(showName)
2212                 try:
2213                         configJSON = urllib2.urlopen(configUrl).read()
2214                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2215                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2216                         return
2217
2218                 # Technically, it's JavaScript, not JSON
2219                 configJSON = configJSON.replace("'", '"')
2220
2221                 try:
2222                         config = json.loads(configJSON)
2223                 except (ValueError,), err:
2224                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2225                         return
2226
2227                 playlist = config['playlist']
2228                 videoUrl = playlist[1]['url']
2229
2230                 info = {
2231                         'id': videoId,
2232                         'url': videoUrl,
2233                         'uploader': showName,
2234                         'upload_date': None,
2235                         'title': showName,
2236                         'ext': 'flv',
2237                         'format': 'flv',
2238                         'thumbnail': imgUrl,
2239                         'description': description,
2240                         'player_url': playerUrl,
2241                 }
2242
2243                 return [info]
2244
2245
2246 class CollegeHumorIE(InfoExtractor):
2247         """Information extractor for collegehumor.com"""
2248
2249         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2250         IE_NAME = u'collegehumor'
2251
2252         def report_webpage(self, video_id):
2253                 """Report information extraction."""
2254                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2255
2256         def report_extraction(self, video_id):
2257                 """Report information extraction."""
2258                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2259
2260         def _real_extract(self, url):
2261                 mobj = re.match(self._VALID_URL, url)
2262                 if mobj is None:
2263                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2264                         return
2265                 video_id = mobj.group('videoid')
2266
2267                 self.report_webpage(video_id)
2268                 request = urllib2.Request(url)
2269                 try:
2270                         webpage = urllib2.urlopen(request).read()
2271                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2272                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2273                         return
2274
2275                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2276                 if m is None:
2277                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2278                         return
2279                 internal_video_id = m.group('internalvideoid')
2280
2281                 info = {
2282                         'id': video_id,
2283                         'internal_id': internal_video_id,
2284                 }
2285
2286                 self.report_extraction(video_id)
2287                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2288                 try:
2289                         metaXml = urllib2.urlopen(xmlUrl).read()
2290                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2291                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2292                         return
2293
2294                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2295                 try:
2296                         videoNode = mdoc.findall('./video')[0]
2297                         info['description'] = videoNode.findall('./description')[0].text
2298                         info['title'] = videoNode.findall('./caption')[0].text
2299                         info['url'] = videoNode.findall('./file')[0].text
2300                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2301                         info['ext'] = info['url'].rpartition('.')[2]
2302                         info['format'] = info['ext']
2303                 except IndexError:
2304                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2305                         return
2306
2307                 return [info]
2308
2309
2310 class XVideosIE(InfoExtractor):
2311         """Information extractor for xvideos.com"""
2312
2313         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2314         IE_NAME = u'xvideos'
2315
2316         def report_webpage(self, video_id):
2317                 """Report information extraction."""
2318                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2319
2320         def report_extraction(self, video_id):
2321                 """Report information extraction."""
2322                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2323
2324         def _real_extract(self, url):
2325                 mobj = re.match(self._VALID_URL, url)
2326                 if mobj is None:
2327                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2328                         return
2329                 video_id = mobj.group(1).decode('utf-8')
2330
2331                 self.report_webpage(video_id)
2332
2333                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2334                 try:
2335                         webpage = urllib2.urlopen(request).read()
2336                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2337                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2338                         return
2339
2340                 self.report_extraction(video_id)
2341
2342
2343                 # Extract video URL
2344                 mobj = re.search(r'flv_url=(.+?)&', webpage)
2345                 if mobj is None:
2346                         self._downloader.trouble(u'ERROR: unable to extract video url')
2347                         return
2348                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2349
2350
2351                 # Extract title
2352                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2353                 if mobj is None:
2354                         self._downloader.trouble(u'ERROR: unable to extract video title')
2355                         return
2356                 video_title = mobj.group(1).decode('utf-8')
2357
2358
2359                 # Extract video thumbnail
2360                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
2361                 if mobj is None:
2362                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2363                         return
2364                 video_thumbnail = mobj.group(1).decode('utf-8')
2365
2366                 info = {
2367                         'id': video_id,
2368                         'url': video_url,
2369                         'uploader': None,
2370                         'upload_date': None,
2371                         'title': video_title,
2372                         'ext': 'flv',
2373                         'format': 'flv',
2374                         'thumbnail': video_thumbnail,
2375                         'description': None,
2376                         'player_url': None,
2377                 }
2378
2379                 return [info]
2380
2381
2382 class SoundcloudIE(InfoExtractor):
2383         """Information extractor for soundcloud.com
2384            To access the media, the uid of the song and a stream token
2385            must be extracted from the page source and the script must make
2386            a request to media.soundcloud.com/crossdomain.xml. Then
2387            the media can be grabbed by requesting from an url composed
2388            of the stream token and uid
2389          """
2390
2391         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2392         IE_NAME = u'soundcloud'
2393
2394         def __init__(self, downloader=None):
2395                 InfoExtractor.__init__(self, downloader)
2396
2397         def report_webpage(self, video_id):
2398                 """Report information extraction."""
2399                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2400
2401         def report_extraction(self, video_id):
2402                 """Report information extraction."""
2403                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2404
2405         def _real_extract(self, url):
2406                 mobj = re.match(self._VALID_URL, url)
2407                 if mobj is None:
2408                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2409                         return
2410
2411                 # extract uploader (which is in the url)
2412                 uploader = mobj.group(1).decode('utf-8')
2413                 # extract simple title (uploader + slug of song title)
2414                 slug_title =  mobj.group(2).decode('utf-8')
2415                 simple_title = uploader + u'-' + slug_title
2416
2417                 self.report_webpage('%s/%s' % (uploader, slug_title))
2418
2419                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2420                 try:
2421                         webpage = urllib2.urlopen(request).read()
2422                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2423                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2424                         return
2425
2426                 self.report_extraction('%s/%s' % (uploader, slug_title))
2427
2428                 # extract uid and stream token that soundcloud hands out for access
2429                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2430                 if mobj:
2431                         video_id = mobj.group(1)
2432                         stream_token = mobj.group(2)
2433
2434                 # extract unsimplified title
2435                 mobj = re.search('"title":"(.*?)",', webpage)
2436                 if mobj:
2437                         title = mobj.group(1).decode('utf-8')
2438                 else:
2439                         title = simple_title
2440
2441                 # construct media url (with uid/token)
2442                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2443                 mediaURL = mediaURL % (video_id, stream_token)
2444
2445                 # description
2446                 description = u'No description available'
2447                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2448                 if mobj:
2449                         description = mobj.group(1)
2450                 
2451                 # upload date
2452                 upload_date = None
2453                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2454                 if mobj:
2455                         try:
2456                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2457                         except Exception, e:
2458                                 self._downloader.to_stderr(str(e))
2459
2460                 # for soundcloud, a request to a cross domain is required for cookies
2461                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2462
2463                 return [{
2464                         'id':           video_id.decode('utf-8'),
2465                         'url':          mediaURL,
2466                         'uploader':     uploader.decode('utf-8'),
2467                         'upload_date':  upload_date,
2468                         'title':        title,
2469                         'ext':          u'mp3',
2470                         'format':       u'NA',
2471                         'player_url':   None,
2472                         'description': description.decode('utf-8')
2473                 }]
2474
2475
2476 class InfoQIE(InfoExtractor):
2477         """Information extractor for infoq.com"""
2478
2479         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2480         IE_NAME = u'infoq'
2481
2482         def report_webpage(self, video_id):
2483                 """Report information extraction."""
2484                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2485
2486         def report_extraction(self, video_id):
2487                 """Report information extraction."""
2488                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2489
2490         def _real_extract(self, url):
2491                 mobj = re.match(self._VALID_URL, url)
2492                 if mobj is None:
2493                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2494                         return
2495
2496                 self.report_webpage(url)
2497
2498                 request = urllib2.Request(url)
2499                 try:
2500                         webpage = urllib2.urlopen(request).read()
2501                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2502                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2503                         return
2504
2505                 self.report_extraction(url)
2506
2507
2508                 # Extract video URL
2509                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2510                 if mobj is None:
2511                         self._downloader.trouble(u'ERROR: unable to extract video url')
2512                         return
2513                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2514
2515
2516                 # Extract title
2517                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2518                 if mobj is None:
2519                         self._downloader.trouble(u'ERROR: unable to extract video title')
2520                         return
2521                 video_title = mobj.group(1).decode('utf-8')
2522
2523                 # Extract description
2524                 video_description = u'No description available.'
2525                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2526                 if mobj is not None:
2527                         video_description = mobj.group(1).decode('utf-8')
2528
2529                 video_filename = video_url.split('/')[-1]
2530                 video_id, extension = video_filename.split('.')
2531
2532                 info = {
2533                         'id': video_id,
2534                         'url': video_url,
2535                         'uploader': None,
2536                         'upload_date': None,
2537                         'title': video_title,
2538                         'ext': extension,
2539                         'format': extension, # Extension is always(?) mp4, but seems to be flv
2540                         'thumbnail': None,
2541                         'description': video_description,
2542                         'player_url': None,
2543                 }
2544
2545                 return [info]
2546
2547 class MixcloudIE(InfoExtractor):
2548         """Information extractor for www.mixcloud.com"""
2549         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2550         IE_NAME = u'mixcloud'
2551
2552         def __init__(self, downloader=None):
2553                 InfoExtractor.__init__(self, downloader)
2554
2555         def report_download_json(self, file_id):
2556                 """Report JSON download."""
2557                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2558
2559         def report_extraction(self, file_id):
2560                 """Report information extraction."""
2561                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2562
2563         def get_urls(self, jsonData, fmt, bitrate='best'):
2564                 """Get urls from 'audio_formats' section in json"""
2565                 file_url = None
2566                 try:
2567                         bitrate_list = jsonData[fmt]
2568                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2569                                 bitrate = max(bitrate_list) # select highest
2570
2571                         url_list = jsonData[fmt][bitrate]
2572                 except TypeError: # we have no bitrate info.
2573                         url_list = jsonData[fmt]
2574                 return url_list
2575
2576         def check_urls(self, url_list):
2577                 """Returns 1st active url from list"""
2578                 for url in url_list:
2579                         try:
2580                                 urllib2.urlopen(url)
2581                                 return url
2582                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2583                                 url = None
2584
2585                 return None
2586
2587         def _print_formats(self, formats):
2588                 print 'Available formats:'
2589                 for fmt in formats.keys():
2590                         for b in formats[fmt]:
2591                                 try:
2592                                         ext = formats[fmt][b][0]
2593                                         print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2594                                 except TypeError: # we have no bitrate info
2595                                         ext = formats[fmt][0]
2596                                         print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2597                                         break
2598
2599         def _real_extract(self, url):
2600                 mobj = re.match(self._VALID_URL, url)
2601                 if mobj is None:
2602                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2603                         return
2604                 # extract uploader & filename from url
2605                 uploader = mobj.group(1).decode('utf-8')
2606                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2607
2608                 # construct API request
2609                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2610                 # retrieve .json file with links to files
2611                 request = urllib2.Request(file_url)
2612                 try:
2613                         self.report_download_json(file_url)
2614                         jsonData = urllib2.urlopen(request).read()
2615                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2616                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2617                         return
2618
2619                 # parse JSON
2620                 json_data = json.loads(jsonData)
2621                 player_url = json_data['player_swf_url']
2622                 formats = dict(json_data['audio_formats'])
2623
2624                 req_format = self._downloader.params.get('format', None)
2625                 bitrate = None
2626
2627                 if self._downloader.params.get('listformats', None):
2628                         self._print_formats(formats)
2629                         return
2630
2631                 if req_format is None or req_format == 'best':
2632                         for format_param in formats.keys():
2633                                 url_list = self.get_urls(formats, format_param)
2634                                 # check urls
2635                                 file_url = self.check_urls(url_list)
2636                                 if file_url is not None:
2637                                         break # got it!
2638                 else:
2639                         if req_format not in formats.keys():
2640                                 self._downloader.trouble(u'ERROR: format is not available')
2641                                 return
2642
2643                         url_list = self.get_urls(formats, req_format)
2644                         file_url = self.check_urls(url_list)
2645                         format_param = req_format
2646
2647                 return [{
2648                         'id': file_id.decode('utf-8'),
2649                         'url': file_url.decode('utf-8'),
2650                         'uploader':     uploader.decode('utf-8'),
2651                         'upload_date': u'NA',
2652                         'title': json_data['name'],
2653                         'ext': file_url.split('.')[-1].decode('utf-8'),
2654                         'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2655                         'thumbnail': json_data['thumbnail_url'],
2656                         'description': json_data['description'],
2657                         'player_url': player_url.decode('utf-8'),
2658                 }]
2659
class StanfordOpenClassroomIE(InfoExtractor):
	"""Information extractor for Stanford's Open ClassRoom"""

	# Matches the site root, a CoursePage, or a VideoPage; the named
	# groups 'course' and 'video' decide which of the three extraction
	# branches below is taken.
	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
	IE_NAME = u'stanfordoc'

	def report_download_webpage(self, objid):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Extract a single video, a course playlist, or the whole site.

		A VideoPage url yields one info dict; CoursePage and root urls
		are expanded recursively through self.extract() on each linked
		page and the per-video results are concatenated.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('course') and mobj.group('video'): # A specific video
			course = mobj.group('course')
			video = mobj.group('video')
			info = {
				'id': course + '_' + video,
			}

			self.report_extraction(info['id'])
			# each video has a sidecar XML file with title and file name
			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
			xmlUrl = baseUrl + video + '.xml'
			try:
				metaXml = urllib2.urlopen(xmlUrl).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
				return
			mdoc = xml.etree.ElementTree.fromstring(metaXml)
			try:
				info['title'] = mdoc.findall('./title')[0].text
				info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
			except IndexError:
				self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
				return
			# derive extension/format from the video file name
			info['ext'] = info['url'].rpartition('.')[2]
			info['format'] = info['ext']
			return [info]
		elif mobj.group('course'): # A course page
			course = mobj.group('course')
			info = {
				'id': course,
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			try:
				coursepage = urllib2.urlopen(url).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			# course title; fall back to the course id if not found
			m = re.search('<h1>([^<]+)</h1>', coursepage)
			if m:
				info['title'] = unescapeHTML(m.group(1))
			else:
				info['title'] = info['id']

			m = re.search('<description>([^<]+)</description>', coursepage)
			if m:
				info['description'] = unescapeHTML(m.group(1))

			# collect links to the individual video pages (deduplicated,
			# order preserved) and extract each one recursively
			links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
				}
					for vpage in links]
			results = []
			for entry in info['list']:
				assert entry['type'] == 'reference'
				results += self.extract(entry['url'])
			return results

		else: # Root page
			info = {
				'id': 'Stanford OpenClassroom',
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
			try:
				rootpage = urllib2.urlopen(rootURL).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			info['title'] = info['id']

			# collect links to course pages and recurse into each; this
			# eventually bottoms out in the single-video branch above
			links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
				}
					for cpage in links]

			results = []
			for entry in info['list']:
				assert entry['type'] == 'reference'
				results += self.extract(entry['url'])
			return results
2771
2772 class MTVIE(InfoExtractor):
2773         """Information extractor for MTV.com"""
2774
2775         _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2776         IE_NAME = u'mtv'
2777
2778         def report_webpage(self, video_id):
2779                 """Report information extraction."""
2780                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2781
2782         def report_extraction(self, video_id):
2783                 """Report information extraction."""
2784                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2785
2786         def _real_extract(self, url):
2787                 mobj = re.match(self._VALID_URL, url)
2788                 if mobj is None:
2789                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2790                         return
2791                 if not mobj.group('proto'):
2792                         url = 'http://' + url
2793                 video_id = mobj.group('videoid')
2794                 self.report_webpage(video_id)
2795
2796                 request = urllib2.Request(url)
2797                 try:
2798                         webpage = urllib2.urlopen(request).read()
2799                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2800                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2801                         return
2802
2803                 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2804                 if mobj is None:
2805                         self._downloader.trouble(u'ERROR: unable to extract song name')
2806                         return
2807                 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2808                 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2809                 if mobj is None:
2810                         self._downloader.trouble(u'ERROR: unable to extract performer')
2811                         return
2812                 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2813                 video_title = performer + ' - ' + song_name 
2814
2815                 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2816                 if mobj is None:
2817                         self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2818                         return
2819                 mtvn_uri = mobj.group(1)
2820
2821                 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2822                 if mobj is None:
2823                         self._downloader.trouble(u'ERROR: unable to extract content id')
2824                         return
2825                 content_id = mobj.group(1)
2826
2827                 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2828                 self.report_extraction(video_id)
2829                 request = urllib2.Request(videogen_url)
2830                 try:
2831                         metadataXml = urllib2.urlopen(request).read()
2832                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2833                         self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2834                         return
2835
2836                 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2837                 renditions = mdoc.findall('.//rendition')
2838
2839                 # For now, always pick the highest quality.
2840                 rendition = renditions[-1]
2841
2842                 try:
2843                         _,_,ext = rendition.attrib['type'].partition('/')
2844                         format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2845                         video_url = rendition.find('./src').text
2846                 except KeyError:
2847                         self._downloader.trouble('Invalid rendition field.')
2848                         return
2849
2850                 info = {
2851                         'id': video_id,
2852                         'url': video_url,
2853                         'uploader': performer,
2854                         'title': video_title,
2855                         'ext': ext,
2856                         'format': format,
2857                 }
2858
2859                 return [info]