ba534e10b8cb58cef9081796e5bfccf82057b646
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # License: Public domain code
5 import htmlentitydefs
6 import httplib
7 import math
8 import netrc
9 import os
10 import os.path
11 import re
12 import socket
13 import string
14 import sys
15 import time
16 import urllib
17 import urllib2
18
19 std_headers = { 
20         'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.1) Gecko/2008070208 Firefox/3.0.1',
21         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
22         'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
23         'Accept-Language': 'en-us,en;q=0.5',
24 }
25
26 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
27
class DownloadError(Exception):
        """Raised when a download problem must abort the run.

        FileDownloader objects raise it, carrying the error message, when
        they have not been configured to ignore download errors.
        """
36
class SameFileError(Exception):
        """Raised when several downloads would land in one file.

        FileDownloader objects raise it when they detect that more than one
        video would have to be written to the same file on disk.
        """
44
class PostProcessingError(Exception):
        """Raised to signal a failed postprocessing task.

        A PostProcessor's .run() method may raise it to indicate that the
        postprocessing step could not be completed.
        """
52
53 class FileDownloader(object):
54         """File Downloader class.
55
56         File downloader objects are the ones responsible of downloading the
57         actual video file and writing it to disk if the user has requested
58         it, among some other tasks. In most cases there should be one per
59         program. As, given a video URL, the downloader doesn't know how to
60         extract all the needed information, task that InfoExtractors do, it
61         has to pass the URL to one of them.
62
63         For this, file downloader objects have a method that allows
64         InfoExtractors to be registered in a given order. When it is passed
65         a URL, the file downloader handles it to the first InfoExtractor it
66         finds that reports being able to handle it. The InfoExtractor returns
67         all the information to the FileDownloader and the latter downloads the
68         file or does whatever it's instructed to do.
69
70         File downloaders accept a lot of parameters. In order not to saturate
71         the object constructor with arguments, it receives a dictionary of
72         options instead. These options are available through the get_params()
73         method for the InfoExtractors to use. The FileDownloader also registers
74         itself as the downloader in charge for the InfoExtractors that are
75         added to it, so this is a "mutual registration".
76
77         Available options:
78
79         username:       Username for authentication purposes.
80         password:       Password for authentication purposes.
81         usenetrc:       Use netrc for authentication instead.
82         quiet:          Do not print messages to stdout.
83         forceurl:       Force printing final URL.
84         forcetitle:     Force printing title.
85         simulate:       Do not download the video files.
86         format:         Video format code.
87         outtmpl:        Template for output names.
88         ignoreerrors:   Do not stop on download errors.
89         ratelimit:      Download speed limit, in bytes/sec.
90         """
91
92         _params = None
93         _ies = []
94         _pps = []
95
96         def __init__(self, params):
97                 """Create a FileDownloader object with the given options."""
98                 self._ies = []
99                 self._pps = []
100                 self.set_params(params)
101         
102         @staticmethod
103         def pmkdir(filename):
104                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
105                 components = filename.split(os.sep)
106                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
107                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
108                 for dir in aggregate:
109                         if not os.path.exists(dir):
110                                 os.mkdir(dir)
111         
112         @staticmethod
113         def format_bytes(bytes):
114                 if bytes is None:
115                         return 'N/A'
116                 if bytes == 0:
117                         exponent = 0
118                 else:
119                         exponent = long(math.log(float(bytes), 1024.0))
120                 suffix = 'bkMGTPEZY'[exponent]
121                 converted = float(bytes) / float(1024**exponent)
122                 return '%.2f%s' % (converted, suffix)
123
124         @staticmethod
125         def calc_percent(byte_counter, data_len):
126                 if data_len is None:
127                         return '---.-%'
128                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
129
130         @staticmethod
131         def calc_eta(start, now, total, current):
132                 if total is None:
133                         return '--:--'
134                 dif = now - start
135                 if current == 0 or dif < 0.001: # One millisecond
136                         return '--:--'
137                 rate = float(current) / dif
138                 eta = long((float(total) - float(current)) / rate)
139                 (eta_mins, eta_secs) = divmod(eta, 60)
140                 if eta_mins > 99:
141                         return '--:--'
142                 return '%02d:%02d' % (eta_mins, eta_secs)
143
144         @staticmethod
145         def calc_speed(start, now, bytes):
146                 dif = now - start
147                 if bytes == 0 or dif < 0.001: # One millisecond
148                         return '%10s' % '---b/s'
149                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
150
151         @staticmethod
152         def best_block_size(elapsed_time, bytes):
153                 new_min = max(bytes / 2.0, 1.0)
154                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
155                 if elapsed_time < 0.001:
156                         return int(new_max)
157                 rate = bytes / elapsed_time
158                 if rate > new_max:
159                         return int(new_max)
160                 if rate < new_min:
161                         return int(new_min)
162                 return int(rate)
163
164         @staticmethod
165         def parse_bytes(bytestr):
166                 """Parse a string indicating a byte quantity into a long integer."""
167                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
168                 if matchobj is None:
169                         return None
170                 number = float(matchobj.group(1))
171                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
172                 return long(round(number * multiplier))
173
174         def set_params(self, params):
175                 """Sets parameters."""
176                 if type(params) != dict:
177                         raise ValueError('params: dictionary expected')
178                 self._params = params
179         
180         def get_params(self):
181                 """Get parameters."""
182                 return self._params
183
184         def add_info_extractor(self, ie):
185                 """Add an InfoExtractor object to the end of the list."""
186                 self._ies.append(ie)
187                 ie.set_downloader(self)
188         
189         def add_post_processor(self, pp):
190                 """Add a PostProcessor object to the end of the chain."""
191                 self._pps.append(pp)
192                 pp.set_downloader(self)
193         
194         def to_stdout(self, message, skip_eol=False):
195                 """Print message to stdout if not in quiet mode."""
196                 if not self._params.get('quiet', False):
197                         print u'%s%s' % (message, [u'\n', u''][skip_eol]),
198                         sys.stdout.flush()
199         
200         def to_stderr(self, message):
201                 """Print message to stderr."""
202                 print >>sys.stderr, message
203         
204         def fixed_template(self):
205                 """Checks if the output template is fixed."""
206                 return (re.search(ur'(?u)%\(.+?\)s', self._params['outtmpl']) is None)
207
208         def trouble(self, message=None):
209                 """Determine action to take when a download problem appears.
210
211                 Depending on if the downloader has been configured to ignore
212                 download errors or not, this method may throw an exception or
213                 not when errors are found, after printing the message. If it
214                 doesn't raise, it returns an error code suitable to be returned
215                 later as a program exit code to indicate error.
216                 """
217                 if message is not None:
218                         self.to_stderr(message)
219                 if not self._params.get('ignoreerrors', False):
220                         raise DownloadError(message)
221                 return 1
222
223         def slow_down(self, start_time, byte_counter):
224                 """Sleep if the download speed is over the rate limit."""
225                 rate_limit = self._params.get('ratelimit', None)
226                 if rate_limit is None or byte_counter == 0:
227                         return
228                 now = time.time()
229                 elapsed = now - start_time
230                 if elapsed <= 0.0:
231                         return
232                 speed = float(byte_counter) / elapsed
233                 if speed > rate_limit:
234                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
235
236         def report_destination(self, filename):
237                 """Report destination filename."""
238                 self.to_stdout(u'[download] Destination: %s' % filename)
239         
240         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
241                 """Report download progress."""
242                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
243                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
244         
245         def report_finish(self):
246                 """Report download finished."""
247                 self.to_stdout(u'')
248
249         def download(self, url_list):
250                 """Download a given list of URLs."""
251                 retcode = 0
252                 if len(url_list) > 1 and self.fixed_template():
253                         raise SameFileError(self._params['outtmpl'])
254
255                 for url in url_list:
256                         suitable_found = False
257                         for ie in self._ies:
258                                 if not ie.suitable(url):
259                                         continue
260                                 # Suitable InfoExtractor found
261                                 suitable_found = True
262                                 all_results = ie.extract(url)
263                                 results = [x for x in all_results if x is not None]
264                                 if len(results) != len(all_results):
265                                         retcode = self.trouble()
266
267                                 if len(results) > 1 and self.fixed_template():
268                                         raise SameFileError(self._params['outtmpl'])
269
270                                 for result in results:
271                                         # Forced printings
272                                         if self._params.get('forcetitle', False):
273                                                 print result['title']
274                                         if self._params.get('forceurl', False):
275                                                 print result['url']
276                                                 
277                                         # Do nothing else if in simulate mode
278                                         if self._params.get('simulate', False):
279                                                 continue
280
281                                         try:
282                                                 filename = self._params['outtmpl'] % result
283                                                 self.report_destination(filename)
284                                         except (ValueError, KeyError), err:
285                                                 retcode = self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
286                                                 continue
287                                         try:
288                                                 self.pmkdir(filename)
289                                         except (OSError, IOError), err:
290                                                 retcode = self.trouble('ERROR: unable to create directories: %s' % str(err))
291                                                 continue
292                                         try:
293                                                 outstream = open(filename, 'wb')
294                                         except (OSError, IOError), err:
295                                                 retcode = self.trouble('ERROR: unable to open for writing: %s' % str(err))
296                                                 continue
297                                         try:
298                                                 self._do_download(outstream, result['url'])
299                                                 outstream.close()
300                                         except (OSError, IOError), err:
301                                                 retcode = self.trouble('ERROR: unable to write video data: %s' % str(err))
302                                                 continue
303                                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
304                                                 retcode = self.trouble('ERROR: unable to download video data: %s' % str(err))
305                                                 continue
306                                         try:
307                                                 self.post_process(filename, result)
308                                         except (PostProcessingError), err:
309                                                 retcode = self.trouble('ERROR: postprocessing: %s' % str(err))
310                                                 continue
311
312                                 break
313                         if not suitable_found:
314                                 retcode = self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
315
316                 return retcode
317
318         def post_process(self, filename, ie_info):
319                 """Run the postprocessing chain on the given file."""
320                 info = dict(ie_info)
321                 info['filepath'] = filename
322                 for pp in self._pps:
323                         info = pp.run(info)
324                         if info is None:
325                                 break
326         
327         def _do_download(self, stream, url):
328                 request = urllib2.Request(url, None, std_headers)
329                 data = urllib2.urlopen(request)
330                 data_len = data.info().get('Content-length', None)
331                 data_len_str = self.format_bytes(data_len)
332                 byte_counter = 0
333                 block_size = 1024
334                 start = time.time()
335                 while True:
336                         # Progress message
337                         percent_str = self.calc_percent(byte_counter, data_len)
338                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
339                         speed_str = self.calc_speed(start, time.time(), byte_counter)
340                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
341
342                         # Download and write
343                         before = time.time()
344                         data_block = data.read(block_size)
345                         after = time.time()
346                         data_block_len = len(data_block)
347                         if data_block_len == 0:
348                                 break
349                         byte_counter += data_block_len
350                         stream.write(data_block)
351                         block_size = self.best_block_size(after - before, data_block_len)
352
353                         # Apply rate limit
354                         self.slow_down(start, byte_counter)
355
356                 self.report_finish()
357                 if data_len is not None and str(byte_counter) != data_len:
358                         raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
359
360 class InfoExtractor(object):
361         """Information Extractor class.
362
363         Information extractors are the classes that, given a URL, extract
364         information from the video (or videos) the URL refers to. This
365         information includes the real video URL, the video title and simplified
366         title, author and others. It is returned in a list of dictionaries when
367         calling its extract() method. It is a list because a URL can refer to
368         more than one video (think of playlists). The dictionaries must include
369         the following fields:
370
371         id:             Video identifier.
372         url:            Final video URL.
373         uploader:       Nickname of the video uploader.
374         title:          Literal title.
375         stitle:         Simplified title.
376         ext:            Video filename extension.
377
378         Subclasses of this one should re-define the _real_initialize() and
379         _real_extract() methods, as well as the suitable() static method.
380         Probably, they should also be instantiated and added to the main
381         downloader.
382         """
383
384         _ready = False
385         _downloader = None
386
387         def __init__(self, downloader=None):
388                 """Constructor. Receives an optional downloader."""
389                 self._ready = False
390                 self.set_downloader(downloader)
391
392         @staticmethod
393         def suitable(url):
394                 """Receives a URL and returns True if suitable for this IE."""
395                 return False
396
397         def initialize(self):
398                 """Initializes an instance (authentication, etc)."""
399                 if not self._ready:
400                         self._real_initialize()
401                         self._ready = True
402
403         def extract(self, url):
404                 """Extracts URL information and returns it in list of dicts."""
405                 self.initialize()
406                 return self._real_extract(url)
407
408         def set_downloader(self, downloader):
409                 """Sets the downloader for this IE."""
410                 self._downloader = downloader
411         
412         def to_stdout(self, message):
413                 """Print message to stdout if downloader is not in quiet mode."""
414                 if self._downloader is None or not self._downloader.get_params().get('quiet', False):
415                         print message
416         
417         def to_stderr(self, message):
418                 """Print message to stderr."""
419                 print >>sys.stderr, message
420
421         def _real_initialize(self):
422                 """Real initialization process. Redefine in subclasses."""
423                 pass
424
425         def _real_extract(self, url):
426                 """Real extraction process. Redefine in subclasses."""
427                 pass
428
429 class YoutubeIE(InfoExtractor):
430         """Information extractor for youtube.com."""
431
432         _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
433         _LOGIN_URL = 'http://www.youtube.com/login?next=/'
434         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
435         _NETRC_MACHINE = 'youtube'
436
437         @staticmethod
438         def suitable(url):
439                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
440
441         def report_login(self):
442                 """Report attempt to log in."""
443                 self.to_stdout(u'[youtube] Logging in')
444         
445         def report_age_confirmation(self):
446                 """Report attempt to confirm age."""
447                 self.to_stdout(u'[youtube] Confirming age')
448         
449         def report_webpage_download(self, video_id):
450                 """Report attempt to download webpage."""
451                 self.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
452         
453         def report_information_extraction(self, video_id):
454                 """Report attempt to extract video information."""
455                 self.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
456         
457         def report_video_url(self, video_id, video_real_url):
458                 """Report extracted video URL."""
459                 self.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))
460
461         def _real_initialize(self):
462                 if self._downloader is None:
463                         return
464
465                 username = None
466                 password = None
467                 downloader_params = self._downloader.get_params()
468
469                 # Attempt to use provided username and password or .netrc data
470                 if downloader_params.get('username', None) is not None:
471                         username = downloader_params['username']
472                         password = downloader_params['password']
473                 elif downloader_params.get('usenetrc', False):
474                         try:
475                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
476                                 if info is not None:
477                                         username = info[0]
478                                         password = info[2]
479                                 else:
480                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
481                         except (IOError, netrc.NetrcParseError), err:
482                                 self.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
483                                 return
484
485                 # No authentication to be performed
486                 if username is None:
487                         return
488
489                 # Log in
490                 login_form = {
491                                 'current_form': 'loginForm',
492                                 'next':         '/',
493                                 'action_login': 'Log In',
494                                 'username':     username,
495                                 'password':     password,
496                                 }
497                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
498                 try:
499                         self.report_login()
500                         login_results = urllib2.urlopen(request).read()
501                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
502                                 self.to_stderr(u'WARNING: unable to log in: bad username or password')
503                                 return
504                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
505                         self.to_stderr(u'WARNING: unable to log in: %s' % str(err))
506                         return
507         
508                 # Confirm age
509                 age_form = {
510                                 'next_url':             '/',
511                                 'action_confirm':       'Confirm',
512                                 }
513                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
514                 try:
515                         self.report_age_confirmation()
516                         age_results = urllib2.urlopen(request).read()
517                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
518                         self.to_stderr(u'ERROR: unable to confirm age: %s' % str(err))
519                         return
520
521         def _real_extract(self, url):
522                 # Extract video id from URL
523                 mobj = re.match(self._VALID_URL, url)
524                 if mobj is None:
525                         self.to_stderr(u'ERROR: invalid URL: %s' % url)
526                         return [None]
527                 video_id = mobj.group(2)
528
529                 # Downloader parameters
530                 format_param = None
531                 if self._downloader is not None:
532                         params = self._downloader.get_params()
533                         format_param = params.get('format', None)
534
535                 # Extension
536                 video_extension = {'18': 'mp4', '17': '3gp'}.get(format_param, 'flv')
537
538                 # Normalize URL, including format
539                 normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
540                 if format_param is not None:
541                         normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
542                 request = urllib2.Request(normalized_url, None, std_headers)
543                 try:
544                         self.report_webpage_download(video_id)
545                         video_webpage = urllib2.urlopen(request).read()
546                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
547                         self.to_stderr(u'ERROR: unable to download video webpage: %s' % str(err))
548                         return [None]
549                 self.report_information_extraction(video_id)
550                 
551                 # "t" param
552                 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
553                 if mobj is None:
554                         self.to_stderr(u'ERROR: unable to extract "t" parameter')
555                         return [None]
556                 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
557                 if format_param is not None:
558                         video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
559                 self.report_video_url(video_id, video_real_url)
560
561                 # uploader
562                 mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
563                 if mobj is None:
564                         self.to_stderr(u'ERROR: unable to extract uploader nickname')
565                         return [None]
566                 video_uploader = mobj.group(1)
567
568                 # title
569                 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
570                 if mobj is None:
571                         self.to_stderr(u'ERROR: unable to extract video title')
572                         return [None]
573                 video_title = mobj.group(1).decode('utf-8')
574                 video_title = re.sub(ur'(?u)&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
575                 video_title = video_title.replace(os.sep, u'%')
576
577                 # simplified title
578                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
579                 simple_title = simple_title.strip(ur'_')
580
581                 # Return information
582                 return [{
583                         'id':           video_id.decode('utf-8'),
584                         'url':          video_real_url.decode('utf-8'),
585                         'uploader':     video_uploader.decode('utf-8'),
586                         'title':        video_title,
587                         'stitle':       simple_title,
588                         'ext':          video_extension.decode('utf-8'),
589                         }]
590
591 class MetacafeIE(InfoExtractor):
592         """Information Extractor for metacafe.com."""
593
594         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
595         _DISCLAIMER = 'http://www.metacafe.com/disclaimer'
596         _youtube_ie = None
597
598         def __init__(self, youtube_ie, downloader=None):
599                 InfoExtractor.__init__(self, downloader)
600                 self._youtube_ie = youtube_ie
601
602         @staticmethod
603         def suitable(url):
604                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
605
606         def report_disclaimer(self):
607                 """Report disclaimer retrieval."""
608                 self.to_stdout(u'[metacafe] Retrieving disclaimer')
609
610         def report_age_confirmation(self):
611                 """Report attempt to confirm age."""
612                 self.to_stdout(u'[metacafe] Confirming age')
613         
614         def report_download_webpage(self, video_id):
615                 """Report webpage download."""
616                 self.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
617         
618         def report_extraction(self, video_id):
619                 """Report information extraction."""
620                 self.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
621
622         def _real_initialize(self):
623                 # Retrieve disclaimer
624                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
625                 try:
626                         self.report_disclaimer()
627                         disclaimer = urllib2.urlopen(request).read()
628                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
629                         self.to_stderr(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
630                         return
631
632                 # Confirm age
633                 disclaimer_form = {
634                         'allowAdultContent': '1',
635                         'submit': "Continue - I'm over 18",
636                         }
637                 request = urllib2.Request('http://www.metacafe.com/watch/', urllib.urlencode(disclaimer_form), std_headers)
638                 try:
639                         self.report_age_confirmation()
640                         disclaimer = urllib2.urlopen(request).read()
641                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
642                         self.to_stderr(u'ERROR: unable to confirm age: %s' % str(err))
643                         return
644         
645         def _real_extract(self, url):
646                 # Extract id and simplified title from URL
647                 mobj = re.match(self._VALID_URL, url)
648                 if mobj is None:
649                         self.to_stderr(u'ERROR: invalid URL: %s' % url)
650                         return [None]
651
652                 video_id = mobj.group(1)
653
654                 # Check if video comes from YouTube
655                 mobj2 = re.match(r'^yt-(.*)$', video_id)
656                 if mobj2 is not None:
657                         return self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
658
659                 simple_title = mobj.group(2).decode('utf-8')
660                 video_extension = 'flv'
661
662                 # Retrieve video webpage to extract further information
663                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
664                 try:
665                         self.report_download_webpage(video_id)
666                         webpage = urllib2.urlopen(request).read()
667                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
668                         self.to_stderr(u'ERROR: unable retrieve video webpage: %s' % str(err))
669                         return [None]
670
671                 # Extract URL, uploader and title from webpage
672                 self.report_extraction(video_id)
673                 mobj = re.search(r'(?m)"mediaURL":"(http.*?\.flv)"', webpage)
674                 if mobj is None:
675                         self.to_stderr(u'ERROR: unable to extract media URL')
676                         return [None]
677                 mediaURL = mobj.group(1).replace('\\', '')
678
679                 mobj = re.search(r'(?m)"gdaKey":"(.*?)"', webpage)
680                 if mobj is None:
681                         self.to_stderr(u'ERROR: unable to extract gdaKey')
682                         return [None]
683                 gdaKey = mobj.group(1)
684
685                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
686
687                 mobj = re.search(r'(?im)<meta name="title" content="Metacafe - ([^"]+)"', webpage)
688                 if mobj is None:
689                         self.to_stderr(u'ERROR: unable to extract title')
690                         return [None]
691                 video_title = mobj.group(1).decode('utf-8')
692
693                 mobj = re.search(r'(?m)<li id="ChnlUsr">.*?Submitter:<br />(.*?)</li>', webpage)
694                 if mobj is None:
695                         self.to_stderr(u'ERROR: unable to extract uploader nickname')
696                         return [None]
697                 video_uploader = re.sub(r'<.*?>', '', mobj.group(1))
698
699                 # Return information
700                 return [{
701                         'id':           video_id.decode('utf-8'),
702                         'url':          video_url.decode('utf-8'),
703                         'uploader':     video_uploader.decode('utf-8'),
704                         'title':        video_title,
705                         'stitle':       simple_title,
706                         'ext':          video_extension.decode('utf-8'),
707                         }]
708
709 class YoutubePlaylistIE(InfoExtractor):
710         """Information Extractor for YouTube playlists."""
711
712         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
713         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s'
714         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
715         _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&amp;page=%s'
716         _youtube_ie = None
717
718         def __init__(self, youtube_ie, downloader=None):
719                 InfoExtractor.__init__(self, downloader)
720                 self._youtube_ie = youtube_ie
721         
722         @staticmethod
723         def suitable(url):
724                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
725
726         def report_download_page(self, playlist_id, pagenum):
727                 """Report attempt to download playlist page with given number."""
728                 self.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
729
730         def _real_initialize(self):
731                 self._youtube_ie.initialize()
732         
733         def _real_extract(self, url):
734                 # Extract playlist id
735                 mobj = re.match(self._VALID_URL, url)
736                 if mobj is None:
737                         self.to_stderr(u'ERROR: invalid url: %s' % url)
738                         return [None]
739
740                 # Download playlist pages
741                 playlist_id = mobj.group(1)
742                 video_ids = []
743                 pagenum = 1
744
745                 while True:
746                         self.report_download_page(playlist_id, pagenum)
747                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
748                         try:
749                                 page = urllib2.urlopen(request).read()
750                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
751                                 self.to_stderr(u'ERROR: unable to download webpage: %s' % str(err))
752                                 return [None]
753
754                         # Extract video identifiers
755                         ids_in_page = set()
756                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
757                                 ids_in_page.add(mobj.group(1))
758                         video_ids.extend(list(ids_in_page))
759
760                         if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
761                                 break
762                         pagenum = pagenum + 1
763
764                 information = []
765                 for id in video_ids:
766                         information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
767                 return information
768
769 class PostProcessor(object):
770         """Post Processor class.
771
772         PostProcessor objects can be added to downloaders with their
773         add_post_processor() method. When the downloader has finished a
774         successful download, it will take its internal chain of PostProcessors
775         and start calling the run() method on each one of them, first with
776         an initial argument and then with the returned value of the previous
777         PostProcessor.
778
779         The chain will be stopped if one of them ever returns None or the end
780         of the chain is reached.
781
782         PostProcessor objects follow a "mutual registration" process similar
783         to InfoExtractor objects.
784         """
785
786         _downloader = None
787
788         def __init__(self, downloader=None):
789                 self._downloader = downloader
790
791         def to_stdout(self, message):
792                 """Print message to stdout if downloader is not in quiet mode."""
793                 if self._downloader is None or not self._downloader.get_params().get('quiet', False):
794                         print message
795         
796         def to_stderr(self, message):
797                 """Print message to stderr."""
798                 print >>sys.stderr, message
799
800         def set_downloader(self, downloader):
801                 """Sets the downloader for this PP."""
802                 self._downloader = downloader
803         
804         def run(self, information):
805                 """Run the PostProcessor.
806
807                 The "information" argument is a dictionary like the ones
808                 returned by InfoExtractors. The only difference is that this
809                 one has an extra field called "filepath" that points to the
810                 downloaded file.
811
812                 When this method returns None, the postprocessing chain is
813                 stopped. However, this method may return an information
814                 dictionary that will be passed to the next postprocessing
815                 object in the chain. It can be the one it received after
816                 changing some fields.
817
818                 In addition, this method may raise a PostProcessingError
819                 exception that will be taken into account by the downloader
820                 it was called from.
821                 """
822                 return information # by default, do nothing
823         
824 ### MAIN PROGRAM ###
if __name__ == '__main__':
        try:
                # Modules needed only when running the main program
                import getpass
                import locale
                import optparse

                # General configuration. A single opener carrying both the proxy
                # and the cookie handlers is installed: install_opener() replaces
                # the previously installed opener, so installing them one after
                # the other would leave only the last one in effect.
                urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))
                socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

                # Parse command line
                parser = optparse.OptionParser(
                                usage='Usage: %prog [options] url...',
                                version='2008.08.09',
                                conflict_handler='resolve',
                                )
                parser.add_option('-h', '--help',
                                action='help', help='print this help text and exit')
                parser.add_option('-v', '--version',
                                action='version', help='print program version and exit')
                parser.add_option('-u', '--username',
                                dest='username', metavar='UN', help='account username')
                parser.add_option('-p', '--password',
                                dest='password', metavar='PW', help='account password')
                parser.add_option('-o', '--output',
                                dest='outtmpl', metavar='TPL', help='output filename template')
                parser.add_option('-q', '--quiet',
                                action='store_true', dest='quiet', help='activates quiet mode', default=False)
                parser.add_option('-s', '--simulate',
                                action='store_true', dest='simulate', help='do not download video', default=False)
                parser.add_option('-t', '--title',
                                action='store_true', dest='usetitle', help='use title in file name', default=False)
                parser.add_option('-l', '--literal',
                                action='store_true', dest='useliteral', help='use literal title in file name', default=False)
                parser.add_option('-n', '--netrc',
                                action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
                parser.add_option('-g', '--get-url',
                                action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
                parser.add_option('-e', '--get-title',
                                action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
                parser.add_option('-f', '--format',
                                dest='format', metavar='FMT', help='video format code')
                parser.add_option('-b', '--best-quality',
                                action='store_const', dest='format', help='alias for -f 18', const='18')
                parser.add_option('-m', '--mobile-version',
                                action='store_const', dest='format', help='alias for -f 17', const='17')
                parser.add_option('-i', '--ignore-errors',
                                action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
                parser.add_option('-r', '--rate-limit',
                                dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
                (opts, args) = parser.parse_args()

                # Conflicting, missing and erroneous options
                if len(args) < 1:
                        sys.exit(u'ERROR: you must provide at least one URL')
                if opts.usenetrc and (opts.username is not None or opts.password is not None):
                        sys.exit(u'ERROR: using .netrc conflicts with giving username/password')
                if opts.password is not None and opts.username is None:
                        sys.exit(u'ERROR: account username missing')
                if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
                        sys.exit(u'ERROR: using output template conflicts with using title or literal title')
                if opts.usetitle and opts.useliteral:
                        sys.exit(u'ERROR: using title conflicts with using literal title')
                if opts.username is not None and opts.password is None:
                        # Username without password: ask for it interactively
                        opts.password = getpass.getpass(u'Type account password and press return:')
                if opts.ratelimit is not None:
                        numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
                        if numeric_limit is None:
                                sys.exit(u'ERROR: invalid rate limit specified')
                        opts.ratelimit = numeric_limit

                # Output filename template. A template given on the command line
                # is a byte string (presumably in the user's locale encoding), so
                # it is decoded with the preferred encoding instead of the default
                # ASCII codec, which fails on any non-ASCII character. An empty
                # template falls through to the other choices.
                if opts.outtmpl:
                        outtmpl = opts.outtmpl.decode(locale.getpreferredencoding())
                elif opts.usetitle:
                        outtmpl = u'%(stitle)s-%(id)s.%(ext)s'
                elif opts.useliteral:
                        outtmpl = u'%(title)s-%(id)s.%(ext)s'
                else:
                        outtmpl = u'%(id)s.%(ext)s'

                # Information extractors
                youtube_ie = YoutubeIE()
                metacafe_ie = MetacafeIE(youtube_ie)
                youtube_pl_ie = YoutubePlaylistIE(youtube_ie)

                # File downloader. The options that only print information
                # (-g, -e) imply both quiet and simulate modes.
                fd = FileDownloader({
                        'usenetrc': opts.usenetrc,
                        'username': opts.username,
                        'password': opts.password,
                        'quiet': (opts.quiet or opts.geturl or opts.gettitle),
                        'forceurl': opts.geturl,
                        'forcetitle': opts.gettitle,
                        'simulate': (opts.simulate or opts.geturl or opts.gettitle),
                        'format': opts.format,
                        'outtmpl': outtmpl,
                        'ignoreerrors': opts.ignoreerrors,
                        'ratelimit': opts.ratelimit,
                        })
                fd.add_info_extractor(youtube_pl_ie)
                fd.add_info_extractor(metacafe_ie)
                fd.add_info_extractor(youtube_ie)
                retcode = fd.download(args)
                sys.exit(retcode)

        except DownloadError:
                sys.exit(1)
        except SameFileError:
                sys.exit(u'ERROR: fixed output name but more than one file to download')
        except KeyboardInterrupt:
                sys.exit(u'\nERROR: Interrupted by user')