# Patched to add Google Video and Photobucket support
# [youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
7 import htmlentitydefs
8 import httplib
9 import locale
10 import math
11 import netrc
12 import os
13 import os.path
14 import re
15 import socket
16 import string
17 import subprocess
18 import sys
19 import time
20 import urllib
21 import urllib2
22
23 # parse_qs was moved from the cgi module to the urlparse module recently.
24 try:
25         from urlparse import parse_qs
26 except ImportError:
27         from cgi import parse_qs
28
# HTTP headers sent with every request. The fixed Firefox User-Agent keeps
# sites from serving different (or no) content to an unknown client.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
        'Accept-Language': 'en-us,en;q=0.5',
}

# Characters allowed in "simplified" titles: ASCII letters and digits,
# decoded so the result is a unicode object under Python 2.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
37
def preferredencoding():
        """Get preferred encoding.

        Returns the best encoding scheme for the system, based on
        locale.getpreferredencoding() and some further tweaks.
        """
        # The original wrapped this in a one-shot generator and called
        # .next() on it, which added nothing; a plain try/except is
        # equivalent. The bare "except:" is also narrowed to Exception so
        # KeyboardInterrupt/SystemExit are not swallowed.
        try:
                pref = locale.getpreferredencoding()
                # Verify the reported codec actually resolves; some platforms
                # return names the codecs machinery cannot handle.
                u'TEST'.encode(pref)
        except Exception:
                # Fall back to a codec that is always available.
                pref = 'UTF-8'
        return pref
53
class DownloadError(Exception):
        """Download Error exception.

        Raised by FileDownloader objects when a download problem occurs and
        the downloader is not configured to ignore errors ('ignoreerrors').
        Carries the corresponding error message.
        """
        pass
62
class SameFileError(Exception):
        """Same File exception.

        Raised by FileDownloader objects when several URLs would all be
        written to one and the same file on disk (fixed output template).
        """
        pass
70
class PostProcessingError(Exception):
        """Post Processing exception.

        A PostProcessor's .run() method may raise this to signal that the
        postprocessing task failed.
        """
        pass
78
class UnavailableFormatError(Exception):
        """Unavailable Format exception.

        Raised when the requested video format does not exist for the
        video in question.
        """
        pass
86
class ContentTooShortError(Exception):
        """Content Too Short exception.

        This exception may be raised by FileDownloader objects when a file they
        download is too small for what the server announced first, indicating
        the connection was probably interrupted.
        """
        # Both in bytes: what was actually received vs. the announced
        # Content-Length.
        downloaded = None
        expected = None

        def __init__(self, downloaded, expected):
                # Fix: the original never called the base constructor, leaving
                # err.args empty and str(err) blank. Forwarding the values
                # keeps str()/repr()/pickling meaningful; the attribute
                # interface used by callers (err.expected, err.downloaded) is
                # unchanged.
                Exception.__init__(self, downloaded, expected)
                self.downloaded = downloaded
                self.expected = expected
101
class FileDownloader(object):
        """File Downloader class.

        File downloader objects are the ones responsible of downloading the
        actual video file and writing it to disk if the user has requested
        it, among some other tasks. In most cases there should be one per
        program. As, given a video URL, the downloader doesn't know how to
        extract all the needed information, task that InfoExtractors do, it
        has to pass the URL to one of them.

        For this, file downloader objects have a method that allows
        InfoExtractors to be registered in a given order. When it is passed
        a URL, the file downloader handles it to the first InfoExtractor it
        finds that reports being able to handle it. The InfoExtractor extracts
        all the information about the video or videos the URL refers to, and
        asks the FileDownloader to process the video information, possibly
        downloading the video.

        File downloaders accept a lot of parameters. In order not to saturate
        the object constructor with arguments, it receives a dictionary of
        options instead. These options are available through the params
        attribute for the InfoExtractors to use. The FileDownloader also
        registers itself as the downloader in charge for the InfoExtractors
        that are added to it, so this is a "mutual registration".

        Available options:

        username:       Username for authentication purposes.
        password:       Password for authentication purposes.
        usenetrc:       Use netrc for authentication instead.
        quiet:          Do not print messages to stdout.
        forceurl:       Force printing final URL.
        forcetitle:     Force printing title.
        simulate:       Do not download the video files.
        format:         Video format code.
        outtmpl:        Template for output names.
        ignoreerrors:   Do not stop on download errors.
        ratelimit:      Download speed limit, in bytes/sec.
        nooverwrites:   Prevent overwriting files.
        continuedl:     Try to continue downloads if possible.
        """

        # Class-level placeholders; actual per-instance values are set in
        # __init__.
        params = None
        _ies = []
        _pps = []
        _download_retcode = None

        def __init__(self, params):
                """Create a FileDownloader object with the given options."""
                self._ies = []
                self._pps = []
                self._download_retcode = 0
                self.params = params

        @staticmethod
        def pmkdir(filename):
                """Create directory components in filename. Similar to Unix "mkdir -p"."""
                components = filename.split(os.sep)
                # List of ancestor directories, shortest path first.
                aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
                aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
                for dir in aggregate:
                        if not os.path.exists(dir):
                                os.mkdir(dir)

        @staticmethod
        def format_bytes(bytes):
                """Format a byte count as a short human-readable string, e.g. '1.25M'."""
                if bytes is None:
                        return 'N/A'
                if type(bytes) is str:
                        bytes = float(bytes)
                if bytes == 0.0:
                        exponent = 0
                else:
                        # Power of 1024 picks the suffix character below.
                        exponent = long(math.log(bytes, 1024.0))
                suffix = 'bkMGTPEZY'[exponent]
                converted = float(bytes) / float(1024**exponent)
                return '%.2f%s' % (converted, suffix)

        @staticmethod
        def calc_percent(byte_counter, data_len):
                """Return a fixed-width percentage string, or '---.-%' if the total is unknown."""
                if data_len is None:
                        return '---.-%'
                return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

        @staticmethod
        def calc_eta(start, now, total, current):
                """Estimate remaining time as 'MM:SS' from progress so far."""
                if total is None:
                        return '--:--'
                dif = now - start
                if current == 0 or dif < 0.001: # One millisecond
                        return '--:--'
                rate = float(current) / dif
                eta = long((float(total) - float(current)) / rate)
                (eta_mins, eta_secs) = divmod(eta, 60)
                # The display field only has room for two minute digits.
                if eta_mins > 99:
                        return '--:--'
                return '%02d:%02d' % (eta_mins, eta_secs)

        @staticmethod
        def calc_speed(start, now, bytes):
                """Return the average download speed so far, right-padded to 10 chars."""
                dif = now - start
                if bytes == 0 or dif < 0.001: # One millisecond
                        return '%10s' % '---b/s'
                return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

        @staticmethod
        def best_block_size(elapsed_time, bytes):
                """Adapt the next read size to the measured throughput of the last read."""
                # Never shrink below half (or 1 byte) nor grow beyond double
                # the previous block size.
                new_min = max(bytes / 2.0, 1.0)
                new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
                if elapsed_time < 0.001:
                        return long(new_max)
                rate = bytes / elapsed_time
                if rate > new_max:
                        return long(new_max)
                if rate < new_min:
                        return long(new_min)
                return long(rate)

        @staticmethod
        def parse_bytes(bytestr):
                """Parse a string indicating a byte quantity into a long integer."""
                matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
                if matchobj is None:
                        return None
                number = float(matchobj.group(1))
                # An empty suffix finds '' at index 0 of 'bkmgtpezy', i.e.
                # multiplier 1024**0 == 1.
                multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
                return long(round(number * multiplier))

        @staticmethod
        def verify_url(url):
                """Verify a URL is valid and data could be downloaded. Return real data URL."""
                request = urllib2.Request(url, None, std_headers)
                data = urllib2.urlopen(request)
                # Read a single byte to force an actual data transfer, not
                # just the response headers.
                data.read(1)
                # geturl() may differ from the input URL after redirects.
                url = data.geturl()
                data.close()
                return url

        def add_info_extractor(self, ie):
                """Add an InfoExtractor object to the end of the list."""
                self._ies.append(ie)
                ie.set_downloader(self)

        def add_post_processor(self, pp):
                """Add a PostProcessor object to the end of the chain."""
                self._pps.append(pp)
                pp.set_downloader(self)

        def to_stdout(self, message, skip_eol=False):
                """Print message to stdout if not in quiet mode."""
                if not self.params.get('quiet', False):
                        # The trailing comma suppresses print's own newline;
                        # the list index picks u'\n' or u'' based on skip_eol.
                        print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
                        sys.stdout.flush()

        def to_stderr(self, message):
                """Print message to stderr."""
                print >>sys.stderr, message.encode(preferredencoding())

        def fixed_template(self):
                """Checks if the output template is fixed (has no %(...)s substitutions)."""
                return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

        def trouble(self, message=None):
                """Determine action to take when a download problem appears.

                Depending on if the downloader has been configured to ignore
                download errors or not, this method may throw an exception or
                not when errors are found, after printing the message.
                """
                if message is not None:
                        self.to_stderr(message)
                if not self.params.get('ignoreerrors', False):
                        raise DownloadError(message)
                self._download_retcode = 1

        def slow_down(self, start_time, byte_counter):
                """Sleep if the download speed is over the rate limit."""
                rate_limit = self.params.get('ratelimit', None)
                if rate_limit is None or byte_counter == 0:
                        return
                now = time.time()
                elapsed = now - start_time
                if elapsed <= 0.0:
                        return
                speed = float(byte_counter) / elapsed
                if speed > rate_limit:
                        # Sleep exactly long enough for the average speed to
                        # fall back to the configured limit.
                        time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

        def report_destination(self, filename):
                """Report destination filename."""
                self.to_stdout(u'[download] Destination: %s' % filename)

        def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
                """Report download progress."""
                self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
                                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

        def report_resuming_byte(self, resume_len):
                """Report attempt to resume at given byte."""
                self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)

        def report_file_already_downloaded(self, file_name):
                """Report file has already been fully downloaded."""
                self.to_stdout(u'[download] %s has already been downloaded' % file_name)

        def report_unable_to_resume(self):
                """Report it was impossible to resume download."""
                self.to_stdout(u'[download] Unable to resume')

        def report_finish(self):
                """Report download finished."""
                self.to_stdout(u'')

        def process_info(self, info_dict):
                """Process a single dictionary returned by an InfoExtractor."""
                # Do nothing else if in simulate mode
                if self.params.get('simulate', False):
                        try:
                                info_dict['url'] = self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
                        except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
                                raise UnavailableFormatError

                        # Forced printings
                        if self.params.get('forcetitle', False):
                                print info_dict['title'].encode(preferredencoding())
                        if self.params.get('forceurl', False):
                                print info_dict['url'].encode(preferredencoding())

                        return

                try:
                        template_dict = dict(info_dict)
                        template_dict['epoch'] = unicode(long(time.time()))
                        filename = self.params['outtmpl'] % template_dict
                except (ValueError, KeyError), err:
                        self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
                # NOTE(review): if the template substitution above failed and
                # 'ignoreerrors' is set, 'filename' is unbound here and the
                # next line raises NameError -- confirm intended behavior.
                if self.params.get('nooverwrites', False) and os.path.exists(filename):
                        self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
                        return

                try:
                        self.pmkdir(filename)
                except (OSError, IOError), err:
                        self.trouble('ERROR: unable to create directories: %s' % str(err))
                        return

                try:
                        success = self._do_download(filename, info_dict['url'].encode('utf-8'))
                except (OSError, IOError), err:
                        # NOTE(review): local I/O errors are surfaced as a
                        # format problem -- presumably so callers retry with
                        # another format; verify against callers.
                        raise UnavailableFormatError
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.trouble('ERROR: unable to download video data: %s' % str(err))
                        return
                except (ContentTooShortError, ), err:
                        self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                        return

                if success:
                        try:
                                self.post_process(filename, info_dict)
                        except (PostProcessingError), err:
                                self.trouble('ERROR: postprocessing: %s' % str(err))
                                return

        def download(self, url_list):
                """Download a given list of URLs."""
                # A fixed template would make every URL overwrite the same
                # file on disk.
                if len(url_list) > 1 and self.fixed_template():
                        raise SameFileError(self.params['outtmpl'])

                for url in url_list:
                        suitable_found = False
                        for ie in self._ies:
                                # Go to next InfoExtractor if not suitable
                                if not ie.suitable(url):
                                        continue

                                # Suitable InfoExtractor found
                                suitable_found = True

                                # Extract information from URL and process it
                                ie.extract(url)

                                # Suitable InfoExtractor had been found; go to next URL
                                break

                        if not suitable_found:
                                self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

                return self._download_retcode

        def post_process(self, filename, ie_info):
                """Run the postprocessing chain on the given file."""
                info = dict(ie_info)
                info['filepath'] = filename
                for pp in self._pps:
                        info = pp.run(info)
                        # A postprocessor returning None stops the chain.
                        if info is None:
                                break

        def _download_with_rtmpdump(self, filename, url):
                """Download an RTMP stream by delegating to the external rtmpdump tool."""
                self.report_destination(filename)

                # Check for rtmpdump first
                try:
                        subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
                except (OSError, IOError):
                        self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
                        return False

                # Download using rtmpdump. rtmpdump returns exit code 2 when
                # the connection was interrupted and resuming appears to be
                # possible. This is part of rtmpdump's normal usage, AFAIK.
                retval = subprocess.call(['rtmpdump', '-q', '-r', url, '-o', filename] + [[], ['-e']][self.params.get('continuedl', False)])
                while retval == 2:
                        self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename), skip_eol=True)
                        time.sleep(2.0) # This seems to be needed
                        # Retry with '-e' to resume from where it stopped.
                        retval = subprocess.call(['rtmpdump', '-q', '-e', '-r', url, '-o', filename])
                if retval == 0:
                        self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
                        return True
                else:
                        self.trouble('ERROR: rtmpdump exited with code %d' % retval)
                        return False

        def _do_download(self, filename, url):
                """Download url to filename over HTTP; RTMP URLs go through rtmpdump."""
                # Attempt to download using rtmpdump
                if url.startswith('rtmp'):
                        return self._download_with_rtmpdump(filename, url)

                stream = None
                open_mode = 'wb'
                # basic_request never gets a Range header; it is the fallback
                # when the server rejects the resume request below.
                basic_request = urllib2.Request(url, None, std_headers)
                request = urllib2.Request(url, None, std_headers)

                # Establish possible resume length
                if os.path.isfile(filename):
                        resume_len = os.path.getsize(filename)
                else:
                        resume_len = 0

                # Request parameters in case of being able to resume
                if self.params.get('continuedl', False) and resume_len != 0:
                        self.report_resuming_byte(resume_len)
                        request.add_header('Range','bytes=%d-' % resume_len)
                        open_mode = 'ab'

                # Establish connection
                try:
                        data = urllib2.urlopen(request)
                except (urllib2.HTTPError, ), err:
                        if err.code != 416: #  416 is 'Requested range not satisfiable'
                                raise
                        # Unable to resume; retry without the Range header.
                        data = urllib2.urlopen(basic_request)
                        content_length = data.info()['Content-Length']

                        if content_length is not None and long(content_length) == resume_len:
                                # Because the file had already been fully downloaded
                                self.report_file_already_downloaded(filename)
                                return True
                        else:
                                # Because the server didn't let us
                                self.report_unable_to_resume()
                                open_mode = 'wb'

                # data_len stays a string (or None) exactly as taken from the
                # response headers.
                data_len = data.info().get('Content-length', None)
                data_len_str = self.format_bytes(data_len)
                byte_counter = 0
                block_size = 1024
                start = time.time()
                while True:
                        # Download and write
                        before = time.time()
                        data_block = data.read(block_size)
                        after = time.time()
                        data_block_len = len(data_block)
                        if data_block_len == 0:
                                break
                        byte_counter += data_block_len

                        # Open file just in time
                        if stream is None:
                                try:
                                        stream = open(filename, open_mode)
                                        self.report_destination(filename)
                                except (OSError, IOError), err:
                                        self.trouble('ERROR: unable to open for writing: %s' % str(err))
                                        return False
                        stream.write(data_block)
                        # Adapt the next read size to the measured throughput.
                        block_size = self.best_block_size(after - before, data_block_len)

                        # Progress message
                        percent_str = self.calc_percent(byte_counter, data_len)
                        eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
                        speed_str = self.calc_speed(start, time.time(), byte_counter)
                        self.report_progress(percent_str, data_len_str, speed_str, eta_str)

                        # Apply rate limit
                        self.slow_down(start, byte_counter)

                self.report_finish()
                # data_len is still a header string, so the counter is
                # stringified for the comparison.
                if data_len is not None and str(byte_counter) != data_len:
                        raise ContentTooShortError(byte_counter, long(data_len))
                return True
506
class InfoExtractor(object):
        """Information Extractor class.

        An information extractor takes a URL and produces, for the video (or
        videos) it refers to, a dictionary of metadata that the
        FileDownloader then processes (possibly downloading the video to
        disk). Each dictionary must provide the following fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        stitle:         Simplified title.
        ext:            Video filename extension.

        Concrete extractors override _real_initialize() and _real_extract(),
        plus the suitable() static method, and are normally instantiated and
        registered with the main downloader.
        """

        # Class-level defaults; instances overwrite them in __init__ and
        # set_downloader().
        _ready = False
        _downloader = None

        def __init__(self, downloader=None):
                """Constructor. Receives an optional downloader."""
                self._ready = False
                self.set_downloader(downloader)

        @staticmethod
        def suitable(url):
                """Receives a URL and returns True if suitable for this IE."""
                return False

        def initialize(self):
                """Initializes an instance (authentication, etc), at most once."""
                if self._ready:
                        return
                self._real_initialize()
                self._ready = True

        def extract(self, url):
                """Extracts URL information and returns it in list of dicts."""
                self.initialize()
                return self._real_extract(url)

        def set_downloader(self, downloader):
                """Sets the downloader for this IE."""
                self._downloader = downloader

        def _real_initialize(self):
                """Real initialization process. Redefine in subclasses."""
                pass

        def _real_extract(self, url):
                """Real extraction process. Redefine in subclasses."""
                pass
567
568 class YoutubeIE(InfoExtractor):
569         """Information extractor for youtube.com."""
570
571         _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
572         _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
573         _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
574         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
575         _NETRC_MACHINE = 'youtube'
576         _available_formats = ['37', '22', '35', '18', '5', '17', '13', None] # listed in order of priority for -b flag
577         _video_extensions = {
578                 '13': '3gp',
579                 '17': 'mp4',
580                 '18': 'mp4',
581                 '22': 'mp4',
582                 '37': 'mp4',
583         }
584
585         @staticmethod
586         def suitable(url):
587                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
588
589         @staticmethod
590         def htmlentity_transform(matchobj):
591                 """Transforms an HTML entity to a Unicode character."""
592                 entity = matchobj.group(1)
593
594                 # Known non-numeric HTML entity
595                 if entity in htmlentitydefs.name2codepoint:
596                         return unichr(htmlentitydefs.name2codepoint[entity])
597
598                 # Unicode character
599                 mobj = re.match(ur'(?u)#(x?\d+)', entity)
600                 if mobj is not None:
601                         numstr = mobj.group(1)
602                         if numstr.startswith(u'x'):
603                                 base = 16
604                                 numstr = u'0%s' % numstr
605                         else:
606                                 base = 10
607                         return unichr(long(numstr, base))
608
609                 # Unknown entity in name, return its literal representation
610                 return (u'&%s;' % entity)
611
612         def report_lang(self):
613                 """Report attempt to set language."""
614                 self._downloader.to_stdout(u'[youtube] Setting language')
615
616         def report_login(self):
617                 """Report attempt to log in."""
618                 self._downloader.to_stdout(u'[youtube] Logging in')
619         
620         def report_age_confirmation(self):
621                 """Report attempt to confirm age."""
622                 self._downloader.to_stdout(u'[youtube] Confirming age')
623         
624         def report_video_info_webpage_download(self, video_id):
625                 """Report attempt to download video info webpage."""
626                 self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)
627         
628         def report_information_extraction(self, video_id):
629                 """Report attempt to extract video information."""
630                 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
631         
632         def report_unavailable_format(self, video_id, format):
633                 """Report extracted video URL."""
634                 self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
635         
636         def report_rtmp_download(self):
637                 """Indicate the download will use the RTMP protocol."""
638                 self._downloader.to_stdout(u'[youtube] RTMP download detected')
639         
	def _real_initialize(self):
		"""Prepare the YouTube session: set language, optionally log in, confirm age.

		Credentials come from explicit downloader params ('username'/'password')
		or, with 'usenetrc', from the user's .netrc. Login and language failures
		are warnings only (extraction proceeds unauthenticated); a failed age
		confirmation is reported as an error.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language so later page scrapes see predictable (English) text
		request = urllib2.Request(self._LANG_URL, None, std_headers)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present in the reply, the credentials were rejected
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
708
	def _real_extract(self, url):
		"""Extract download information for one YouTube video and hand it to the downloader.

		If the user requested format '0' ("best quality"), retries down
		self._available_formats whenever the downloader rejects the current
		format with UnavailableFormatError.
		"""
		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Downloader parameters
		best_quality = False
		format_param = None
		quality_index = 0
		if self._downloader is not None:
			params = self._downloader.params
			format_param = params.get('format', None)
			if format_param == '0':
				# '0' means best available: start at the top of the preference list
				format_param = self._available_formats[quality_index]
				best_quality = True

		while True:
			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			# Get video info
			video_info_url = 'http://www.youtube.com/get_video_info?&video_id=%s&el=detailpage&ps=default&eurl=&gl=US&hl=en' % video_id
			request = urllib2.Request(video_info_url, None, std_headers)
			try:
				self.report_video_info_webpage_download(video_id)
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
			self.report_information_extraction(video_id)

			# "t" param
			if 'token' not in video_info:
				# Attempt to see if YouTube has issued an error message
				if 'reason' not in video_info:
					self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
					# Dump the raw response to disk so the user can attach it to a bug report
					stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
					stream.write(video_info_webpage)
					stream.close()
				else:
					reason = urllib.unquote_plus(video_info['reason'][0])
					self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
				return
			token = urllib.unquote_plus(video_info['token'][0])
			video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
			if format_param is not None:
				video_real_url = '%s&fmt=%s' % (video_real_url, format_param)

			# Check possible RTMP download
			if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
				self.report_rtmp_download()
				video_real_url = video_info['conn'][0]

			# uploader
			if 'author' not in video_info:
				self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
				return
			video_uploader = urllib.unquote_plus(video_info['author'][0])

			# title
			if 'title' not in video_info:
				self._downloader.trouble(u'ERROR: unable to extract video title')
				return
			video_title = urllib.unquote_plus(video_info['title'][0])
			video_title = video_title.decode('utf-8')
			video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
			# os.sep cannot appear in a filename component; replace it
			video_title = video_title.replace(os.sep, u'%')

			# simplified title: collapse anything outside [A-Za-z0-9] runs to '_'
			simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
			simple_title = simple_title.strip(ur'_')

			try:
				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
				})

				return

			except UnavailableFormatError, err:
				if best_quality:
					if quality_index == len(self._available_formats) - 1:
						# I don't ever expect this to happen
						self._downloader.trouble(u'ERROR: no known formats available for video')
						return
					else:
						# Fall back to the next-best format and retry
						self.report_unavailable_format(video_id, format_param)
						quality_index += 1
						format_param = self._available_formats[quality_index]
						continue
				else:
					self._downloader.trouble('ERROR: format not available for video')
					return
812
813
814 class MetacafeIE(InfoExtractor):
815         """Information Extractor for metacafe.com."""
816
817         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
818         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
819         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
820         _youtube_ie = None
821
822         def __init__(self, youtube_ie, downloader=None):
823                 InfoExtractor.__init__(self, downloader)
824                 self._youtube_ie = youtube_ie
825
826         @staticmethod
827         def suitable(url):
828                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
829
830         def report_disclaimer(self):
831                 """Report disclaimer retrieval."""
832                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
833
834         def report_age_confirmation(self):
835                 """Report attempt to confirm age."""
836                 self._downloader.to_stdout(u'[metacafe] Confirming age')
837         
838         def report_download_webpage(self, video_id):
839                 """Report webpage download."""
840                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
841         
842         def report_extraction(self, video_id):
843                 """Report information extraction."""
844                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
845
846         def _real_initialize(self):
847                 # Retrieve disclaimer
848                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
849                 try:
850                         self.report_disclaimer()
851                         disclaimer = urllib2.urlopen(request).read()
852                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
853                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
854                         return
855
856                 # Confirm age
857                 disclaimer_form = {
858                         'filters': '0',
859                         'submit': "Continue - I'm over 18",
860                         }
861                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
862                 try:
863                         self.report_age_confirmation()
864                         disclaimer = urllib2.urlopen(request).read()
865                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
866                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
867                         return
868         
869         def _real_extract(self, url):
870                 # Extract id and simplified title from URL
871                 mobj = re.match(self._VALID_URL, url)
872                 if mobj is None:
873                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
874                         return
875
876                 video_id = mobj.group(1)
877
878                 # Check if video comes from YouTube
879                 mobj2 = re.match(r'^yt-(.*)$', video_id)
880                 if mobj2 is not None:
881                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
882                         return
883
884                 simple_title = mobj.group(2).decode('utf-8')
885                 video_extension = 'flv'
886
887                 # Retrieve video webpage to extract further information
888                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
889                 try:
890                         self.report_download_webpage(video_id)
891                         webpage = urllib2.urlopen(request).read()
892                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
893                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
894                         return
895
896                 # Extract URL, uploader and title from webpage
897                 self.report_extraction(video_id)
898                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
899                 if mobj is None:
900                         self._downloader.trouble(u'ERROR: unable to extract media URL')
901                         return
902                 mediaURL = urllib.unquote(mobj.group(1))
903
904                 #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
905                 #if mobj is None:
906                 #       self._downloader.trouble(u'ERROR: unable to extract gdaKey')
907                 #       return
908                 #gdaKey = mobj.group(1)
909                 #
910                 #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
911
912                 video_url = mediaURL
913
914                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
915                 if mobj is None:
916                         self._downloader.trouble(u'ERROR: unable to extract title')
917                         return
918                 video_title = mobj.group(1).decode('utf-8')
919
920                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
921                 if mobj is None:
922                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
923                         return
924                 video_uploader = mobj.group(1)
925
926                 try:
927                         # Process video information
928                         self._downloader.process_info({
929                                 'id':           video_id.decode('utf-8'),
930                                 'url':          video_url.decode('utf-8'),
931                                 'uploader':     video_uploader.decode('utf-8'),
932                                 'title':        video_title,
933                                 'stitle':       simple_title,
934                                 'ext':          video_extension.decode('utf-8'),
935                         })
936                 except UnavailableFormatError:
937                         self._downloader.trouble(u'ERROR: format not available for video')
938
939
940 class GoogleIE(InfoExtractor):
941         """Information extractor for video.google.com."""
942
943         _VALID_URL = r'(?:http://)?video\.google\.com/videoplay\?docid=([^\&]+).*'
944
945         def __init__(self, downloader=None):
946                 InfoExtractor.__init__(self, downloader)
947
948         @staticmethod
949         def suitable(url):
950                 return (re.match(GoogleIE._VALID_URL, url) is not None)
951
952         def report_download_webpage(self, video_id):
953                 """Report webpage download."""
954                 self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
955
956         def report_extraction(self, video_id):
957                 """Report information extraction."""
958                 self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
959
960         def _real_initialize(self):
961                 return
962
963         def _real_extract(self, url):
964                 # Extract id from URL
965                 mobj = re.match(self._VALID_URL, url)
966                 if mobj is None:
967                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
968                         return
969
970                 video_id = mobj.group(1)
971
972                 video_extension = 'mp4'
973
974                 # Retrieve video webpage to extract further information
975                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s' % video_id)
976                 try:
977                         self.report_download_webpage(video_id)
978                         webpage = urllib2.urlopen(request).read()
979                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
980                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
981                         return
982
983                 # Extract URL, uploader, and title from webpage
984                 self.report_extraction(video_id)
985                 mobj = re.search(r"download_url:'(.*)'", webpage)
986                 if mobj is None:
987                         self._downloader.trouble(u'ERROR: unable to extract media URL')
988                         return
989                 mediaURL = urllib.unquote(mobj.group(1))
990                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
991                 mediaURL = mediaURL.replace('\\x26', '\x26')
992
993                 video_url = mediaURL
994
995                 mobj = re.search(r'<title>(.*)</title>', webpage)
996                 if mobj is None:
997                         self._downloader.trouble(u'ERROR: unable to extract title')
998                         return
999                 video_title = mobj.group(1).decode('utf-8')
1000
1001                 # Google Video doesn't show uploader nicknames?
1002                 video_uploader = 'uploader'
1003
1004                 try:
1005                         # Process video information
1006                         self._downloader.process_info({
1007                                 'id':           video_id.decode('utf-8'),
1008                                 'url':          video_url.decode('utf-8'),
1009                                 'uploader':     video_uploader.decode('utf-8'),
1010                                 'title':        video_title.decode('utf-8'),
1011                                 'stitle':       video_title.decode('utf-8'),
1012                                 'ext':          video_extension.decode('utf-8'),
1013                         })
1014                 except UnavailableFormatError:
1015                         self._downloader.trouble(u'ERROR: format not available for video')
1016
1017
1018 class PhotobucketIE(InfoExtractor):
1019         """Information extractor for photobucket.com."""
1020
1021         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1022
1023         def __init__(self, downloader=None):
1024                 InfoExtractor.__init__(self, downloader)
1025
1026         @staticmethod
1027         def suitable(url):
1028                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1029
1030         def report_download_webpage(self, video_id):
1031                 """Report webpage download."""
1032                 self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1033
1034         def report_extraction(self, video_id):
1035                 """Report information extraction."""
1036                 self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
1037
1038         def _real_initialize(self):
1039                 return
1040
1041         def _real_extract(self, url):
1042                 # Extract id from URL
1043                 mobj = re.match(self._VALID_URL, url)
1044                 if mobj is None:
1045                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1046                         return
1047
1048                 video_id = mobj.group(1)
1049
1050                 video_extension = 'flv'
1051
1052                 # Retrieve video webpage to extract further information
1053                 request = urllib2.Request(url)
1054                 try:
1055                         self.report_download_webpage(video_id)
1056                         webpage = urllib2.urlopen(request).read()
1057                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1058                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1059                         return
1060
1061                 # Extract URL, uploader, and title from webpage
1062                 self.report_extraction(video_id)
1063                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1064                 if mobj is None:
1065                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1066                         return
1067                 mediaURL = urllib.unquote(mobj.group(1))
1068
1069                 video_url = mediaURL
1070
1071                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1072                 if mobj is None:
1073                         self._downloader.trouble(u'ERROR: unable to extract title')
1074                         return
1075                 video_title = mobj.group(1).decode('utf-8')
1076
1077                 video_uploader = mobj.group(2).decode('utf-8')
1078
1079                 try:
1080                         # Process video information
1081                         self._downloader.process_info({
1082                                 'id':           video_id.decode('utf-8'),
1083                                 'url':          video_url.decode('utf-8'),
1084                                 'uploader':     video_uploader.decode('utf-8'),
1085                                 'title':        video_title.decode('utf-8'),
1086                                 'stitle':       video_title.decode('utf-8'),
1087                                 'ext':          video_extension.decode('utf-8'),
1088                         })
1089                 except UnavailableFormatError:
1090                         self._downloader.trouble(u'ERROR: format not available for video')
1091
1092
1093 class YoutubeSearchIE(InfoExtractor):
1094         """Information Extractor for YouTube search queries."""
1095         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1096         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1097         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1098         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1099         _youtube_ie = None
1100         _max_youtube_results = 1000
1101
1102         def __init__(self, youtube_ie, downloader=None):
1103                 InfoExtractor.__init__(self, downloader)
1104                 self._youtube_ie = youtube_ie
1105         
1106         @staticmethod
1107         def suitable(url):
1108                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1109
1110         def report_download_page(self, query, pagenum):
1111                 """Report attempt to download playlist page with given number."""
1112                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1113
1114         def _real_initialize(self):
1115                 self._youtube_ie.initialize()
1116         
1117         def _real_extract(self, query):
1118                 mobj = re.match(self._VALID_QUERY, query)
1119                 if mobj is None:
1120                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1121                         return
1122
1123                 prefix, query = query.split(':')
1124                 prefix = prefix[8:]
1125                 if prefix == '':
1126                         self._download_n_results(query, 1)
1127                         return
1128                 elif prefix == 'all':
1129                         self._download_n_results(query, self._max_youtube_results)
1130                         return
1131                 else:
1132                         try:
1133                                 n = long(prefix)
1134                                 if n <= 0:
1135                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1136                                         return
1137                                 elif n > self._max_youtube_results:
1138                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1139                                         n = self._max_youtube_results
1140                                 self._download_n_results(query, n)
1141                                 return
1142                         except ValueError: # parsing prefix as integer fails
1143                                 self._download_n_results(query, 1)
1144                                 return
1145
1146         def _download_n_results(self, query, n):
1147                 """Downloads a specified number of results for a query"""
1148
1149                 video_ids = []
1150                 already_seen = set()
1151                 pagenum = 1
1152
1153                 while True:
1154                         self.report_download_page(query, pagenum)
1155                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1156                         request = urllib2.Request(result_url, None, std_headers)
1157                         try:
1158                                 page = urllib2.urlopen(request).read()
1159                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1160                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1161                                 return
1162
1163                         # Extract video identifiers
1164                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1165                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1166                                 if video_id not in already_seen:
1167                                         video_ids.append(video_id)
1168                                         already_seen.add(video_id)
1169                                         if len(video_ids) == n:
1170                                                 # Specified n videos reached
1171                                                 for id in video_ids:
1172                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1173                                                 return
1174
1175                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1176                                 for id in video_ids:
1177                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1178                                 return
1179
1180                         pagenum = pagenum + 1
1181
1182 class YoutubePlaylistIE(InfoExtractor):
1183         """Information Extractor for YouTube playlists."""
1184
1185         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:view_play_list|my_playlists)\?.*?p=([^&]+).*'
1186         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1187         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1188         _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&page=%s'
1189         _youtube_ie = None
1190
1191         def __init__(self, youtube_ie, downloader=None):
1192                 InfoExtractor.__init__(self, downloader)
1193                 self._youtube_ie = youtube_ie
1194         
1195         @staticmethod
1196         def suitable(url):
1197                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1198
1199         def report_download_page(self, playlist_id, pagenum):
1200                 """Report attempt to download playlist page with given number."""
1201                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1202
1203         def _real_initialize(self):
1204                 self._youtube_ie.initialize()
1205         
1206         def _real_extract(self, url):
1207                 # Extract playlist id
1208                 mobj = re.match(self._VALID_URL, url)
1209                 if mobj is None:
1210                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1211                         return
1212
1213                 # Download playlist pages
1214                 playlist_id = mobj.group(1)
1215                 video_ids = []
1216                 pagenum = 1
1217
1218                 while True:
1219                         self.report_download_page(playlist_id, pagenum)
1220                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1221                         try:
1222                                 page = urllib2.urlopen(request).read()
1223                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1224                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1225                                 return
1226
1227                         # Extract video identifiers
1228                         ids_in_page = []
1229                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1230                                 if mobj.group(1) not in ids_in_page:
1231                                         ids_in_page.append(mobj.group(1))
1232                         video_ids.extend(ids_in_page)
1233
1234                         if (self._MORE_PAGES_INDICATOR % (playlist_id.upper(), pagenum + 1)) not in page:
1235                                 break
1236                         pagenum = pagenum + 1
1237
1238                 for id in video_ids:
1239                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1240                 return
1241
1242 class YoutubeUserIE(InfoExtractor):
1243         """Information Extractor for YouTube users."""
1244
1245         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1246         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1247         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1248         _youtube_ie = None
1249
1250         def __init__(self, youtube_ie, downloader=None):
1251                 InfoExtractor.__init__(self, downloader)
1252                 self._youtube_ie = youtube_ie
1253         
1254         @staticmethod
1255         def suitable(url):
1256                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1257
1258         def report_download_page(self, username):
1259                 """Report attempt to download user page."""
1260                 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1261
1262         def _real_initialize(self):
1263                 self._youtube_ie.initialize()
1264         
1265         def _real_extract(self, url):
1266                 # Extract username
1267                 mobj = re.match(self._VALID_URL, url)
1268                 if mobj is None:
1269                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1270                         return
1271
1272                 # Download user page
1273                 username = mobj.group(1)
1274                 video_ids = []
1275                 pagenum = 1
1276
1277                 self.report_download_page(username)
1278                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1279                 try:
1280                         page = urllib2.urlopen(request).read()
1281                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1282                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1283                         return
1284
1285                 # Extract video identifiers
1286                 ids_in_page = []
1287
1288                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1289                         if mobj.group(1) not in ids_in_page:
1290                                 ids_in_page.append(mobj.group(1))
1291                 video_ids.extend(ids_in_page)
1292
1293                 for id in video_ids:
1294                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1295                 return
1296
class PostProcessor(object):
	"""Base class for post-download processing steps.

	Instances are registered on a downloader through its
	add_post_processor() method. After a successful download, the
	downloader walks its chain of PostProcessors, calling run() on
	each one — first with an initial argument, then with whatever the
	previous processor returned — and stops as soon as one of them
	returns None or the chain is exhausted.

	Like InfoExtractor objects, PostProcessors take part in a "mutual
	registration" scheme with their downloader.
	"""

	# Downloader this processor is attached to, set via the
	# constructor or set_downloader().
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach this post processor to the given downloader."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		"information" is a dictionary like the ones composed by
		InfoExtractors, extended with a "filepath" entry pointing at
		the downloaded file.

		Returning None halts the postprocessing chain; returning an
		information dictionary (possibly the received one, with some
		fields changed) passes it along to the next processor in the
		chain. A PostProcessingError exception may also be raised and
		will be handled by the downloader that invoked this processor.
		"""
		# Default behavior: hand the information through unchanged.
		return information
1342         
### MAIN PROGRAM ###
# Command-line entry point: parses options, wires up the info extractors and
# the FileDownloader, then downloads every URL given on the command line
# and/or read from a batch file.
if __name__ == '__main__':
	try:
		# Modules needed only when running the main program
		import getpass
		import optparse

		# Function to update the program file with the latest version from bitbucket.org
		def update_self(downloader, filename):
			# Note: downloader only used for options
			# NOTE(review): the new program text is fetched over plain HTTP
			# with no integrity/signature check — the replacement code is
			# trusted blindly.
			if not os.access (filename, os.W_OK):
				sys.exit('ERROR: no write permissions on %s' % filename)

			downloader.to_stdout('Updating to latest stable version...')
			# LATEST_VERSION names the changeset tag used to build the raw URL below.
			latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
			latest_version = urllib.urlopen(latest_url).read().strip()
			prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
			newcontent = urllib.urlopen(prog_url).read()
			# NOTE(review): text-mode 'w' — on Windows this would translate
			# line endings in the downloaded script; confirm intended.
			stream = open(filename, 'w')
			stream.write(newcontent)
			stream.close()
			downloader.to_stdout('Updated to version %s' % latest_version)

		# General configuration
		# NOTE: each install_opener() call replaces the previously installed
		# opener, so only the second one (with the cookie processor) is in
		# effect; build_opener() adds default handlers, including proxy
		# support, on its own.
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
		urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Parse command line
		parser = optparse.OptionParser(
			usage='Usage: %prog [options] url...',
			version='2010.01.06',
			conflict_handler='resolve',
		)

		parser.add_option('-h', '--help',
				action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
				action='version', help='print program version and exit')
		parser.add_option('-U', '--update',
				action='store_true', dest='update_self', help='update this program to latest stable version')
		parser.add_option('-i', '--ignore-errors',
				action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		parser.add_option('-r', '--rate-limit',
				dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')

		authentication = optparse.OptionGroup(parser, 'Authentication Options')
		authentication.add_option('-u', '--username',
				dest='username', metavar='UN', help='account username')
		authentication.add_option('-p', '--password',
				dest='password', metavar='PW', help='account password')
		authentication.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option_group(authentication)

		video_format = optparse.OptionGroup(parser, 'Video Format Options')
		video_format.add_option('-f', '--format',
				action='store', dest='format', metavar='FMT', help='video format code')
		# -b/-m/-d are aliases that store fixed format codes into the same dest.
		video_format.add_option('-b', '--best-quality',
				action='store_const', dest='format', help='download the best quality video possible', const='0')
		video_format.add_option('-m', '--mobile-version',
				action='store_const', dest='format', help='alias for -f 17', const='17')
		video_format.add_option('-d', '--high-def',
				action='store_const', dest='format', help='alias for -f 22', const='22')
		parser.add_option_group(video_format)

		verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
		verbosity.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		verbosity.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		verbosity.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		verbosity.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		parser.add_option_group(verbosity)

		filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
		filesystem.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		filesystem.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		filesystem.add_option('-o', '--output',
				dest='outtmpl', metavar='TPL', help='output filename template')
		filesystem.add_option('-a', '--batch-file',
				dest='batchfile', metavar='F', help='file containing URLs to download')
		filesystem.add_option('-w', '--no-overwrites',
				action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
		filesystem.add_option('-c', '--continue',
				action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
		parser.add_option_group(filesystem)

		(opts, args) = parser.parse_args()
	
		# Batch file verification
		# URLs from the batch file (one per line, blanks dropped) are
		# prepended to the positional arguments.
		batchurls = []
		if opts.batchfile is not None:
			try:
				batchurls = open(opts.batchfile, 'r').readlines()
				batchurls = [x.strip() for x in batchurls]
				batchurls = [x for x in batchurls if len(x) > 0]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		all_urls = batchurls + args

		# Conflicting, missing and erroneous options
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			parser.error(u'using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			parser.error(u'account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
			parser.error(u'using output template conflicts with using title or literal title')
		if opts.usetitle and opts.useliteral:
			parser.error(u'using title conflicts with using literal title')
		# Prompt interactively only when a username was supplied without a password.
		if opts.username is not None and opts.password is None:
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			# Convert "50k"/"44.6m"-style limits to a byte count; None means malformed.
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				parser.error(u'invalid rate limit specified')
			opts.ratelimit = numeric_limit

		# Information extractors
		# The playlist/user/search extractors wrap youtube_ie and delegate
		# individual videos to it; metacafe_ie also delegates YouTube links.
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_user_ie = YoutubeUserIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)
		google_ie = GoogleIE()
		photobucket_ie = PhotobucketIE()

		# File downloader
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			# -g and -e imply both quiet and simulate.
			'quiet': (opts.quiet or opts.geturl or opts.gettitle),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle),
			'format': opts.format,
			# Output template precedence: explicit -o, then -t (sanitized
			# title), then -l (literal title), then the bare video id.
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			'continuedl': opts.continue_dl,
			})
		# Registration order matters: more specific extractors (search,
		# playlist, user) must be consulted before the generic YouTube one.
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(youtube_user_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(youtube_ie)
		fd.add_info_extractor(google_ie)
		fd.add_info_extractor(photobucket_ie)

		# Update version
		if opts.update_self:
			update_self(fd, sys.argv[0])

		# Maybe do nothing
		# -U with no URLs is a valid invocation; otherwise at least one URL
		# is required.
		if len(all_urls) < 1:
			if not opts.update_self:
				parser.error(u'you must provide at least one URL')
			else:
				sys.exit()
		# download() returns the process exit code (nonzero on any failure).
		retcode = fd.download(all_urls)
		sys.exit(retcode)

	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')