Standardized the use of unescapeHTML; added clean_html().
[youtube-dl] / youtube_dl / __init__.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 __authors__  = (
5         'Ricardo Garcia Gonzalez',
6         'Danny Colligan',
7         'Benjamin Johnson',
8         'Vasyl\' Vavrychuk',
9         'Witold Baryluk',
10         'Paweł Paprota',
11         'Gergely Imreh',
12         'Rogério Brito',
13         'Philipp Hagemeister',
14         'Sören Schulze',
15         'Kevin Ngo',
16         'Ori Avtalion',
17         'shizeeg',
18         'Filippo Valsorda',
19         )
20
21 __license__ = 'Public Domain'
22 __version__ = '2012.02.27'
23
24 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
25
26
27 import cookielib
28 import datetime
29 import getpass
30 import gzip
31 import htmlentitydefs
32 import HTMLParser
33 import httplib
34 import locale
35 import math
36 import netrc
37 import optparse
38 import os
39 import os.path
40 import re
41 import shlex
42 import socket
43 import string
44 import subprocess
45 import sys
46 import time
47 import urllib
48 import urllib2
49 import warnings
50 import zlib
51
52 if os.name == 'nt':
53         import ctypes
54
55 try:
56         import email.utils
57 except ImportError: # Python 2.4
58         import email.Utils
59 try:
60         import cStringIO as StringIO
61 except ImportError:
62         import StringIO
63
64 # parse_qs was moved from the cgi module to the urlparse module recently.
65 try:
66         from urlparse import parse_qs
67 except ImportError:
68         from cgi import parse_qs
69
70 try:
71         import lxml.etree
72 except ImportError:
73         pass # Handled below
74
75 try:
76         import xml.etree.ElementTree
77 except ImportError: # Python<2.5: Not officially supported, but let it slip
78         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
79
# Default HTTP headers sent with every request. A desktop Firefox
# User-Agent is advertised so sites serve their regular pages;
# compressed (gzip/deflate) responses are transparently decoded by
# YoutubeDLHandler below.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Encoding': 'gzip, deflate',
	'Accept-Language': 'en-us,en;q=0.5',
}
87
try:
	import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
	import re
	class json(object):
		"""Minimal drop-in replacement for the stdlib json module.

		Only decoding (loads) is provided; it is enough for the JSON
		payloads this program parses.
		"""
		@staticmethod
		def loads(s):
			"""Decode the UTF-8 byte string *s* into Python objects.

			Raises ValueError on malformed input or trailing data.
			"""
			s = s.decode('UTF-8')
			def raiseError(msg, i):
				# All parse errors funnel through here with position context.
				raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
			def skipSpace(i, expectMore=True):
				# Advance past JSON whitespace; optionally require that
				# more input follows (used inside unfinished constructs).
				while i < len(s) and s[i] in ' \t\r\n':
					i += 1
				if expectMore:
					if i >= len(s):
						raiseError('Premature end', i)
				return i
			def decodeEscape(match):
				# Translate one backslash escape (from parseString's regex)
				# into the character it denotes.
				esc = match.group(1)
				_STATIC = {
					'"': '"',
					'\\': '\\',
					'/': '/',
					'b': unichr(0x8),
					'f': unichr(0xc),
					'n': '\n',
					'r': '\r',
					't': '\t',
				}
				if esc in _STATIC:
					return _STATIC[esc]
				if esc[0] == 'u':
					if len(esc) == 1+4:
						# Plain \uXXXX escape
						return unichr(int(esc[1:5], 16))
					if len(esc) == 5+6 and esc[5:7] == '\\u':
						# Surrogate pair \uD8xx\uDCxx -> single code point
						hi = int(esc[1:5], 16)
						low = int(esc[7:11], 16)
						return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
				raise ValueError('Unknown escape ' + str(esc))
			def parseString(i):
				# Parse a quoted string starting at s[i] == '"'.
				i += 1
				e = i
				while True:
					# Find the closing quote, skipping quotes that are
					# preceded by an odd number of backslashes (escaped).
					e = s.index('"', e)
					bslashes = 0
					while s[e-bslashes-1] == '\\':
						bslashes += 1
					if bslashes % 2 == 1:
						e += 1
						continue
					break
				rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
				stri = rexp.sub(decodeEscape, s[i:e])
				return (e+1,stri)
			def parseObj(i):
				# Parse an object starting at s[i] == '{'.
				i += 1
				res = {}
				i = skipSpace(i)
				if s[i] == '}': # Empty dictionary
					return (i+1,res)
				while True:
					if s[i] != '"':
						raiseError('Expected a string object key', i)
					i,key = parseString(i)
					i = skipSpace(i)
					if i >= len(s) or s[i] != ':':
						raiseError('Expected a colon', i)
					i,val = parse(i+1)
					res[key] = val
					i = skipSpace(i)
					if s[i] == '}':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected comma or closing curly brace', i)
					i = skipSpace(i+1)
			def parseArray(i):
				# Parse an array starting at s[i] == '['.
				res = []
				i = skipSpace(i+1)
				if s[i] == ']': # Empty array
					return (i+1,res)
				while True:
					i,val = parse(i)
					res.append(val)
					i = skipSpace(i) # Raise exception if premature end
					if s[i] == ']':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected a comma or closing bracket', i)
					i = skipSpace(i+1)
			def parseDiscrete(i):
				# Parse the literals true/false/null.
				for k,v in {'true': True, 'false': False, 'null': None}.items():
					if s.startswith(k, i):
						return (i+len(k), v)
				raiseError('Not a boolean (or null)', i)
			def parseNumber(i):
				# Parse an int or float; floats are detected by the
				# presence of '.', 'e' or 'E' in the matched text.
				mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
				if mobj is None:
					raiseError('Not a number', i)
				nums = mobj.group(1)
				if '.' in nums or 'e' in nums or 'E' in nums:
					return (i+len(nums), float(nums))
				return (i+len(nums), int(nums))
			# Dispatch on the first character of a value; anything not
			# listed here is assumed to be a number.
			CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
			def parse(i):
				i = skipSpace(i)
				i,res = CHARMAP.get(s[i], parseNumber)(i)
				i = skipSpace(i, False)
				return (i,res)
			i,res = parse(0)
			if i < len(s):
				raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
			return res
200
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	# The original wrapped this in a one-shot generator consumed with the
	# Python-2-only .next() method; a plain try/except does the same job.
	try:
		pref = locale.getpreferredencoding()
		# Make sure the reported encoding actually works; otherwise fall
		# back to UTF-8 (locale may report an unknown/broken codec).
		u'TEST'.encode(pref)
	except Exception:
		pref = 'UTF-8'
	return pref
216
217
def htmlentity_transform(matchobj):
	"""Transforms an HTML entity to a Unicode character.

	This function receives a match object and is intended to be used with
	the re.sub() function.
	"""
	entity = matchobj.group(1)

	# Known non-numeric HTML entity
	if entity in htmlentitydefs.name2codepoint:
		return unichr(htmlentitydefs.name2codepoint[entity])

	# Numeric character reference: decimal (&#169;) or hexadecimal (&#xA9;).
	# The old pattern used x?\d+, which missed hex references containing
	# the digits a-f (e.g. &#x1A;) because \d only matches 0-9.
	mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
	if mobj is not None:
		numstr = mobj.group(1)
		if numstr.startswith(u'x'):
			base = 16
			# Prefix with '0' so long() sees a valid '0x...' literal
			numstr = u'0%s' % numstr
		else:
			base = 10
		return unichr(long(numstr, base))

	# Unknown entity in name, return its literal representation
	return (u'&%s;' % entity)
243
244
def clean_html(html):
	"""Clean an HTML snippet into a readable string"""
	# Literal newlines become spaces; <br> (in any spelling) becomes a newline.
	html = re.sub('<\\s*br\\s*/?\\s*>', '\n', html.replace('\n', ' '))
	# Drop all remaining tags.
	html = re.sub('<.*?>', '', html)
	# Expand HTML entities into the characters they stand for.
	return re.sub(u'(?u)&(.+?);', htmlentity_transform, html)
255
256
def sanitize_title(utitle):
	"""Sanitizes a video title so it could be used as part of a filename."""
	# Expand HTML entities first, then neutralize path separators.
	decoded = re.sub(u'(?u)&(.+?);', htmlentity_transform, utitle)
	return decoded.replace(unicode(os.sep), u'%')
261
262
def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	function.

	It returns the tuple (stream, definitive_file_name).
	"""
	try:
		if filename == u'-':
			# u'-' means "write to standard output"
			if sys.platform == 'win32':
				# Switch stdout to binary mode so video data is not mangled
				import msvcrt
				msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
			return (sys.stdout, filename)
		stream = open(_encodeFilename(filename), open_mode)
		return (stream, filename)
	except (IOError, OSError), err:
		# In case of error, try to remove win32 forbidden chars
		filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

		# An exception here should be caught in the caller
		stream = open(_encodeFilename(filename), open_mode)
		return (stream, filename)
288
289
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp"""
	# parsedate_tz returns None for unparseable input, in which case we
	# propagate None to the caller.
	parsed = email.utils.parsedate_tz(timestr)
	if parsed is None:
		return None
	return email.utils.mktime_tz(parsed)
297
298 def _simplify_title(title):
299         expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
300         return expr.sub(u'_', title).strip(u'_')
301
302 def _orderedSet(iterable):
303         """ Remove all duplicates from the input iterable """
304         res = []
305         for el in iterable:
306                 if el not in res:
307                         res.append(el)
308         return res
309
def _unescapeHTML(s):
	"""Replace HTML entities in s with the characters they stand for.

	@param s a string (of type unicode)
	"""
	assert type(s) == type(u'')
	return HTMLParser.HTMLParser().unescape(s)
318
319 def _encodeFilename(s):
320         """
321         @param s The name of the file (of type unicode)
322         """
323
324         assert type(s) == type(u'')
325
326         if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
327                 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
328                 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
329                 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
330                 return s
331         else:
332                 return s.encode(sys.getfilesystemencoding(), 'ignore')
333
class DownloadError(Exception):
	"""Raised when a download fails.

	FileDownloader objects throw this exception when they are not
	configured to continue on errors; it carries the appropriate
	error message.
	"""
	pass
342
343
class SameFileError(Exception):
	"""Raised when two downloads would collide on one output file.

	FileDownloader objects throw this if they detect that multiple
	files would have to be written to the same path on disk.
	"""
	pass
351
352
class PostProcessingError(Exception):
	"""Raised by a PostProcessor's .run() method.

	Indicates that an error occurred during the postprocessing task.
	"""
	pass
360
class MaxDownloadsReached(Exception):
	""" --max-downloads limit has been reached. """
	pass
364
365
class UnavailableVideoError(Exception):
	"""Raised when a video is requested in an unavailable format.

	Thrown when the requested format does not exist for that video.
	"""
	pass
373
374
class ContentTooShortError(Exception):
	"""Raised when a download is shorter than the announced size.

	FileDownloader objects raise this when the file they downloaded is
	smaller than what the server announced first, indicating the
	connection was probably interrupted.
	"""
	# Both counters are in bytes
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.downloaded = downloaded
		self.expected = expected
389
390
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.

	Part of this code was copied from:

	http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	public domain.
	"""

	@staticmethod
	def deflate(data):
		# Decompress a deflate payload; some servers send a raw stream
		# (no zlib header), so try that first and fall back to standard.
		try:
			return zlib.decompress(data, -zlib.MAX_WBITS)
		except zlib.error:
			return zlib.decompress(data)

	@staticmethod
	def addinfourl_wrapper(stream, headers, url, code):
		# Build an addinfourl that carries the HTTP status code even on
		# Python versions whose constructor does not accept one.
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)
		ret.code = code
		return ret

	def http_request(self, req):
		# Force the standard headers, replacing any caller-supplied value.
		for h in std_headers:
			if h in req.headers:
				del req.headers[h]
			req.add_header(h, std_headers[h])
		# Honour the internal "no compression" marker and strip it before
		# the request goes on the wire. NOTE(review): the lowercase
		# spelling presumably matches how urllib2 stores header names —
		# verify if this ever stops matching.
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']
		return req

	def http_response(self, req, resp):
		# Wrap compressed responses in a decoding stream, preserving the
		# original headers, URL, status code and message.
		old_resp = resp
		# gzip
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# deflate
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		return resp
448
449
450 class FileDownloader(object):
451         """File Downloader class.
452
453         File downloader objects are the ones responsible of downloading the
454         actual video file and writing it to disk if the user has requested
455         it, among some other tasks. In most cases there should be one per
456         program. As, given a video URL, the downloader doesn't know how to
457         extract all the needed information, task that InfoExtractors do, it
458         has to pass the URL to one of them.
459
460         For this, file downloader objects have a method that allows
461         InfoExtractors to be registered in a given order. When it is passed
462         a URL, the file downloader handles it to the first InfoExtractor it
463         finds that reports being able to handle it. The InfoExtractor extracts
464         all the information about the video or videos the URL refers to, and
465         asks the FileDownloader to process the video information, possibly
466         downloading the video.
467
468         File downloaders accept a lot of parameters. In order not to saturate
469         the object constructor with arguments, it receives a dictionary of
470         options instead. These options are available through the params
471         attribute for the InfoExtractors to use. The FileDownloader also
472         registers itself as the downloader in charge for the InfoExtractors
473         that are added to it, so this is a "mutual registration".
474
475         Available options:
476
477         username:         Username for authentication purposes.
478         password:         Password for authentication purposes.
479         usenetrc:         Use netrc for authentication instead.
480         quiet:            Do not print messages to stdout.
481         forceurl:         Force printing final URL.
482         forcetitle:       Force printing title.
483         forcethumbnail:   Force printing thumbnail URL.
484         forcedescription: Force printing description.
485         forcefilename:    Force printing final filename.
486         simulate:         Do not download the video files.
487         format:           Video format code.
488         format_limit:     Highest quality format to try.
489         outtmpl:          Template for output names.
490         ignoreerrors:     Do not stop on download errors.
491         ratelimit:        Download speed limit, in bytes/sec.
492         nooverwrites:     Prevent overwriting files.
493         retries:          Number of times to retry for HTTP error 5xx
494         continuedl:       Try to continue downloads if possible.
495         noprogress:       Do not print the progress bar.
496         playliststart:    Playlist item to start at.
497         playlistend:      Playlist item to end at.
498         matchtitle:       Download only matching titles.
499         rejecttitle:      Reject downloads for matching titles.
500         logtostderr:      Log messages to stderr instead of stdout.
501         consoletitle:     Display progress in console window's titlebar.
502         nopart:           Do not use temporary .part files.
503         updatetime:       Use the Last-modified header to set output file timestamps.
504         writedescription: Write the video description to a .description file
505         writeinfojson:    Write the video description to a .info.json file
506         writesubtitles:   Write the video subtitles to a .srt file
507         subtitleslang:    Language of the subtitles to download
508         """
509
	# Class-level defaults; each is overwritten per instance in __init__.
	params = None	# Option dictionary supplied by the caller
	_ies = []	# Registered InfoExtractor objects
	_pps = []	# Registered PostProcessor objects
	_download_retcode = None	# Return code reported for the download run
	_num_downloads = None	# Number of files downloaded so far
	_screen_file = None	# Stream used by to_screen() (stdout or stderr)
516
517         def __init__(self, params):
518                 """Create a FileDownloader object with the given options."""
519                 self._ies = []
520                 self._pps = []
521                 self._download_retcode = 0
522                 self._num_downloads = 0
523                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
524                 self.params = params
525
	@staticmethod
	def format_bytes(bytes):
		"""Format a byte count as a human-readable string, e.g. '1.50M'."""
		if bytes is None:
			return 'N/A'
		if type(bytes) is str:
			bytes = float(bytes)
		if bytes == 0.0:
			exponent = 0
		else:
			# Largest power of 1024 not exceeding `bytes`
			exponent = long(math.log(bytes, 1024.0))
		suffix = 'bkMGTPEZY'[exponent]
		converted = float(bytes) / float(1024 ** exponent)
		return '%.2f%s' % (converted, suffix)
539
540         @staticmethod
541         def calc_percent(byte_counter, data_len):
542                 if data_len is None:
543                         return '---.-%'
544                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
545
	@staticmethod
	def calc_eta(start, now, total, current):
		"""Estimate the remaining download time as an 'MM:SS' string."""
		if total is None:
			return '--:--'
		dif = now - start
		if current == 0 or dif < 0.001: # One millisecond
			return '--:--'
		# Average rate so far, extrapolated over the remaining bytes
		rate = float(current) / dif
		eta = long((float(total) - float(current)) / rate)
		(eta_mins, eta_secs) = divmod(eta, 60)
		if eta_mins > 99:
			# Cap the display rather than show a huge minute count
			return '--:--'
		return '%02d:%02d' % (eta_mins, eta_secs)
559
560         @staticmethod
561         def calc_speed(start, now, bytes):
562                 dif = now - start
563                 if bytes == 0 or dif < 0.001: # One millisecond
564                         return '%10s' % '---b/s'
565                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
566
	@staticmethod
	def best_block_size(elapsed_time, bytes):
		"""Pick the next read size based on the last block's throughput."""
		# Allow the block size to at most halve or double per step
		new_min = max(bytes / 2.0, 1.0)
		new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
		if elapsed_time < 0.001:
			return long(new_max)
		rate = bytes / elapsed_time
		if rate > new_max:
			return long(new_max)
		if rate < new_min:
			return long(new_min)
		return long(rate)
579
	@staticmethod
	def parse_bytes(bytestr):
		"""Parse a string indicating a byte quantity into a long integer."""
		# Accepts e.g. '50k', '4.2M'; the optional suffix selects a
		# power of 1024. Returns None if the string does not match.
		matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
		if matchobj is None:
			return None
		number = float(matchobj.group(1))
		# Empty suffix maps to 'b' (index 0), i.e. a multiplier of 1
		multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
		return long(round(number * multiplier))
589
590         def add_info_extractor(self, ie):
591                 """Add an InfoExtractor object to the end of the list."""
592                 self._ies.append(ie)
593                 ie.set_downloader(self)
594
595         def add_post_processor(self, pp):
596                 """Add a PostProcessor object to the end of the chain."""
597                 self._pps.append(pp)
598                 pp.set_downloader(self)
599
	def to_screen(self, message, skip_eol=False):
		"""Print message to stdout if not in quiet mode."""
		assert type(message) == type(u'')
		if not self.params.get('quiet', False):
			# skip_eol selects the empty terminator instead of a newline
			terminator = [u'\n', u''][skip_eol]
			output = message + terminator

			if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
				output = output.encode(preferredencoding(), 'ignore')
			self._screen_file.write(output)
			self._screen_file.flush()
611
612         def to_stderr(self, message):
613                 """Print message to stderr."""
614                 print >>sys.stderr, message.encode(preferredencoding())
615
	def to_cons_title(self, message):
		"""Set console/terminal window title to message."""
		if not self.params.get('consoletitle', False):
			return
		if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
			# c_wchar_p() might not be necessary if `message` is
			# already of type unicode()
			ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
		elif 'TERM' in os.environ:
			# xterm-compatible escape sequence to set the window title
			sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
626
627         def fixed_template(self):
628                 """Checks if the output template is fixed."""
629                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
630
631         def trouble(self, message=None):
632                 """Determine action to take when a download problem appears.
633
634                 Depending on if the downloader has been configured to ignore
635                 download errors or not, this method may throw an exception or
636                 not when errors are found, after printing the message.
637                 """
638                 if message is not None:
639                         self.to_stderr(message)
640                 if not self.params.get('ignoreerrors', False):
641                         raise DownloadError(message)
642                 self._download_retcode = 1
643
	def slow_down(self, start_time, byte_counter):
		"""Sleep if the download speed is over the rate limit."""
		rate_limit = self.params.get('ratelimit', None)
		if rate_limit is None or byte_counter == 0:
			return
		now = time.time()
		elapsed = now - start_time
		if elapsed <= 0.0:
			return
		# Average speed since the start of the download
		speed = float(byte_counter) / elapsed
		if speed > rate_limit:
			# Sleep long enough that the average speed falls back to
			# the configured limit.
			time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
656
657         def temp_name(self, filename):
658                 """Returns a temporary filename for the given filename."""
659                 if self.params.get('nopart', False) or filename == u'-' or \
660                                 (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
661                         return filename
662                 return filename + u'.part'
663
664         def undo_temp_name(self, filename):
665                 if filename.endswith(u'.part'):
666                         return filename[:-len(u'.part')]
667                 return filename
668
	def try_rename(self, old_filename, new_filename):
		"""Rename old_filename to new_filename, reporting failures via trouble()."""
		try:
			if old_filename == new_filename:
				return
			os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
		except (IOError, OSError), err:
			self.trouble(u'ERROR: unable to rename file')
676
677         def try_utime(self, filename, last_modified_hdr):
678                 """Try to set the last-modified time of the given file."""
679                 if last_modified_hdr is None:
680                         return
681                 if not os.path.isfile(_encodeFilename(filename)):
682                         return
683                 timestr = last_modified_hdr
684                 if timestr is None:
685                         return
686                 filetime = timeconvert(timestr)
687                 if filetime is None:
688                         return filetime
689                 try:
690                         os.utime(filename, (time.time(), filetime))
691                 except:
692                         pass
693                 return filetime
694
695         def report_writedescription(self, descfn):
696                 """ Report that the description file is being written """
697                 self.to_screen(u'[info] Writing video description to: ' + descfn)
698
699         def report_writesubtitles(self, srtfn):
700                 """ Report that the subtitles file is being written """
701                 self.to_screen(u'[info] Writing video subtitles to: ' + srtfn)
702
703         def report_writeinfojson(self, infofn):
704                 """ Report that the metadata file has been written """
705                 self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
706
707         def report_destination(self, filename):
708                 """Report destination filename."""
709                 self.to_screen(u'[download] Destination: ' + filename)
710
711         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
712                 """Report download progress."""
713                 if self.params.get('noprogress', False):
714                         return
715                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
716                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
717                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
718                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
719
720         def report_resuming_byte(self, resume_len):
721                 """Report attempt to resume at given byte."""
722                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
723
724         def report_retry(self, count, retries):
725                 """Report retry in case of HTTP error 5xx"""
726                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
727
728         def report_file_already_downloaded(self, file_name):
729                 """Report file has already been fully downloaded."""
730                 try:
731                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
732                 except (UnicodeEncodeError), err:
733                         self.to_screen(u'[download] The file has already been downloaded')
734
735         def report_unable_to_resume(self):
736                 """Report it was impossible to resume download."""
737                 self.to_screen(u'[download] Unable to resume')
738
739         def report_finish(self):
740                 """Report download finished."""
741                 if self.params.get('noprogress', False):
742                         self.to_screen(u'[download] Download completed')
743                 else:
744                         self.to_screen(u'')
745
746         def increment_downloads(self):
747                 """Increment the ordinal that assigns a number to each file."""
748                 self._num_downloads += 1
749
750         def prepare_filename(self, info_dict):
751                 """Generate the output filename."""
752                 try:
753                         template_dict = dict(info_dict)
754                         template_dict['epoch'] = unicode(long(time.time()))
755                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
756                         filename = self.params['outtmpl'] % template_dict
757                         return filename
758                 except (ValueError, KeyError), err:
759                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
760                         return None
761
762         def _match_entry(self, info_dict):
763                 """ Returns None iff the file should be downloaded """
764
765                 title = info_dict['title']
766                 matchtitle = self.params.get('matchtitle', False)
767                 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
768                         return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
769                 rejecttitle = self.params.get('rejecttitle', False)
770                 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
771                         return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
772                 return None
773
774         def process_info(self, info_dict):
775                 """Process a single dictionary returned by an InfoExtractor."""
776
777                 reason = self._match_entry(info_dict)
778                 if reason is not None:
779                         self.to_screen(u'[download] ' + reason)
780                         return
781
782                 max_downloads = self.params.get('max_downloads')
783                 if max_downloads is not None:
784                         if self._num_downloads > int(max_downloads):
785                                 raise MaxDownloadsReached()
786
787                 filename = self.prepare_filename(info_dict)
788                 
789                 # Forced printings
790                 if self.params.get('forcetitle', False):
791                         print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
792                 if self.params.get('forceurl', False):
793                         print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
794                 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
795                         print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
796                 if self.params.get('forcedescription', False) and 'description' in info_dict:
797                         print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
798                 if self.params.get('forcefilename', False) and filename is not None:
799                         print filename.encode(preferredencoding(), 'xmlcharrefreplace')
800                 if self.params.get('forceformat', False):
801                         print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
802
803                 # Do nothing else if in simulate mode
804                 if self.params.get('simulate', False):
805                         return
806
807                 if filename is None:
808                         return
809
810                 try:
811                         dn = os.path.dirname(_encodeFilename(filename))
812                         if dn != '' and not os.path.exists(dn): # dn is already encoded
813                                 os.makedirs(dn)
814                 except (OSError, IOError), err:
815                         self.trouble(u'ERROR: unable to create directory ' + unicode(err))
816                         return
817
818                 if self.params.get('writedescription', False):
819                         try:
820                                 descfn = filename + u'.description'
821                                 self.report_writedescription(descfn)
822                                 descfile = open(_encodeFilename(descfn), 'wb')
823                                 try:
824                                         descfile.write(info_dict['description'].encode('utf-8'))
825                                 finally:
826                                         descfile.close()
827                         except (OSError, IOError):
828                                 self.trouble(u'ERROR: Cannot write description file ' + descfn)
829                                 return
830                                 
831                 if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
832                         # subtitles download errors are already managed as troubles in relevant IE
833                         # that way it will silently go on when used with unsupporting IE 
834                         try:
835                                 srtfn = filename.rsplit('.', 1)[0] + u'.srt'
836                                 self.report_writesubtitles(srtfn)
837                                 srtfile = open(_encodeFilename(srtfn), 'wb')
838                                 try:
839                                         srtfile.write(info_dict['subtitles'].encode('utf-8'))
840                                 finally:
841                                         srtfile.close()
842                         except (OSError, IOError):
843                                 self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)
844                                 return
845
846                 if self.params.get('writeinfojson', False):
847                         infofn = filename + u'.info.json'
848                         self.report_writeinfojson(infofn)
849                         try:
850                                 json.dump
851                         except (NameError,AttributeError):
852                                 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
853                                 return
854                         try:
855                                 infof = open(_encodeFilename(infofn), 'wb')
856                                 try:
857                                         json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
858                                         json.dump(json_info_dict, infof)
859                                 finally:
860                                         infof.close()
861                         except (OSError, IOError):
862                                 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
863                                 return
864
865                 if not self.params.get('skip_download', False):
866                         if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
867                                 success = True
868                         else:
869                                 try:
870                                         success = self._do_download(filename, info_dict)
871                                 except (OSError, IOError), err:
872                                         raise UnavailableVideoError
873                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
874                                         self.trouble(u'ERROR: unable to download video data: %s' % str(err))
875                                         return
876                                 except (ContentTooShortError, ), err:
877                                         self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
878                                         return
879         
880                         if success:
881                                 try:
882                                         self.post_process(filename, info_dict)
883                                 except (PostProcessingError), err:
884                                         self.trouble(u'ERROR: postprocessing: %s' % str(err))
885                                         return
886
887         def download(self, url_list):
888                 """Download a given list of URLs."""
889                 if len(url_list) > 1 and self.fixed_template():
890                         raise SameFileError(self.params['outtmpl'])
891
892                 for url in url_list:
893                         suitable_found = False
894                         for ie in self._ies:
895                                 # Go to next InfoExtractor if not suitable
896                                 if not ie.suitable(url):
897                                         continue
898
899                                 # Suitable InfoExtractor found
900                                 suitable_found = True
901
902                                 # Extract information from URL and process it
903                                 ie.extract(url)
904
905                                 # Suitable InfoExtractor had been found; go to next URL
906                                 break
907
908                         if not suitable_found:
909                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
910
911                 return self._download_retcode
912
913         def post_process(self, filename, ie_info):
914                 """Run the postprocessing chain on the given file."""
915                 info = dict(ie_info)
916                 info['filepath'] = filename
917                 for pp in self._pps:
918                         info = pp.run(info)
919                         if info is None:
920                                 break
921
	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an RTMP stream to *filename* by shelling out to rtmpdump.

		Downloads into a temporary '.part' name and renames on success.
		Returns True on success, False when rtmpdump is missing or exits
		with an unrecoverable error code.
		"""
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first (running '-h' just probes that the binary exists)
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrupted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		# The [[], [...]][cond] idiom appends the optional arguments only
		# when the condition holds.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
		if self.params.get('verbose', False):
			try:
				import pipes
				shell_quote = lambda args: ' '.join(map(pipes.quote, args))
			except ImportError:
				# pipes is unavailable: fall back to a repr of the list.
				shell_quote = repr
			self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
		retval = subprocess.call(args)
		# Keep re-invoking rtmpdump with resume (-e) while it reports an
		# interrupted-but-resumable transfer (2) or a plain error (1).
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(_encodeFilename(tmpfilename))
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(_encodeFilename(tmpfilename))
			if prevsize == cursize and retval == 1:
				# No progress was made and rtmpdump failed again: give up.
				break
			 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
			if prevsize == cursize and retval == 2 and cursize > 1024:
				self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
				retval = 0
				break
		if retval == 0:
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
			self.try_rename(tmpfilename, filename)
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False
966
967         def _do_download(self, filename, info_dict):
968                 url = info_dict['url']
969                 player_url = info_dict.get('player_url', None)
970
971                 # Check file already present
972                 if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
973                         self.report_file_already_downloaded(filename)
974                         return True
975
976                 # Attempt to download using rtmpdump
977                 if url.startswith('rtmp'):
978                         return self._download_with_rtmpdump(filename, url, player_url)
979
980                 tmpfilename = self.temp_name(filename)
981                 stream = None
982
983                 # Do not include the Accept-Encoding header
984                 headers = {'Youtubedl-no-compression': 'True'}
985                 basic_request = urllib2.Request(url, None, headers)
986                 request = urllib2.Request(url, None, headers)
987
988                 # Establish possible resume length
989                 if os.path.isfile(_encodeFilename(tmpfilename)):
990                         resume_len = os.path.getsize(_encodeFilename(tmpfilename))
991                 else:
992                         resume_len = 0
993
994                 open_mode = 'wb'
995                 if resume_len != 0:
996                         if self.params.get('continuedl', False):
997                                 self.report_resuming_byte(resume_len)
998                                 request.add_header('Range','bytes=%d-' % resume_len)
999                                 open_mode = 'ab'
1000                         else:
1001                                 resume_len = 0
1002
1003                 count = 0
1004                 retries = self.params.get('retries', 0)
1005                 while count <= retries:
1006                         # Establish connection
1007                         try:
1008                                 if count == 0 and 'urlhandle' in info_dict:
1009                                         data = info_dict['urlhandle']
1010                                 data = urllib2.urlopen(request)
1011                                 break
1012                         except (urllib2.HTTPError, ), err:
1013                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
1014                                         # Unexpected HTTP error
1015                                         raise
1016                                 elif err.code == 416:
1017                                         # Unable to resume (requested range not satisfiable)
1018                                         try:
1019                                                 # Open the connection again without the range header
1020                                                 data = urllib2.urlopen(basic_request)
1021                                                 content_length = data.info()['Content-Length']
1022                                         except (urllib2.HTTPError, ), err:
1023                                                 if err.code < 500 or err.code >= 600:
1024                                                         raise
1025                                         else:
1026                                                 # Examine the reported length
1027                                                 if (content_length is not None and
1028                                                                 (resume_len - 100 < long(content_length) < resume_len + 100)):
1029                                                         # The file had already been fully downloaded.
1030                                                         # Explanation to the above condition: in issue #175 it was revealed that
1031                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
1032                                                         # changing the file size slightly and causing problems for some users. So
1033                                                         # I decided to implement a suggested change and consider the file
1034                                                         # completely downloaded if the file size differs less than 100 bytes from
1035                                                         # the one in the hard drive.
1036                                                         self.report_file_already_downloaded(filename)
1037                                                         self.try_rename(tmpfilename, filename)
1038                                                         return True
1039                                                 else:
1040                                                         # The length does not match, we start the download over
1041                                                         self.report_unable_to_resume()
1042                                                         open_mode = 'wb'
1043                                                         break
1044                         # Retry
1045                         count += 1
1046                         if count <= retries:
1047                                 self.report_retry(count, retries)
1048
1049                 if count > retries:
1050                         self.trouble(u'ERROR: giving up after %s retries' % retries)
1051                         return False
1052
1053                 data_len = data.info().get('Content-length', None)
1054                 if data_len is not None:
1055                         data_len = long(data_len) + resume_len
1056                 data_len_str = self.format_bytes(data_len)
1057                 byte_counter = 0 + resume_len
1058                 block_size = 1024
1059                 start = time.time()
1060                 while True:
1061                         # Download and write
1062                         before = time.time()
1063                         data_block = data.read(block_size)
1064                         after = time.time()
1065                         if len(data_block) == 0:
1066                                 break
1067                         byte_counter += len(data_block)
1068
1069                         # Open file just in time
1070                         if stream is None:
1071                                 try:
1072                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1073                                         assert stream is not None
1074                                         filename = self.undo_temp_name(tmpfilename)
1075                                         self.report_destination(filename)
1076                                 except (OSError, IOError), err:
1077                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1078                                         return False
1079                         try:
1080                                 stream.write(data_block)
1081                         except (IOError, OSError), err:
1082                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1083                                 return False
1084                         block_size = self.best_block_size(after - before, len(data_block))
1085
1086                         # Progress message
1087                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1088                         if data_len is None:
1089                                 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1090                         else:
1091                                 percent_str = self.calc_percent(byte_counter, data_len)
1092                                 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1093                                 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1094
1095                         # Apply rate limit
1096                         self.slow_down(start, byte_counter - resume_len)
1097
1098                 if stream is None:
1099                         self.trouble(u'\nERROR: Did not get any data blocks')
1100                         return False
1101                 stream.close()
1102                 self.report_finish()
1103                 if data_len is not None and byte_counter != data_len:
1104                         raise ContentTooShortError(byte_counter, long(data_len))
1105                 self.try_rename(tmpfilename, filename)
1106
1107                 # Update file modification time
1108                 if self.params.get('updatetime', True):
1109                         info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1110
1111                 return True
1112
1113
class InfoExtractor(object):
	"""Base class for all site-specific information extractors.

	An information extractor takes a URL and produces, for each video
	the URL refers to, a dictionary of metadata (real media URL, title,
	etc.) which is handed to the FileDownloader. The FileDownloader then
	acts on that information, e.g. by downloading the video. Every such
	dictionary must carry the following fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	The next fields are optional; they exist mainly so youtube-dl can
	serve as the backend of video search tools (such as youtube2mp3) and
	are only consulted by the corresponding forced-printing options:

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Concrete extractors override _real_initialize() and _real_extract()
	and define a _VALID_URL regular expression; they should normally
	also be registered in the global list of extractors.
	"""

	_ready = False          # True once _real_initialize() has been run
	_downloader = None      # FileDownloader this IE reports to (may be None)

	def __init__(self, downloader=None):
		"""Create the extractor, optionally attaching a downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Return True if this IE can handle the given URL."""
		return re.match(self._VALID_URL, url) is not None

	def initialize(self):
		"""Run the one-time setup (authentication, etc.) at most once."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Initialize if needed, then extract and return the URL's info dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the FileDownloader used for reporting and downloading."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1182
1183
1184 class YoutubeIE(InfoExtractor):
1185         """Information extractor for youtube.com."""
1186
1187         _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1188         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1189         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1190         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1191         _NETRC_MACHINE = 'youtube'
1192         # Listed in order of quality
1193         _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1194         _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
1195         _video_extensions = {
1196                 '13': '3gp',
1197                 '17': 'mp4',
1198                 '18': 'mp4',
1199                 '22': 'mp4',
1200                 '37': 'mp4',
1201                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1202                 '43': 'webm',
1203                 '44': 'webm',
1204                 '45': 'webm',
1205         }
1206         _video_dimensions = {
1207                 '5': '240x400',
1208                 '6': '???',
1209                 '13': '???',
1210                 '17': '144x176',
1211                 '18': '360x640',
1212                 '22': '720x1280',
1213                 '34': '360x640',
1214                 '35': '480x854',
1215                 '37': '1080x1920',
1216                 '38': '3072x4096',
1217                 '43': '360x640',
1218                 '44': '480x854',
1219                 '45': '720x1280',
1220         }       
1221         IE_NAME = u'youtube'
1222
1223         def report_lang(self):
1224                 """Report attempt to set language."""
1225                 self._downloader.to_screen(u'[youtube] Setting language')
1226
1227         def report_login(self):
1228                 """Report attempt to log in."""
1229                 self._downloader.to_screen(u'[youtube] Logging in')
1230
1231         def report_age_confirmation(self):
1232                 """Report attempt to confirm age."""
1233                 self._downloader.to_screen(u'[youtube] Confirming age')
1234
1235         def report_video_webpage_download(self, video_id):
1236                 """Report attempt to download video webpage."""
1237                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1238
1239         def report_video_info_webpage_download(self, video_id):
1240                 """Report attempt to download video info webpage."""
1241                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1242
1243         def report_video_subtitles_download(self, video_id):
1244                 """Report attempt to download video info webpage."""
1245                 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
1246
1247         def report_information_extraction(self, video_id):
1248                 """Report attempt to extract video information."""
1249                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1250
1251         def report_unavailable_format(self, video_id, format):
1252                 """Report extracted video URL."""
1253                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1254
1255         def report_rtmp_download(self):
1256                 """Indicate the download will use the RTMP protocol."""
1257                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1258
1259         def _closed_captions_xml_to_srt(self, xml_string):
1260                 srt = ''
1261                 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
1262                 # TODO parse xml instead of regex
1263                 for n, (start, dur_tag, dur, caption) in enumerate(texts):
1264                         if not dur: dur = '4'
1265                         start = float(start)
1266                         end = start + float(dur)
1267                         start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
1268                         end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
1269                         caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
1270                         caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, inentional
1271                         srt += str(n) + '\n'
1272                         srt += start + ' --> ' + end + '\n'
1273                         srt += caption + '\n\n'
1274                 return srt
1275
1276         def _print_formats(self, formats):
1277                 print 'Available formats:'
1278                 for x in formats:
1279                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1280
	def _real_initialize(self):
		"""Set the interface language and, if credentials are available,
		log in and confirm age.  Order matters: language is set first so
		later pages are served in English, then login, then the age gate.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				# .netrc problems are non-fatal: warn and continue anonymously
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':         '/',
				'action_login': 'Log In',
				'username':     username,
				'password':     password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present, the credentials were rejected
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':             '/',
				'action_confirm':       'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
1349
	def _real_extract(self, url):
		"""Extract metadata and media URL(s) for the video at *url* and
		hand each selected format to the FileDownloader.

		Tries several get_video_info endpoints until one yields a
		'token', then resolves title, uploader, thumbnail, date,
		description, optional subtitles and the requested format(s).
		"""
		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
		try:
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Attempt to extract SWF player URL
		mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			# Unescape the JavaScript string (\/ becomes /)
			player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
		else:
			player_url = None

		# Get video info
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					% (video_id, el_type))
			request = urllib2.Request(video_info_url)
			try:
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
					break
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
		if 'token' not in video_info:
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
			else:
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# simplified title
		simple_title = _simplify_title(video_title)

		# thumbnail image
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# upload date
		upload_date = u'NA'
		mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
		if mobj is not None:
			upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
			format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
			for expression in format_expressions:
				try:
					upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
				except:
					# NOTE(review): bare except keeps trying the other
					# formats; if none match, upload_date stays unparsed
					pass

		# description
		try:
			lxml.etree
		except NameError:
			# lxml is not installed: fall back to a crude regex
			video_description = u'No description available.'
			mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
			if mobj is not None:
				video_description = mobj.group(1).decode('utf-8')
		else:
			html_parser = lxml.etree.HTMLParser(encoding='utf-8')
			vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
			video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
			# TODO use another parser

		# closed captions
		video_subtitles = None
		if self._downloader.params.get('writesubtitles', False):
			self.report_video_subtitles_download(video_id)
			request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
			try:
				srt_list = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
			else:
				srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
				if srt_lang_list:
					# Prefer the requested language, then English,
					# then whatever is listed first
					if self._downloader.params.get('subtitleslang', False):
						srt_lang = self._downloader.params.get('subtitleslang')
					elif 'en' in srt_lang_list:
						srt_lang = 'en'
					else:
						srt_lang = srt_lang_list[0]
					if not srt_lang in srt_lang_list:
						self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
					else:
						request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
						try:
							srt_xml = urllib2.urlopen(request).read()
						except (urllib2.URLError, httplib.HTTPException, socket.error), err:
							self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
						else:
							video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
				else:
					self._downloader.trouble(u'WARNING: video has no closed captions')

		# token
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		req_format = self._downloader.params.get('format', None)

		if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]
		elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
			url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
			url_data = [parse_qs(uds) for uds in url_data_strs]
			url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
			url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

			format_limit = self._downloader.params.get('format_limit', None)
			available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
			if format_limit is not None and format_limit in available_formats:
				format_list = available_formats[available_formats.index(format_limit):]
			else:
				format_list = available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if self._downloader.params.get('listformats', None):
				self._print_formats(existing_formats)
				return
			if req_format is None or req_format == 'best':
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == 'worst':
				video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
			elif req_format in ('-1', 'all'):
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			else:
				# Specific formats. We pick the first in a slash-delimeted sequence.
				# For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
				req_formats = req_format.split('/')
				video_url_list = None
				for rf in req_formats:
					if rf in url_map:
						video_url_list = [(rf, url_map[rf])]
						break
				if video_url_list is None:
					self._downloader.trouble(u'ERROR: requested format not available')
					return
		else:
			self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
			return

		for format_param, video_real_url in video_url_list:
			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			try:
				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'upload_date':	upload_date,
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
					'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail':	video_thumbnail.decode('utf-8'),
					'description':	video_description,
					'player_url':	player_url,
					'subtitles':	video_subtitles
				})
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
1554
1555
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	_youtube_ie = None
	IE_NAME = u'metacafe'

	def __init__(self, youtube_ie, downloader=None):
		"""Keep a YoutubeIE around: metacafe also embeds yt- videos."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""Fetch the disclaimer page and post past the family filter."""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER)
		try:
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
			return

		# Confirm age
		disclaimer_form = {
			'filters': '0',
			'submit': "Continue - I'm over 18",
			}
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
		try:
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract the media URL and metadata and trigger the download."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Check if video comes from YouTube
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			# Delegate embedded YouTube videos to the YouTube extractor
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
			return

		# At this point we have a new video
		self._downloader.increment_downloads()

		simple_title = mobj.group(2).decode('utf-8')

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			mediaURL = urllib.unquote(mobj.group(1))
			video_extension = mediaURL[-3:]

			# Extract gdaKey if available
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
			if mobj is None:
				video_url = mediaURL
			else:
				gdaKey = mobj.group(1)
				video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
		else:
			# No direct mediaURL: fall back to the flashvars blob
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mediaURL = mobj.group(1).replace('\\/', '/')
			video_extension = mediaURL[-3:]
			video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1696
1697
class DailymotionIE(InfoExtractor):
	"""Information Extractor for Dailymotion"""

	_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
	IE_NAME = u'dailymotion'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		"""Extract the video URL and metadata and trigger the download."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		# Disable the family filter so age-restricted videos are reachable
		request.add_header('Cookie', 'family_filter=off')
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		sequence = urllib.unquote(mobj.group(1))
		mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

		# if needed add http://www.dailymotion.com/ if relative URL

		video_url = mediaURL

		mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
		video_title = sanitize_title(video_title)
		simple_title = _simplify_title(video_title)

		mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1784
1785
1786 class GoogleIE(InfoExtractor):
1787         """Information extractor for video.google.com."""
1788
1789         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1790         IE_NAME = u'video.google'
1791
	def __init__(self, downloader=None):
		# Plain delegation to the InfoExtractor base class
		InfoExtractor.__init__(self, downloader)
1794
1795         def report_download_webpage(self, video_id):
1796                 """Report webpage download."""
1797                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1798
1799         def report_extraction(self, video_id):
1800                 """Report information extraction."""
1801                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1802
1803         def _real_extract(self, url):
1804                 # Extract id from URL
1805                 mobj = re.match(self._VALID_URL, url)
1806                 if mobj is None:
1807                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1808                         return
1809
1810                 # At this point we have a new video
1811                 self._downloader.increment_downloads()
1812                 video_id = mobj.group(1)
1813
1814                 video_extension = 'mp4'
1815
1816                 # Retrieve video webpage to extract further information
1817                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1818                 try:
1819                         self.report_download_webpage(video_id)
1820                         webpage = urllib2.urlopen(request).read()
1821                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1822                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1823                         return
1824
1825                 # Extract URL, uploader, and title from webpage
1826                 self.report_extraction(video_id)
1827                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1828                 if mobj is None:
1829                         video_extension = 'flv'
1830                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1831                 if mobj is None:
1832                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1833                         return
1834                 mediaURL = urllib.unquote(mobj.group(1))
1835                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1836                 mediaURL = mediaURL.replace('\\x26', '\x26')
1837
1838                 video_url = mediaURL
1839
1840                 mobj = re.search(r'<title>(.*)</title>', webpage)
1841                 if mobj is None:
1842                         self._downloader.trouble(u'ERROR: unable to extract title')
1843                         return
1844                 video_title = mobj.group(1).decode('utf-8')
1845                 video_title = sanitize_title(video_title)
1846                 simple_title = _simplify_title(video_title)
1847
1848                 # Extract video description
1849                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1850                 if mobj is None:
1851                         self._downloader.trouble(u'ERROR: unable to extract video description')
1852                         return
1853                 video_description = mobj.group(1).decode('utf-8')
1854                 if not video_description:
1855                         video_description = 'No description available.'
1856
1857                 # Extract video thumbnail
1858                 if self._downloader.params.get('forcethumbnail', False):
1859                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1860                         try:
1861                                 webpage = urllib2.urlopen(request).read()
1862                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1863                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1864                                 return
1865                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1866                         if mobj is None:
1867                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1868                                 return
1869                         video_thumbnail = mobj.group(1)
1870                 else:   # we need something to pass to process_info
1871                         video_thumbnail = ''
1872
1873                 try:
1874                         # Process video information
1875                         self._downloader.process_info({
1876                                 'id':           video_id.decode('utf-8'),
1877                                 'url':          video_url.decode('utf-8'),
1878                                 'uploader':     u'NA',
1879                                 'upload_date':  u'NA',
1880                                 'title':        video_title,
1881                                 'stitle':       simple_title,
1882                                 'ext':          video_extension.decode('utf-8'),
1883                                 'format':       u'NA',
1884                                 'player_url':   None,
1885                         })
1886                 except UnavailableVideoError:
1887                         self._downloader.trouble(u'\nERROR: unable to download video')
1888
1889
1890 class PhotobucketIE(InfoExtractor):
1891         """Information extractor for photobucket.com."""
1892
1893         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1894         IE_NAME = u'photobucket'
1895
1896         def __init__(self, downloader=None):
1897                 InfoExtractor.__init__(self, downloader)
1898
1899         def report_download_webpage(self, video_id):
1900                 """Report webpage download."""
1901                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1902
1903         def report_extraction(self, video_id):
1904                 """Report information extraction."""
1905                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1906
1907         def _real_extract(self, url):
1908                 # Extract id from URL
1909                 mobj = re.match(self._VALID_URL, url)
1910                 if mobj is None:
1911                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1912                         return
1913
1914                 # At this point we have a new video
1915                 self._downloader.increment_downloads()
1916                 video_id = mobj.group(1)
1917
1918                 video_extension = 'flv'
1919
1920                 # Retrieve video webpage to extract further information
1921                 request = urllib2.Request(url)
1922                 try:
1923                         self.report_download_webpage(video_id)
1924                         webpage = urllib2.urlopen(request).read()
1925                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1926                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1927                         return
1928
1929                 # Extract URL, uploader, and title from webpage
1930                 self.report_extraction(video_id)
1931                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1932                 if mobj is None:
1933                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1934                         return
1935                 mediaURL = urllib.unquote(mobj.group(1))
1936
1937                 video_url = mediaURL
1938
1939                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1940                 if mobj is None:
1941                         self._downloader.trouble(u'ERROR: unable to extract title')
1942                         return
1943                 video_title = mobj.group(1).decode('utf-8')
1944                 video_title = sanitize_title(video_title)
1945                 simple_title = _simplify_title(vide_title)
1946
1947                 video_uploader = mobj.group(2).decode('utf-8')
1948
1949                 try:
1950                         # Process video information
1951                         self._downloader.process_info({
1952                                 'id':           video_id.decode('utf-8'),
1953                                 'url':          video_url.decode('utf-8'),
1954                                 'uploader':     video_uploader,
1955                                 'upload_date':  u'NA',
1956                                 'title':        video_title,
1957                                 'stitle':       simple_title,
1958                                 'ext':          video_extension.decode('utf-8'),
1959                                 'format':       u'NA',
1960                                 'player_url':   None,
1961                         })
1962                 except UnavailableVideoError:
1963                         self._downloader.trouble(u'\nERROR: unable to download video')
1964
1965
1966 class YahooIE(InfoExtractor):
1967         """Information extractor for video.yahoo.com."""
1968
1969         # _VALID_URL matches all Yahoo! Video URLs
1970         # _VPAGE_URL matches only the extractable '/watch/' URLs
1971         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1972         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1973         IE_NAME = u'video.yahoo'
1974
1975         def __init__(self, downloader=None):
1976                 InfoExtractor.__init__(self, downloader)
1977
1978         def report_download_webpage(self, video_id):
1979                 """Report webpage download."""
1980                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1981
1982         def report_extraction(self, video_id):
1983                 """Report information extraction."""
1984                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1985
1986         def _real_extract(self, url, new_video=True):
1987                 # Extract ID from URL
1988                 mobj = re.match(self._VALID_URL, url)
1989                 if mobj is None:
1990                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1991                         return
1992
1993                 # At this point we have a new video
1994                 self._downloader.increment_downloads()
1995                 video_id = mobj.group(2)
1996                 video_extension = 'flv'
1997
1998                 # Rewrite valid but non-extractable URLs as
1999                 # extractable English language /watch/ URLs
2000                 if re.match(self._VPAGE_URL, url) is None:
2001                         request = urllib2.Request(url)
2002                         try:
2003                                 webpage = urllib2.urlopen(request).read()
2004                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2005                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2006                                 return
2007
2008                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
2009                         if mobj is None:
2010                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
2011                                 return
2012                         yahoo_id = mobj.group(1)
2013
2014                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
2015                         if mobj is None:
2016                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
2017                                 return
2018                         yahoo_vid = mobj.group(1)
2019
2020                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
2021                         return self._real_extract(url, new_video=False)
2022
2023                 # Retrieve video webpage to extract further information
2024                 request = urllib2.Request(url)
2025                 try:
2026                         self.report_download_webpage(video_id)
2027                         webpage = urllib2.urlopen(request).read()
2028                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2029                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2030                         return
2031
2032                 # Extract uploader and title from webpage
2033                 self.report_extraction(video_id)
2034                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
2035                 if mobj is None:
2036                         self._downloader.trouble(u'ERROR: unable to extract video title')
2037                         return
2038                 video_title = mobj.group(1).decode('utf-8')
2039                 simple_title = _simplify_title(video_title)
2040
2041                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
2042                 if mobj is None:
2043                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
2044                         return
2045                 video_uploader = mobj.group(1).decode('utf-8')
2046
2047                 # Extract video thumbnail
2048                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
2049                 if mobj is None:
2050                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2051                         return
2052                 video_thumbnail = mobj.group(1).decode('utf-8')
2053
2054                 # Extract video description
2055                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
2056                 if mobj is None:
2057                         self._downloader.trouble(u'ERROR: unable to extract video description')
2058                         return
2059                 video_description = mobj.group(1).decode('utf-8')
2060                 if not video_description:
2061                         video_description = 'No description available.'
2062
2063                 # Extract video height and width
2064                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
2065                 if mobj is None:
2066                         self._downloader.trouble(u'ERROR: unable to extract video height')
2067                         return
2068                 yv_video_height = mobj.group(1)
2069
2070                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
2071                 if mobj is None:
2072                         self._downloader.trouble(u'ERROR: unable to extract video width')
2073                         return
2074                 yv_video_width = mobj.group(1)
2075
2076                 # Retrieve video playlist to extract media URL
2077                 # I'm not completely sure what all these options are, but we
2078                 # seem to need most of them, otherwise the server sends a 401.
2079                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
2080                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
2081                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
2082                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
2083                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
2084                 try:
2085                         self.report_download_webpage(video_id)
2086                         webpage = urllib2.urlopen(request).read()
2087                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2088                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2089                         return
2090
2091                 # Extract media URL from playlist XML
2092                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2093                 if mobj is None:
2094                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
2095                         return
2096                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2097                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2098
2099                 try:
2100                         # Process video information
2101                         self._downloader.process_info({
2102                                 'id':           video_id.decode('utf-8'),
2103                                 'url':          video_url,
2104                                 'uploader':     video_uploader,
2105                                 'upload_date':  u'NA',
2106                                 'title':        video_title,
2107                                 'stitle':       simple_title,
2108                                 'ext':          video_extension.decode('utf-8'),
2109                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2110                                 'description':  video_description,
2111                                 'thumbnail':    video_thumbnail,
2112                                 'player_url':   None,
2113                         })
2114                 except UnavailableVideoError:
2115                         self._downloader.trouble(u'\nERROR: unable to download video')
2116
2117
2118 class VimeoIE(InfoExtractor):
2119         """Information extractor for vimeo.com."""
2120
2121         # _VALID_URL matches Vimeo URLs
2122         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2123         IE_NAME = u'vimeo'
2124
2125         def __init__(self, downloader=None):
2126                 InfoExtractor.__init__(self, downloader)
2127
2128         def report_download_webpage(self, video_id):
2129                 """Report webpage download."""
2130                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2131
2132         def report_extraction(self, video_id):
2133                 """Report information extraction."""
2134                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2135
2136         def _real_extract(self, url, new_video=True):
2137                 # Extract ID from URL
2138                 mobj = re.match(self._VALID_URL, url)
2139                 if mobj is None:
2140                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2141                         return
2142
2143                 # At this point we have a new video
2144                 self._downloader.increment_downloads()
2145                 video_id = mobj.group(1)
2146
2147                 # Retrieve video webpage to extract further information
2148                 request = urllib2.Request(url, None, std_headers)
2149                 try:
2150                         self.report_download_webpage(video_id)
2151                         webpage = urllib2.urlopen(request).read()
2152                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2153                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2154                         return
2155
2156                 # Now we begin extracting as much information as we can from what we
2157                 # retrieved. First we extract the information common to all extractors,
2158                 # and latter we extract those that are Vimeo specific.
2159                 self.report_extraction(video_id)
2160
2161                 # Extract the config JSON
2162                 config = webpage.split(' = {config:')[1].split(',assets:')[0]
2163                 try:
2164                         config = json.loads(config)
2165                 except:
2166                         self._downloader.trouble(u'ERROR: unable to extract info section')
2167                         return
2168                 
2169                 # Extract title
2170                 video_title = config["video"]["title"]
2171                 simple_title = _simplify_title(video_title)
2172
2173                 # Extract uploader
2174                 video_uploader = config["video"]["owner"]["name"]
2175
2176                 # Extract video thumbnail
2177                 video_thumbnail = config["video"]["thumbnail"]
2178
2179                 # Extract video description
2180                 try:
2181                         lxml.etree
2182                 except NameError:
2183                         video_description = u'No description available.'
2184                         mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
2185                         if mobj is not None:
2186                                 video_description = mobj.group(1)
2187                 else:
2188                         html_parser = lxml.etree.HTMLParser()
2189                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
2190                         video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
2191                         # TODO use another parser
2192
2193                 # Extract upload date
2194                 video_upload_date = u'NA'
2195                 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
2196                 if mobj is not None:
2197                         video_upload_date = mobj.group(1)
2198
2199                 # Vimeo specific: extract request signature and timestamp
2200                 sig = config['request']['signature']
2201                 timestamp = config['request']['timestamp']
2202
2203                 # Vimeo specific: extract video codec and quality information
2204                 # TODO bind to format param
2205                 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
2206                 for codec in codecs:
2207                         if codec[0] in config["video"]["files"]:
2208                                 video_codec = codec[0]
2209                                 video_extension = codec[1]
2210                                 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
2211                                 else: quality = 'sd'
2212                                 break
2213                 else:
2214                         self._downloader.trouble(u'ERROR: no known codec found')
2215                         return
2216
2217                 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
2218                                         %(video_id, sig, timestamp, quality, video_codec.upper())
2219
2220                 try:
2221                         # Process video information
2222                         self._downloader.process_info({
2223                                 'id':           video_id,
2224                                 'url':          video_url,
2225                                 'uploader':     video_uploader,
2226                                 'upload_date':  video_upload_date,
2227                                 'title':        video_title,
2228                                 'stitle':       simple_title,
2229                                 'ext':          video_extension,
2230                                 'thumbnail':    video_thumbnail,
2231                                 'description':  video_description,
2232                                 'player_url':   None,
2233                         })
2234                 except UnavailableVideoError:
2235                         self._downloader.trouble(u'ERROR: unable to download video')
2236
2237
2238 class GenericIE(InfoExtractor):
2239         """Generic last-resort information extractor."""
2240
2241         _VALID_URL = r'.*'
2242         IE_NAME = u'generic'
2243
2244         def __init__(self, downloader=None):
2245                 InfoExtractor.__init__(self, downloader)
2246
2247         def report_download_webpage(self, video_id):
2248                 """Report webpage download."""
2249                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2250                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2251
2252         def report_extraction(self, video_id):
2253                 """Report information extraction."""
2254                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2255
2256         def _real_extract(self, url):
2257                 # At this point we have a new video
2258                 self._downloader.increment_downloads()
2259
2260                 video_id = url.split('/')[-1]
2261                 request = urllib2.Request(url)
2262                 try:
2263                         self.report_download_webpage(video_id)
2264                         webpage = urllib2.urlopen(request).read()
2265                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2266                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2267                         return
2268                 except ValueError, err:
2269                         # since this is the last-resort InfoExtractor, if
2270                         # this error is thrown, it'll be thrown here
2271                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2272                         return
2273
2274                 self.report_extraction(video_id)
2275                 # Start with something easy: JW Player in SWFObject
2276                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2277                 if mobj is None:
2278                         # Broaden the search a little bit
2279                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2280                 if mobj is None:
2281                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2282                         return
2283
2284                 # It's possible that one of the regexes
2285                 # matched, but returned an empty group:
2286                 if mobj.group(1) is None:
2287                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2288                         return
2289
2290                 video_url = urllib.unquote(mobj.group(1))
2291                 video_id = os.path.basename(video_url)
2292
2293                 # here's a fun little line of code for you:
2294                 video_extension = os.path.splitext(video_id)[1][1:]
2295                 video_id = os.path.splitext(video_id)[0]
2296
2297                 # it's tempting to parse this further, but you would
2298                 # have to take into account all the variations like
2299                 #   Video Title - Site Name
2300                 #   Site Name | Video Title
2301                 #   Video Title - Tagline | Site Name
2302                 # and so on and so forth; it's just not practical
2303                 mobj = re.search(r'<title>(.*)</title>', webpage)
2304                 if mobj is None:
2305                         self._downloader.trouble(u'ERROR: unable to extract title')
2306                         return
2307                 video_title = mobj.group(1).decode('utf-8')
2308                 video_title = sanitize_title(video_title)
2309                 simple_title = _simplify_title(video_title)
2310
2311                 # video uploader is domain name
2312                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2313                 if mobj is None:
2314                         self._downloader.trouble(u'ERROR: unable to extract title')
2315                         return
2316                 video_uploader = mobj.group(1).decode('utf-8')
2317
2318                 try:
2319                         # Process video information
2320                         self._downloader.process_info({
2321                                 'id':           video_id.decode('utf-8'),
2322                                 'url':          video_url.decode('utf-8'),
2323                                 'uploader':     video_uploader,
2324                                 'upload_date':  u'NA',
2325                                 'title':        video_title,
2326                                 'stitle':       simple_title,
2327                                 'ext':          video_extension.decode('utf-8'),
2328                                 'format':       u'NA',
2329                                 'player_url':   None,
2330                         })
2331                 except UnavailableVideoError, err:
2332                         self._downloader.trouble(u'\nERROR: unable to download video')
2333
2334
class YoutubeSearchIE(InfoExtractor):
	"""Information Extractor for YouTube search queries."""
	# Accepts "ytsearch:Q", "ytsearchN:Q" and "ytsearchall:Q" pseudo-URLs.
	_VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
	# YouTube GData v2 search endpoint; %s = query, %i = 1-based start index.
	_API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
	# Delegate extractor used to actually download each found video.
	_youtube_ie = None
	# Hard cap on results; also the meaning of the "all" prefix.
	_max_youtube_results = 1000
	IE_NAME = u'youtube:search'

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	def report_download_page(self, query, pagenum):
		"""Report attempt to download playlist page with given number."""
		query = query.decode(preferredencoding())
		self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

	def _real_initialize(self):
		# Initialization (e.g. login) is delegated to the wrapped YouTube IE.
		self._youtube_ie.initialize()

	def _real_extract(self, query):
		# Parse the "ytsearch<prefix>:<terms>" pseudo-URL and decide how
		# many results to fetch.
		mobj = re.match(self._VALID_URL, query)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
			return

		prefix, query = query.split(':')
		prefix = prefix[8:]
		query = query.encode('utf-8')
		if prefix == '':
			# Bare "ytsearch:" downloads only the first result.
			self._download_n_results(query, 1)
			return
		elif prefix == 'all':
			self._download_n_results(query, self._max_youtube_results)
			return
		else:
			try:
				n = long(prefix)
				if n <= 0:
					self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
					return
				elif n > self._max_youtube_results:
					self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
					n = self._max_youtube_results
				self._download_n_results(query, n)
				return
			except ValueError: # parsing prefix as integer fails
				self._download_n_results(query, 1)
				return

	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query"""

		video_ids = []
		pagenum = 0
		# 'limit' starts at the requested count and is tightened inside the
		# loop to the API's reported totalItems, so paging stops early when
		# fewer results exist than were asked for.
		limit = n

		while (50 * pagenum) < limit:
			self.report_download_page(query, pagenum+1)
			result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
			request = urllib2.Request(result_url)
			try:
				data = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
				return
			# NOTE(review): assumes the jsonc response always carries a
			# 'data' object with 'items' and 'totalItems' — any API error
			# payload without them would raise KeyError here.
			api_response = json.loads(data)['data']

			new_ids = list(video['id'] for video in api_response['items'])
			video_ids += new_ids

			limit = min(n, api_response['totalItems'])
			pagenum += 1

		# Trim any overshoot from the last 50-item page, then hand each id
		# to the wrapped YouTube extractor.
		if len(video_ids) > n:
			video_ids = video_ids[:n]
		for id in video_ids:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2413                 return
2414
2415
2416 class GoogleSearchIE(InfoExtractor):
2417         """Information Extractor for Google Video search queries."""
2418         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2419         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2420         _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
2421         _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
2422         _google_ie = None
2423         _max_google_results = 1000
2424         IE_NAME = u'video.google:search'
2425
2426         def __init__(self, google_ie, downloader=None):
2427                 InfoExtractor.__init__(self, downloader)
2428                 self._google_ie = google_ie
2429
2430         def report_download_page(self, query, pagenum):
2431                 """Report attempt to download playlist page with given number."""
2432                 query = query.decode(preferredencoding())
2433                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2434
2435         def _real_initialize(self):
2436                 self._google_ie.initialize()
2437
2438         def _real_extract(self, query):
2439                 mobj = re.match(self._VALID_URL, query)
2440                 if mobj is None:
2441                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2442                         return
2443
2444                 prefix, query = query.split(':')
2445                 prefix = prefix[8:]
2446                 query = query.encode('utf-8')
2447                 if prefix == '':
2448                         self._download_n_results(query, 1)
2449                         return
2450                 elif prefix == 'all':
2451                         self._download_n_results(query, self._max_google_results)
2452                         return
2453                 else:
2454                         try:
2455                                 n = long(prefix)
2456                                 if n <= 0:
2457                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2458                                         return
2459                                 elif n > self._max_google_results:
2460                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2461                                         n = self._max_google_results
2462                                 self._download_n_results(query, n)
2463                                 return
2464                         except ValueError: # parsing prefix as integer fails
2465                                 self._download_n_results(query, 1)
2466                                 return
2467
2468         def _download_n_results(self, query, n):
2469                 """Downloads a specified number of results for a query"""
2470
2471                 video_ids = []
2472                 pagenum = 0
2473
2474                 while True:
2475                         self.report_download_page(query, pagenum)
2476                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
2477                         request = urllib2.Request(result_url)
2478                         try:
2479                                 page = urllib2.urlopen(request).read()
2480                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2481                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2482                                 return
2483
2484                         # Extract video identifiers
2485                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2486                                 video_id = mobj.group(1)
2487                                 if video_id not in video_ids:
2488                                         video_ids.append(video_id)
2489                                         if len(video_ids) == n:
2490                                                 # Specified n videos reached
2491                                                 for id in video_ids:
2492                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2493                                                 return
2494
2495                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2496                                 for id in video_ids:
2497                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2498                                 return
2499
2500                         pagenum = pagenum + 1
2501
2502
2503 class YahooSearchIE(InfoExtractor):
2504         """Information Extractor for Yahoo! Video search queries."""
2505         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2506         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2507         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2508         _MORE_PAGES_INDICATOR = r'\s*Next'
2509         _yahoo_ie = None
2510         _max_yahoo_results = 1000
2511         IE_NAME = u'video.yahoo:search'
2512
2513         def __init__(self, yahoo_ie, downloader=None):
2514                 InfoExtractor.__init__(self, downloader)
2515                 self._yahoo_ie = yahoo_ie
2516
2517         def report_download_page(self, query, pagenum):
2518                 """Report attempt to download playlist page with given number."""
2519                 query = query.decode(preferredencoding())
2520                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2521
2522         def _real_initialize(self):
2523                 self._yahoo_ie.initialize()
2524
2525         def _real_extract(self, query):
2526                 mobj = re.match(self._VALID_URL, query)
2527                 if mobj is None:
2528                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2529                         return
2530
2531                 prefix, query = query.split(':')
2532                 prefix = prefix[8:]
2533                 query = query.encode('utf-8')
2534                 if prefix == '':
2535                         self._download_n_results(query, 1)
2536                         return
2537                 elif prefix == 'all':
2538                         self._download_n_results(query, self._max_yahoo_results)
2539                         return
2540                 else:
2541                         try:
2542                                 n = long(prefix)
2543                                 if n <= 0:
2544                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2545                                         return
2546                                 elif n > self._max_yahoo_results:
2547                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2548                                         n = self._max_yahoo_results
2549                                 self._download_n_results(query, n)
2550                                 return
2551                         except ValueError: # parsing prefix as integer fails
2552                                 self._download_n_results(query, 1)
2553                                 return
2554
2555         def _download_n_results(self, query, n):
2556                 """Downloads a specified number of results for a query"""
2557
2558                 video_ids = []
2559                 already_seen = set()
2560                 pagenum = 1
2561
2562                 while True:
2563                         self.report_download_page(query, pagenum)
2564                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2565                         request = urllib2.Request(result_url)
2566                         try:
2567                                 page = urllib2.urlopen(request).read()
2568                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2569                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2570                                 return
2571
2572                         # Extract video identifiers
2573                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2574                                 video_id = mobj.group(1)
2575                                 if video_id not in already_seen:
2576                                         video_ids.append(video_id)
2577                                         already_seen.add(video_id)
2578                                         if len(video_ids) == n:
2579                                                 # Specified n videos reached
2580                                                 for id in video_ids:
2581                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2582                                                 return
2583
2584                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2585                                 for id in video_ids:
2586                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2587                                 return
2588
2589                         pagenum = pagenum + 1
2590
2591
2592 class YoutubePlaylistIE(InfoExtractor):
2593         """Information Extractor for YouTube playlists."""
2594
2595         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2596         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2597         _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;list=PL%s&'
2598         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2599         _youtube_ie = None
2600         IE_NAME = u'youtube:playlist'
2601
2602         def __init__(self, youtube_ie, downloader=None):
2603                 InfoExtractor.__init__(self, downloader)
2604                 self._youtube_ie = youtube_ie
2605
2606         def report_download_page(self, playlist_id, pagenum):
2607                 """Report attempt to download playlist page with given number."""
2608                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2609
2610         def _real_initialize(self):
2611                 self._youtube_ie.initialize()
2612
2613         def _real_extract(self, url):
2614                 # Extract playlist id
2615                 mobj = re.match(self._VALID_URL, url)
2616                 if mobj is None:
2617                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2618                         return
2619
2620                 # Single video case
2621                 if mobj.group(3) is not None:
2622                         self._youtube_ie.extract(mobj.group(3))
2623                         return
2624
2625                 # Download playlist pages
2626                 # prefix is 'p' as default for playlists but there are other types that need extra care
2627                 playlist_prefix = mobj.group(1)
2628                 if playlist_prefix == 'a':
2629                         playlist_access = 'artist'
2630                 else:
2631                         playlist_prefix = 'p'
2632                         playlist_access = 'view_play_list'
2633                 playlist_id = mobj.group(2)
2634                 video_ids = []
2635                 pagenum = 1
2636
2637                 while True:
2638                         self.report_download_page(playlist_id, pagenum)
2639                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2640                         request = urllib2.Request(url)
2641                         try:
2642                                 page = urllib2.urlopen(request).read()
2643                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2644                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2645                                 return
2646
2647                         # Extract video identifiers
2648                         ids_in_page = []
2649                         for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
2650                                 if mobj.group(1) not in ids_in_page:
2651                                         ids_in_page.append(mobj.group(1))
2652                         video_ids.extend(ids_in_page)
2653
2654                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2655                                 break
2656                         pagenum = pagenum + 1
2657
2658                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2659                 playlistend = self._downloader.params.get('playlistend', -1)
2660                 if playlistend == -1:
2661                         video_ids = video_ids[playliststart:]
2662                 else:
2663                         video_ids = video_ids[playliststart:playlistend]
2664
2665                 for id in video_ids:
2666                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2667                 return
2668
2669
2670 class YoutubeUserIE(InfoExtractor):
2671         """Information Extractor for YouTube users."""
2672
2673         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2674         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2675         _GDATA_PAGE_SIZE = 50
2676         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2677         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2678         _youtube_ie = None
2679         IE_NAME = u'youtube:user'
2680
2681         def __init__(self, youtube_ie, downloader=None):
2682                 InfoExtractor.__init__(self, downloader)
2683                 self._youtube_ie = youtube_ie
2684
2685         def report_download_page(self, username, start_index):
2686                 """Report attempt to download user page."""
2687                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2688                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2689
2690         def _real_initialize(self):
2691                 self._youtube_ie.initialize()
2692
2693         def _real_extract(self, url):
2694                 # Extract username
2695                 mobj = re.match(self._VALID_URL, url)
2696                 if mobj is None:
2697                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2698                         return
2699
2700                 username = mobj.group(1)
2701
2702                 # Download video ids using YouTube Data API. Result size per
2703                 # query is limited (currently to 50 videos) so we need to query
2704                 # page by page until there are no video ids - it means we got
2705                 # all of them.
2706
2707                 video_ids = []
2708                 pagenum = 0
2709
2710                 while True:
2711                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2712                         self.report_download_page(username, start_index)
2713
2714                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2715
2716                         try:
2717                                 page = urllib2.urlopen(request).read()
2718                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2719                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2720                                 return
2721
2722                         # Extract video identifiers
2723                         ids_in_page = []
2724
2725                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2726                                 if mobj.group(1) not in ids_in_page:
2727                                         ids_in_page.append(mobj.group(1))
2728
2729                         video_ids.extend(ids_in_page)
2730
2731                         # A little optimization - if current page is not
2732                         # "full", ie. does not contain PAGE_SIZE video ids then
2733                         # we can assume that this page is the last one - there
2734                         # are no more ids on further pages - no need to query
2735                         # again.
2736
2737                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2738                                 break
2739
2740                         pagenum += 1
2741
2742                 all_ids_count = len(video_ids)
2743                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2744                 playlistend = self._downloader.params.get('playlistend', -1)
2745
2746                 if playlistend == -1:
2747                         video_ids = video_ids[playliststart:]
2748                 else:
2749                         video_ids = video_ids[playliststart:playlistend]
2750
2751                 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2752                                 (username, all_ids_count, len(video_ids)))
2753
2754                 for video_id in video_ids:
2755                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2756
2757
2758 class DepositFilesIE(InfoExtractor):
2759         """Information extractor for depositfiles.com"""
2760
2761         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2762         IE_NAME = u'DepositFiles'
2763
2764         def __init__(self, downloader=None):
2765                 InfoExtractor.__init__(self, downloader)
2766
2767         def report_download_webpage(self, file_id):
2768                 """Report webpage download."""
2769                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2770
2771         def report_extraction(self, file_id):
2772                 """Report information extraction."""
2773                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2774
2775         def _real_extract(self, url):
2776                 # At this point we have a new file
2777                 self._downloader.increment_downloads()
2778
2779                 file_id = url.split('/')[-1]
2780                 # Rebuild url in english locale
2781                 url = 'http://depositfiles.com/en/files/' + file_id
2782
2783                 # Retrieve file webpage with 'Free download' button pressed
2784                 free_download_indication = { 'gateway_result' : '1' }
2785                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2786                 try:
2787                         self.report_download_webpage(file_id)
2788                         webpage = urllib2.urlopen(request).read()
2789                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2790                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2791                         return
2792
2793                 # Search for the real file URL
2794                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2795                 if (mobj is None) or (mobj.group(1) is None):
2796                         # Try to figure out reason of the error.
2797                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2798                         if (mobj is not None) and (mobj.group(1) is not None):
2799                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2800                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2801                         else:
2802                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2803                         return
2804
2805                 file_url = mobj.group(1)
2806                 file_extension = os.path.splitext(file_url)[1][1:]
2807
2808                 # Search for file title
2809                 mobj = re.search(r'<b title="(.*?)">', webpage)
2810                 if mobj is None:
2811                         self._downloader.trouble(u'ERROR: unable to extract title')
2812                         return
2813                 file_title = mobj.group(1).decode('utf-8')
2814
2815                 try:
2816                         # Process file information
2817                         self._downloader.process_info({
2818                                 'id':           file_id.decode('utf-8'),
2819                                 'url':          file_url.decode('utf-8'),
2820                                 'uploader':     u'NA',
2821                                 'upload_date':  u'NA',
2822                                 'title':        file_title,
2823                                 'stitle':       file_title,
2824                                 'ext':          file_extension.decode('utf-8'),
2825                                 'format':       u'NA',
2826                                 'player_url':   None,
2827                         })
2828                 except UnavailableVideoError, err:
2829                         self._downloader.trouble(u'ERROR: unable to download file')
2830
2831
class FacebookIE(InfoExtractor):
	"""Information Extractor for Facebook"""

	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
	# Mobile login endpoint POSTed to by _real_initialize.
	_LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
	# Machine name looked up in ~/.netrc for stored credentials.
	_NETRC_MACHINE = 'facebook'
	# Format identifiers as embedded in the page's JS ("<fmt>_src"),
	# ordered best quality first.
	_available_formats = ['video', 'highqual', 'lowqual']
	# File extension associated with each format identifier.
	_video_extensions = {
		'video': 'mp4',
		'highqual': 'mp4',
		'lowqual': 'mp4',
	}
	IE_NAME = u'facebook'
2845
	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def _reporter(self, message):
		"""Add header and report message."""
		# All facebook status lines share the '[facebook] ' prefix.
		self._downloader.to_screen(u'[facebook] %s' % message)

	def report_login(self):
		"""Report attempt to log in."""
		self._reporter(u'Logging in')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._reporter(u'%s: Downloading video webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._reporter(u'%s: Extracting video information' % video_id)
2864
	def _parse_page(self, video_webpage):
		"""Extract video information from page"""
		# General data
		# Each value is a regex with one capture group.  Matched text is
		# unicode-unescaped (the page embeds JS-escaped strings) and then
		# URL-unquoted before being stored under the same key; pieces
		# that fail to match are simply left out of the dict.
		data = {'title': r'\("video_title", "(.*?)"\)',
			'description': r'<div class="datawrap">(.*?)</div>',
			'owner': r'\("video_owner_name", "(.*?)"\)',
			'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
			}
		video_info = {}
		for piece in data.keys():
			mobj = re.search(data[piece], video_webpage)
			if mobj is not None:
				video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

		# Video urls
		# One lookup per known format id; found URLs are keyed by format.
		video_urls = {}
		for fmt in self._available_formats:
			mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
			if mobj is not None:
				# URL is in a Javascript segment inside an escaped Unicode format within
				# the generally utf-8 page
				video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
		video_info['video_urls'] = video_urls

		return video_info
2890
	def _real_initialize(self):
		"""Log in to Facebook when credentials are available.

		Credentials come from --username/--password or, with --netrc,
		from the 'facebook' machine entry in ~/.netrc.  Login failures
		only produce warnings; extraction proceeds unauthenticated.
		"""
		if self._downloader is None:
			return

		useremail = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			useremail = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					useremail = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# No credentials from either source: stay anonymous.
		if useremail is None:
			return

		# Log in
		login_form = {
			'email': useremail,
			'pass': password,
			'login': 'Log+In'
			}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# A login <form> still present in the response means the
			# attempt failed and we are not authenticated.
			if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return
2934
2935         def _real_extract(self, url):
2936                 mobj = re.match(self._VALID_URL, url)
2937                 if mobj is None:
2938                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2939                         return
2940                 video_id = mobj.group('ID')
2941
2942                 # Get video webpage
2943                 self.report_video_webpage_download(video_id)
2944                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2945                 try:
2946                         page = urllib2.urlopen(request)
2947                         video_webpage = page.read()
2948                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2949                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2950                         return
2951
2952                 # Start extracting information
2953                 self.report_information_extraction(video_id)
2954
2955                 # Extract information
2956                 video_info = self._parse_page(video_webpage)
2957
2958                 # uploader
2959                 if 'owner' not in video_info:
2960                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2961                         return
2962                 video_uploader = video_info['owner']
2963
2964                 # title
2965                 if 'title' not in video_info:
2966                         self._downloader.trouble(u'ERROR: unable to extract video title')
2967                         return
2968                 video_title = video_info['title']
2969                 video_title = video_title.decode('utf-8')
2970                 video_title = sanitize_title(video_title)
2971
2972                 simple_title = _simplify_title(video_title)
2973
2974                 # thumbnail image
2975                 if 'thumbnail' not in video_info:
2976                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2977                         video_thumbnail = ''
2978                 else:
2979                         video_thumbnail = video_info['thumbnail']
2980
2981                 # upload date
2982                 upload_date = u'NA'
2983                 if 'upload_date' in video_info:
2984                         upload_time = video_info['upload_date']
2985                         timetuple = email.utils.parsedate_tz(upload_time)
2986                         if timetuple is not None:
2987                                 try:
2988                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2989                                 except:
2990                                         pass
2991
2992                 # description
2993                 video_description = video_info.get('description', 'No description available.')
2994
2995                 url_map = video_info['video_urls']
2996                 if len(url_map.keys()) > 0:
2997                         # Decide which formats to download
2998                         req_format = self._downloader.params.get('format', None)
2999                         format_limit = self._downloader.params.get('format_limit', None)
3000
3001                         if format_limit is not None and format_limit in self._available_formats:
3002                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
3003                         else:
3004                                 format_list = self._available_formats
3005                         existing_formats = [x for x in format_list if x in url_map]
3006                         if len(existing_formats) == 0:
3007                                 self._downloader.trouble(u'ERROR: no known formats available for video')
3008                                 return
3009                         if req_format is None:
3010                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
3011                         elif req_format == 'worst':
3012                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
3013                         elif req_format == '-1':
3014                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
3015                         else:
3016                                 # Specific format
3017                                 if req_format not in url_map:
3018                                         self._downloader.trouble(u'ERROR: requested format not available')
3019                                         return
3020                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
3021
3022                 for format_param, video_real_url in video_url_list:
3023
3024                         # At this point we have a new video
3025                         self._downloader.increment_downloads()
3026
3027                         # Extension
3028                         video_extension = self._video_extensions.get(format_param, 'mp4')
3029
3030                         try:
3031                                 # Process video information
3032                                 self._downloader.process_info({
3033                                         'id':           video_id.decode('utf-8'),
3034                                         'url':          video_real_url.decode('utf-8'),
3035                                         'uploader':     video_uploader.decode('utf-8'),
3036                                         'upload_date':  upload_date,
3037                                         'title':        video_title,
3038                                         'stitle':       simple_title,
3039                                         'ext':          video_extension.decode('utf-8'),
3040                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
3041                                         'thumbnail':    video_thumbnail.decode('utf-8'),
3042                                         'description':  video_description.decode('utf-8'),
3043                                         'player_url':   None,
3044                                 })
3045                         except UnavailableVideoError, err:
3046                                 self._downloader.trouble(u'\nERROR: unable to download video')
3047
class BlipTVIE(InfoExtractor):
	"""Information extractor for blip.tv"""

	# Any path on blip.tv; group 1 (the path) is only used for progress output.
	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
	# Captures the filename extension at the end of a media URL.
	_URL_EXT = r'^.*\.([a-z0-9]+)$'
	IE_NAME = u'blip.tv'

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

	def report_direct_download(self, title):
		"""Report that the URL points directly at a media file."""
		self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

	def _real_extract(self, url):
		"""Extract video information, either from the JSON API or directly.

		Two paths: if the URL serves a video/* Content-Type it is the media
		file itself (direct download); otherwise the same URL with
		skin=json appended returns the metadata as JSON.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Append the JSON-skin parameters with the correct separator.
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url)
		self.report_extraction(mobj.group(1))
		info = None
		try:
			urlh = urllib2.urlopen(request)
			if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
				basename = url.split('/')[-1]
				title,ext = os.path.splitext(basename)
				title = title.decode('UTF-8')
				ext = ext.replace('.', '')
				self.report_direct_download(title)
				info = {
					'id': title,
					'url': url,
					'title': title,
					'stitle': _simplify_title(title),
					'ext': ext,
					# Hand the already-open handle to the downloader so the
					# response is not fetched twice.
					'urlhandle': urlh
				}
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
			return
		if info is None: # Regular URL
			# urlh is still the open JSON response from above.
			try:
				json_code = urlh.read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
				return

			try:
				# NOTE(review): json is not in the visible import block at the
				# top of the file; presumably imported elsewhere — confirm.
				json_data = json.loads(json_code)
				if 'Post' in json_data:
					data = json_data['Post']
				else:
					data = json_data

				# NOTE(review): '%H:%M%p' mixes 24-hour %H with an AM/PM
				# marker — kept as-is to match blip.tv's datestamp; confirm.
				upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
				video_url = data['media']['url']
				umobj = re.match(self._URL_EXT, video_url)
				if umobj is None:
					raise ValueError('Can not determine filename extension')
				ext = umobj.group(1)

				info = {
					'id': data['item_id'],
					'url': video_url,
					'uploader': data['display_name'],
					'upload_date': upload_date,
					'title': data['title'],
					'stitle': _simplify_title(data['title']),
					'ext': ext,
					'format': data['media']['mimeType'],
					'thumbnail': data['thumbnailUrl'],
					'description': data['description'],
					'player_url': data['embedUrl']
				}
			except (ValueError,KeyError), err:
				self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
				return

		self._downloader.increment_downloads()

		try:
			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
3140
3141
3142 class MyVideoIE(InfoExtractor):
3143         """Information Extractor for myvideo.de."""
3144
3145         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3146         IE_NAME = u'myvideo'
3147
3148         def __init__(self, downloader=None):
3149                 InfoExtractor.__init__(self, downloader)
3150         
3151         def report_download_webpage(self, video_id):
3152                 """Report webpage download."""
3153                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3154
3155         def report_extraction(self, video_id):
3156                 """Report information extraction."""
3157                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3158
3159         def _real_extract(self,url):
3160                 mobj = re.match(self._VALID_URL, url)
3161                 if mobj is None:
3162                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
3163                         return
3164
3165                 video_id = mobj.group(1)
3166
3167                 # Get video webpage
3168                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3169                 try:
3170                         self.report_download_webpage(video_id)
3171                         webpage = urllib2.urlopen(request).read()
3172                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3173                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3174                         return
3175
3176                 self.report_extraction(video_id)
3177                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3178                                  webpage)
3179                 if mobj is None:
3180                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3181                         return
3182                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3183
3184                 mobj = re.search('<title>([^<]+)</title>', webpage)
3185                 if mobj is None:
3186                         self._downloader.trouble(u'ERROR: unable to extract title')
3187                         return
3188
3189                 video_title = mobj.group(1)
3190                 video_title = sanitize_title(video_title)
3191
3192                 simple_title = _simplify_title(video_title)
3193
3194                 try:
3195                         self._downloader.process_info({
3196                                 'id':           video_id,
3197                                 'url':          video_url,
3198                                 'uploader':     u'NA',
3199                                 'upload_date':  u'NA',
3200                                 'title':        video_title,
3201                                 'stitle':       simple_title,
3202                                 'ext':          u'flv',
3203                                 'format':       u'NA',
3204                                 'player_url':   None,
3205                         })
3206                 except UnavailableVideoError:
3207                         self._downloader.trouble(u'\nERROR: Unable to download video')
3208
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Accepts either a shortname alias (":tds", ":colbert", ...) or a
	# full-episodes URL on thedailyshow.com / colbertnation.com.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report that information extraction has started."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report download of the per-media configuration XML."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report download of the show's mrss episode index."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report resolution of the Flash player URL."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _real_extract(self, url):
		"""Resolve the episode page, fetch the mrss index and download each item."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Shortname aliases map to the show's full-episodes landing page,
		# which redirects to the newest episode (handled below).
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = u'http://www.thedailyshow.com/full-episodes/'
			else:
				url = u'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# No episode in the URL means "download the newest episode".
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# The landing page redirected us; re-parse the final URL to
			# recover the concrete episode slug.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# Group 0 is the full player URL, group 1 the mtvn media URI.
		mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		# Follow redirects once to get the canonical player URL.
		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		# The mrss feed lists every media item of the episode.
		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			# guid looks like "...:showid.com:mediaid"; split out both parts.
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			# Collect (bitrate, url) pairs for every available rendition.
			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + u'-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,  # NOTE(review): raw pubDate string, not YYYYMMDD like other extractors — confirm downstream handling
				'title': effTitle,
				'stitle': _simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
				continue
3343
3344
3345 class EscapistIE(InfoExtractor):
3346         """Information extractor for The Escapist """
3347
3348         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3349         IE_NAME = u'escapist'
3350
3351         def report_extraction(self, showName):
3352                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3353
3354         def report_config_download(self, showName):
3355                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3356
3357         def _real_extract(self, url):
3358                 mobj = re.match(self._VALID_URL, url)
3359                 if mobj is None:
3360                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3361                         return
3362                 showName = mobj.group('showname')
3363                 videoId = mobj.group('episode')
3364
3365                 self.report_extraction(showName)
3366                 try:
3367                         webPage = urllib2.urlopen(url).read()
3368                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3369                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3370                         return
3371
3372                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3373                 description = unescapeHTML(descMatch.group(1))
3374                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3375                 imgUrl = unescapeHTML(imgMatch.group(1))
3376                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3377                 playerUrl = unescapeHTML(playerUrlMatch.group(1))
3378                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3379                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3380
3381                 self.report_config_download(showName)
3382                 try:
3383                         configJSON = urllib2.urlopen(configUrl).read()
3384                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3385                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3386                         return
3387
3388                 # Technically, it's JavaScript, not JSON
3389                 configJSON = configJSON.replace("'", '"')
3390
3391                 try:
3392                         config = json.loads(configJSON)
3393                 except (ValueError,), err:
3394                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3395                         return
3396
3397                 playlist = config['playlist']
3398                 videoUrl = playlist[1]['url']
3399
3400                 self._downloader.increment_downloads()
3401                 info = {
3402                         'id': videoId,
3403                         'url': videoUrl,
3404                         'uploader': showName,
3405                         'upload_date': None,
3406                         'title': showName,
3407                         'stitle': _simplify_title(showName),
3408                         'ext': 'flv',
3409                         'format': 'flv',
3410                         'thumbnail': imgUrl,
3411                         'description': description,
3412                         'player_url': playerUrl,
3413                 }
3414
3415                 try:
3416                         self._downloader.process_info(info)
3417                 except UnavailableVideoError, err:
3418                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3419
3420
3421 class CollegeHumorIE(InfoExtractor):
3422         """Information extractor for collegehumor.com"""
3423
3424         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3425         IE_NAME = u'collegehumor'
3426
3427         def report_webpage(self, video_id):
3428                 """Report information extraction."""
3429                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3430
3431         def report_extraction(self, video_id):
3432                 """Report information extraction."""
3433                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3434
3435         def _real_extract(self, url):
3436                 mobj = re.match(self._VALID_URL, url)
3437                 if mobj is None:
3438                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3439                         return
3440                 video_id = mobj.group('videoid')
3441
3442                 self.report_webpage(video_id)
3443                 request = urllib2.Request(url)
3444                 try:
3445                         webpage = urllib2.urlopen(request).read()
3446                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3447                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3448                         return
3449
3450                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3451                 if m is None:
3452                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3453                         return
3454                 internal_video_id = m.group('internalvideoid')
3455
3456                 info = {
3457                         'id': video_id,
3458                         'internal_id': internal_video_id,
3459                 }
3460
3461                 self.report_extraction(video_id)
3462                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3463                 try:
3464                         metaXml = urllib2.urlopen(xmlUrl).read()
3465                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3466                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3467                         return
3468
3469                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3470                 try:
3471                         videoNode = mdoc.findall('./video')[0]
3472                         info['description'] = videoNode.findall('./description')[0].text
3473                         info['title'] = videoNode.findall('./caption')[0].text
3474                         info['stitle'] = _simplify_title(info['title'])
3475                         info['url'] = videoNode.findall('./file')[0].text
3476                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3477                         info['ext'] = info['url'].rpartition('.')[2]
3478                         info['format'] = info['ext']
3479                 except IndexError:
3480                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3481                         return
3482
3483                 self._downloader.increment_downloads()
3484
3485                 try:
3486                         self._downloader.process_info(info)
3487                 except UnavailableVideoError, err:
3488                         self._downloader.trouble(u'\nERROR: unable to download video')
3489
3490
3491 class XVideosIE(InfoExtractor):
3492         """Information extractor for xvideos.com"""
3493
3494         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3495         IE_NAME = u'xvideos'
3496
3497         def report_webpage(self, video_id):
3498                 """Report information extraction."""
3499                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3500
3501         def report_extraction(self, video_id):
3502                 """Report information extraction."""
3503                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3504
3505         def _real_extract(self, url):
3506                 mobj = re.match(self._VALID_URL, url)
3507                 if mobj is None:
3508                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3509                         return
3510                 video_id = mobj.group(1).decode('utf-8')
3511
3512                 self.report_webpage(video_id)
3513
3514                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3515                 try:
3516                         webpage = urllib2.urlopen(request).read()
3517                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3518                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3519                         return
3520
3521                 self.report_extraction(video_id)
3522
3523
3524                 # Extract video URL
3525                 mobj = re.search(r'flv_url=(.+?)&', webpage)
3526                 if mobj is None:
3527                         self._downloader.trouble(u'ERROR: unable to extract video url')
3528                         return
3529                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3530
3531
3532                 # Extract title
3533                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3534                 if mobj is None:
3535                         self._downloader.trouble(u'ERROR: unable to extract video title')
3536                         return
3537                 video_title = mobj.group(1).decode('utf-8')
3538
3539
3540                 # Extract video thumbnail
3541                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3542                 if mobj is None:
3543                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3544                         return
3545                 video_thumbnail = mobj.group(1).decode('utf-8')
3546
3547
3548
3549                 self._downloader.increment_downloads()
3550                 info = {
3551                         'id': video_id,
3552                         'url': video_url,
3553                         'uploader': None,
3554                         'upload_date': None,
3555                         'title': video_title,
3556                         'stitle': _simplify_title(video_title),
3557                         'ext': 'flv',
3558                         'format': 'flv',
3559                         'thumbnail': video_thumbnail,
3560                         'description': None,
3561                         'player_url': None,
3562                 }
3563
3564                 try:
3565                         self._downloader.process_info(info)
3566                 except UnavailableVideoError, err:
3567                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3568
3569
3570 class SoundcloudIE(InfoExtractor):
3571         """Information extractor for soundcloud.com
3572            To access the media, the uid of the song and a stream token
3573            must be extracted from the page source and the script must make
3574            a request to media.soundcloud.com/crossdomain.xml. Then
3575            the media can be grabbed by requesting from an url composed
3576            of the stream token and uid
3577          """
3578
3579         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3580         IE_NAME = u'soundcloud'
3581
3582         def __init__(self, downloader=None):
3583                 InfoExtractor.__init__(self, downloader)
3584
3585         def report_webpage(self, video_id):
3586                 """Report information extraction."""
3587                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3588
3589         def report_extraction(self, video_id):
3590                 """Report information extraction."""
3591                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3592
3593         def _real_extract(self, url):
3594                 mobj = re.match(self._VALID_URL, url)
3595                 if mobj is None:
3596                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3597                         return
3598
3599                 # extract uploader (which is in the url)
3600                 uploader = mobj.group(1).decode('utf-8')
3601                 # extract simple title (uploader + slug of song title)
3602                 slug_title =  mobj.group(2).decode('utf-8')
3603                 simple_title = uploader + '-' + slug_title
3604
3605                 self.report_webpage('%s/%s' % (uploader, slug_title))
3606
3607                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3608                 try:
3609                         webpage = urllib2.urlopen(request).read()
3610                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3611                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3612                         return
3613
3614                 self.report_extraction('%s/%s' % (uploader, slug_title))
3615
3616                 # extract uid and stream token that soundcloud hands out for access
3617                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3618                 if mobj:
3619                         video_id = mobj.group(1)
3620                         stream_token = mobj.group(2)
3621
3622                 # extract unsimplified title
3623                 mobj = re.search('"title":"(.*?)",', webpage)
3624                 if mobj:
3625                         title = mobj.group(1)
3626
3627                 # construct media url (with uid/token)
3628                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3629                 mediaURL = mediaURL % (video_id, stream_token)
3630
3631                 # description
3632                 description = u'No description available'
3633                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3634                 if mobj:
3635                         description = mobj.group(1)
3636                 
3637                 # upload date
3638                 upload_date = None
3639                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3640                 if mobj:
3641                         try:
3642                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3643                         except Exception, e:
3644                                 print str(e)
3645
3646                 # for soundcloud, a request to a cross domain is required for cookies
3647                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3648
3649                 try:
3650                         self._downloader.process_info({
3651                                 'id':           video_id.decode('utf-8'),
3652                                 'url':          mediaURL,
3653                                 'uploader':     uploader.decode('utf-8'),
3654                                 'upload_date':  upload_date,
3655                                 'title':        simple_title.decode('utf-8'),
3656                                 'stitle':       simple_title.decode('utf-8'),
3657                                 'ext':          u'mp3',
3658                                 'format':       u'NA',
3659                                 'player_url':   None,
3660                                 'description': description.decode('utf-8')
3661                         })
3662                 except UnavailableVideoError:
3663                         self._downloader.trouble(u'\nERROR: unable to download video')
3664
3665
3666 class InfoQIE(InfoExtractor):
3667         """Information extractor for infoq.com"""
3668
3669         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3670         IE_NAME = u'infoq'
3671
3672         def report_webpage(self, video_id):
3673                 """Report information extraction."""
3674                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3675
3676         def report_extraction(self, video_id):
3677                 """Report information extraction."""
3678                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3679
3680         def _real_extract(self, url):
3681                 mobj = re.match(self._VALID_URL, url)
3682                 if mobj is None:
3683                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3684                         return
3685
3686                 self.report_webpage(url)
3687
3688                 request = urllib2.Request(url)
3689                 try:
3690                         webpage = urllib2.urlopen(request).read()
3691                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3692                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3693                         return
3694
3695                 self.report_extraction(url)
3696
3697
3698                 # Extract video URL
3699                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3700                 if mobj is None:
3701                         self._downloader.trouble(u'ERROR: unable to extract video url')
3702                         return
3703                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3704
3705
3706                 # Extract title
3707                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3708                 if mobj is None:
3709                         self._downloader.trouble(u'ERROR: unable to extract video title')
3710                         return
3711                 video_title = mobj.group(1).decode('utf-8')
3712
3713                 # Extract description
3714                 video_description = u'No description available.'
3715                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3716                 if mobj is not None:
3717                         video_description = mobj.group(1).decode('utf-8')
3718
3719                 video_filename = video_url.split('/')[-1]
3720                 video_id, extension = video_filename.split('.')
3721
3722                 self._downloader.increment_downloads()
3723                 info = {
3724                         'id': video_id,
3725                         'url': video_url,
3726                         'uploader': None,
3727                         'upload_date': None,
3728                         'title': video_title,
3729                         'stitle': _simplify_title(video_title),
3730                         'ext': extension,
3731                         'format': extension, # Extension is always(?) mp4, but seems to be flv
3732                         'thumbnail': None,
3733                         'description': video_description,
3734                         'player_url': None,
3735                 }
3736
3737                 try:
3738                         self._downloader.process_info(info)
3739                 except UnavailableVideoError, err:
3740                         self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3741
3742 class MixcloudIE(InfoExtractor):
3743         """Information extractor for www.mixcloud.com"""
3744         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3745         IE_NAME = u'mixcloud'
3746
3747         def __init__(self, downloader=None):
3748                 InfoExtractor.__init__(self, downloader)
3749
3750         def report_download_json(self, file_id):
3751                 """Report JSON download."""
3752                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3753
3754         def report_extraction(self, file_id):
3755                 """Report information extraction."""
3756                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3757
3758         def get_urls(self, jsonData, fmt, bitrate='best'):
3759                 """Get urls from 'audio_formats' section in json"""
3760                 file_url = None
3761                 try:
3762                         bitrate_list = jsonData[fmt]
3763                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3764                                 bitrate = max(bitrate_list) # select highest
3765
3766                         url_list = jsonData[fmt][bitrate]
3767                 except TypeError: # we have no bitrate info.
3768                         url_list = jsonData[fmt]
3769                                 
3770                 return url_list
3771
3772         def check_urls(self, url_list):
3773                 """Returns 1st active url from list"""
3774                 for url in url_list:
3775                         try:
3776                                 urllib2.urlopen(url)
3777                                 return url
3778                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3779                                 url = None
3780
3781                 return None
3782
3783         def _print_formats(self, formats):
3784                 print 'Available formats:'
3785                 for fmt in formats.keys():
3786                         for b in formats[fmt]:
3787                                 try:
3788                                         ext = formats[fmt][b][0]
3789                                         print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3790                                 except TypeError: # we have no bitrate info
3791                                         ext = formats[fmt][0]
3792                                         print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3793                                         break
3794
3795         def _real_extract(self, url):
3796                 mobj = re.match(self._VALID_URL, url)
3797                 if mobj is None:
3798                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3799                         return
3800                 # extract uploader & filename from url
3801                 uploader = mobj.group(1).decode('utf-8')
3802                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3803
3804                 # construct API request
3805                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3806                 # retrieve .json file with links to files
3807                 request = urllib2.Request(file_url)
3808                 try:
3809                         self.report_download_json(file_url)
3810                         jsonData = urllib2.urlopen(request).read()
3811                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3812                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3813                         return
3814
3815                 # parse JSON
3816                 json_data = json.loads(jsonData)
3817                 player_url = json_data['player_swf_url']
3818                 formats = dict(json_data['audio_formats'])
3819
3820                 req_format = self._downloader.params.get('format', None)
3821                 bitrate = None
3822
3823                 if self._downloader.params.get('listformats', None):
3824                         self._print_formats(formats)
3825                         return
3826
3827                 if req_format is None or req_format == 'best':
3828                         for format_param in formats.keys():
3829                                 url_list = self.get_urls(formats, format_param)
3830                                 # check urls
3831                                 file_url = self.check_urls(url_list)
3832                                 if file_url is not None:
3833                                         break # got it!
3834                 else:
3835                         if req_format not in formats.keys():
3836                                 self._downloader.trouble(u'ERROR: format is not available')
3837                                 return
3838
3839                         url_list = self.get_urls(formats, req_format)
3840                         file_url = self.check_urls(url_list)
3841                         format_param = req_format
3842
3843                 # We have audio
3844                 self._downloader.increment_downloads()
3845                 try:
3846                         # Process file information
3847                         self._downloader.process_info({
3848                                 'id': file_id.decode('utf-8'),
3849                                 'url': file_url.decode('utf-8'),
3850                                 'uploader':     uploader.decode('utf-8'),
3851                                 'upload_date': u'NA',
3852                                 'title': json_data['name'],
3853                                 'stitle': _simplify_title(json_data['name']),
3854                                 'ext': file_url.split('.')[-1].decode('utf-8'),
3855                                 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3856                                 'thumbnail': json_data['thumbnail_url'],
3857                                 'description': json_data['description'],
3858                                 'player_url': player_url.decode('utf-8'),
3859                         })
3860                 except UnavailableVideoError, err:
3861                         self._downloader.trouble(u'ERROR: unable to download file')
3862
class StanfordOpenClassroomIE(InfoExtractor):
	"""Information extractor for Stanford's Open ClassRoom"""

	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
	IE_NAME = u'stanfordoc'

	def report_download_webpage(self, objid):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Extract from one of three url shapes:

		- course + video present: a single lecture video (fetch its XML metadata)
		- only course present: a course page, expanded into a playlist of VideoPage links
		- neither present: the site root, expanded into a playlist of CoursePage links

		Playlist entries are recursively fed back through self.extract().
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('course') and mobj.group('video'): # A specific video
			course = mobj.group('course')
			video = mobj.group('video')
			info = {
				'id': _simplify_title(course + '_' + video),
			}

			self.report_extraction(info['id'])
			# Video metadata lives in a per-video XML file next to the media.
			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
			xmlUrl = baseUrl + video + '.xml'
			try:
				metaXml = urllib2.urlopen(xmlUrl).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
				return
			mdoc = xml.etree.ElementTree.fromstring(metaXml)
			try:
				info['title'] = mdoc.findall('./title')[0].text
				info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
			except IndexError:
				self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
				return
			info['stitle'] = _simplify_title(info['title'])
			# Derive extension/format from the media url's suffix.
			info['ext'] = info['url'].rpartition('.')[2]
			info['format'] = info['ext']
			self._downloader.increment_downloads()
			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
		elif mobj.group('course'): # A course page
			course = mobj.group('course')
			info = {
				'id': _simplify_title(course),
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			try:
				coursepage = urllib2.urlopen(url).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			# Course title is optional; fall back to the simplified id.
			m = re.search('<h1>([^<]+)</h1>', coursepage)
			if m:
				info['title'] = unescapeHTML(m.group(1))
			else:
				info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])

			m = re.search('<description>([^<]+)</description>', coursepage)
			if m:
				info['description'] = unescapeHTML(m.group(1))

			# Collect each lecture's VideoPage link, de-duplicated in order.
			links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
				}
					for vpage in links]

			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
		else: # Root page
			info = {
				'id': 'Stanford OpenClassroom',
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
			try:
				rootpage = urllib2.urlopen(rootURL).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])

			# Collect every CoursePage link; each is extracted recursively.
			links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
				}
					for cpage in links]

			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
3977
3978 class MTVIE(InfoExtractor):
3979         """Information extractor for MTV.com"""
3980
3981         _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3982         IE_NAME = u'mtv'
3983
3984         def report_webpage(self, video_id):
3985                 """Report information extraction."""
3986                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3987
3988         def report_extraction(self, video_id):
3989                 """Report information extraction."""
3990                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3991
3992         def _real_extract(self, url):
3993                 mobj = re.match(self._VALID_URL, url)
3994                 if mobj is None:
3995                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3996                         return
3997                 if not mobj.group('proto'):
3998                         url = 'http://' + url
3999                 video_id = mobj.group('videoid')
4000                 self.report_webpage(video_id)
4001
4002                 request = urllib2.Request(url)
4003                 try:
4004                         webpage = urllib2.urlopen(request).read()
4005                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4006                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
4007                         return
4008
4009                 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
4010                 if mobj is None:
4011                         self._downloader.trouble(u'ERROR: unable to extract song name')
4012                         return
4013                 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4014                 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
4015                 if mobj is None:
4016                         self._downloader.trouble(u'ERROR: unable to extract performer')
4017                         return
4018                 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4019                 video_title = performer + ' - ' + song_name 
4020
4021                 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
4022                 if mobj is None:
4023                         self._downloader.trouble(u'ERROR: unable to mtvn_uri')
4024                         return
4025                 mtvn_uri = mobj.group(1)
4026
4027                 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
4028                 if mobj is None:
4029                         self._downloader.trouble(u'ERROR: unable to extract content id')
4030                         return
4031                 content_id = mobj.group(1)
4032
4033                 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
4034                 self.report_extraction(video_id)
4035                 request = urllib2.Request(videogen_url)
4036                 try:
4037                         metadataXml = urllib2.urlopen(request).read()
4038                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4039                         self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
4040                         return
4041
4042                 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
4043                 renditions = mdoc.findall('.//rendition')
4044
4045                 # For now, always pick the highest quality.
4046                 rendition = renditions[-1]
4047
4048                 try:
4049                         _,_,ext = rendition.attrib['type'].partition('/')
4050                         format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
4051                         video_url = rendition.find('./src').text
4052                 except KeyError:
4053                         self._downloader.trouble('Invalid rendition field.')
4054                         return
4055
4056                 self._downloader.increment_downloads()
4057                 info = {
4058                         'id': video_id,
4059                         'url': video_url,
4060                         'uploader': performer,
4061                         'title': video_title,
4062                         'stitle': _simplify_title(video_title),
4063                         'ext': ext,
4064                         'format': format,
4065                 }
4066
4067                 try:
4068                         self._downloader.process_info(info)
4069                 except UnavailableVideoError, err:
4070                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
4071
4072
class PostProcessor(object):
	"""Base class for all post processors.

	A PostProcessor is registered on a downloader via its
	add_post_processor() method. Once a download finishes successfully,
	the downloader walks its list of PostProcessors, invoking run() on
	each one: the first call receives the initial information dictionary,
	and every subsequent call receives whatever the previous run()
	returned. Processing stops as soon as a run() returns None, or when
	the end of the list is reached.

	Like InfoExtractor objects, PostProcessors take part in a "mutual
	registration" scheme with their downloader.
	"""

	# Downloader this PP is attached to; None until registered.
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach this PP to the given downloader."""
		self._downloader = downloader

	def run(self, information):
		"""Process one finished download.

		The "information" dictionary has the same layout as the ones
		built by InfoExtractors, plus a "filepath" entry pointing at the
		file on disk.

		Returning None halts the postprocessing chain; returning a
		dictionary (possibly the input with some fields modified) passes
		it on to the next PostProcessor. Implementations may also raise
		PostProcessingError, which the calling downloader handles.
		"""
		# Base implementation: pass the information through untouched.
		return information
4118
class AudioConversionError(Exception):
	"""Raised when ffmpeg/ffprobe fails while extracting or converting audio.

	Derives from Exception rather than BaseException: BaseException is
	reserved for exit-style exceptions (KeyboardInterrupt, SystemExit),
	and subclassing it lets this error slip past generic
	'except Exception' handlers.
	"""
	def __init__(self, message):
		# Human-readable reason, typically the last line of ffmpeg's stderr.
		self.message = message
4122
4123 class FFmpegExtractAudioPP(PostProcessor):
4124
4125         def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
4126                 PostProcessor.__init__(self, downloader)
4127                 if preferredcodec is None:
4128                         preferredcodec = 'best'
4129                 self._preferredcodec = preferredcodec
4130                 self._preferredquality = preferredquality
4131                 self._keepvideo = keepvideo
4132
	@staticmethod
	def get_audio_codec(path):
		"""Return the name of the audio codec in the file at path, or None.

		Runs ffprobe and scans its stream dump; returns None when ffprobe
		is missing, fails, or reports no audio stream.
		"""
		try:
			cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
			# Discard ffprobe's stderr chatter; we only parse stdout.
			handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
			output = handle.communicate()[0]
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			return None
		audio_codec = None
		# ffprobe prints codec_name= before codec_type= within each stream
		# block, so remember the last codec seen and return it once the
		# block turns out to be an audio stream.
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None
4150
	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Convert the audio of path into out_path with ffmpeg.

		codec is the ffmpeg audio codec name (None lets ffmpeg pick);
		more_opts are extra command-line options. Raises
		AudioConversionError when ffmpeg is missing or exits non-zero.
		"""
		if codec is None:
			acodec_opts = []
		else:
			acodec_opts = ['-acodec', codec]
		# '-vn' drops the video stream; '--' guards against filenames
		# starting with a dash.
		cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
		try:
			p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			stdout,stderr = p.communicate()
		except (IOError, OSError):
			e = sys.exc_info()[1]
			# errno 2 == ENOENT: the ffmpeg binary itself was not found.
			if isinstance(e, OSError) and e.errno == 2:
				raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
			else:
				raise e
		if p.returncode != 0:
			# ffmpeg prints the actual failure reason on its last stderr line.
			msg = stderr.strip().split('\n')[-1]
			raise AudioConversionError(msg)
4170
	def run(self, information):
		# Post-processing hook: convert the downloaded video file into an
		# audio-only file according to the preferred codec/quality settings
		# given at construction time. Returns the updated ``information``
		# dict on success, or None on any failure.
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
			if self._preferredcodec == 'm4a' and filecodec == 'aac':
				# Lossless, but in another container
				acodec = 'copy'
				extension = self._preferredcodec
				more_opts = ['-absf', 'aac_adtstoasc']
			elif filecodec in ['aac', 'mp3', 'vorbis']:
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
				if filecodec == 'vorbis':
					# Vorbis streams conventionally go into an .ogg file.
					extension = 'ogg'
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = []
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = []
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']
			if self._preferredcodec == 'm4a':
				more_opts += ['-absf', 'aac_adtstoasc']
			if self._preferredcodec == 'vorbis':
				extension = 'ogg'
			if self._preferredcodec == 'wav':
				extension = 'wav'
				more_opts += ['-f', 'wav']

		prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
		new_path = prefix + sep + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
		try:
			self.run_ffmpeg(path, new_path, acodec, more_opts)
		except:
			# NOTE(review): bare except also swallows KeyboardInterrupt and
			# SystemExit raised during the conversion — confirm intended.
			etype,e,tb = sys.exc_info()
			if isinstance(e, AudioConversionError):
				# e.message is the legacy Python 2 exception message attribute.
				self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
			else:
				self._downloader.to_stderr(u'ERROR: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
			except:
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		if not self._keepvideo:
			try:
				os.remove(_encodeFilename(path))
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
				return None

		# Point subsequent processing at the extracted audio file.
		information['filepath'] = new_path
		return information
4247
4248
4249 def updateSelf(downloader, filename):
4250         ''' Update the program file with the latest version from the repository '''
4251         # Note: downloader only used for options
4252         if not os.access(filename, os.W_OK):
4253                 sys.exit('ERROR: no write permissions on %s' % filename)
4254
4255         downloader.to_screen(u'Updating to latest version...')
4256
4257         try:
4258                 try:
4259                         urlh = urllib.urlopen(UPDATE_URL)
4260                         newcontent = urlh.read()
4261                         
4262                         vmatch = re.search("__version__ = '([^']+)'", newcontent)
4263                         if vmatch is not None and vmatch.group(1) == __version__:
4264                                 downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
4265                                 return
4266                 finally:
4267                         urlh.close()
4268         except (IOError, OSError), err:
4269                 sys.exit('ERROR: unable to download latest version')
4270
4271         try:
4272                 outf = open(filename, 'wb')
4273                 try:
4274                         outf.write(newcontent)
4275                 finally:
4276                         outf.close()
4277         except (IOError, OSError), err:
4278                 sys.exit('ERROR: unable to overwrite current version')
4279
4280         downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
4281
def parseOpts():
	"""Parse options from the system-wide and per-user configuration
	files and from the command line (later sources take precedence).
	Returns the tuple (parser, opts, args)."""
	def _readOptions(filename_bytes):
		# Read shell-style options from a configuration file; a missing
		# file is not an error and simply contributes no options.
		try:
			optionf = open(filename_bytes)
		except IOError:
			return [] # silently skip if file is not present
		try:
			res = []
			for l in optionf:
				res += shlex.split(l, comments=True)
		finally:
			optionf.close()
		return res

	def _format_option_string(option):
		''' ('-o', '--option') -> -o, --format METAVAR'''

		opts = []

		if option._short_opts: opts.append(option._short_opts[0])
		if option._long_opts: opts.append(option._long_opts[0])
		if len(opts) > 1: opts.insert(1, ', ')

		if option.takes_value(): opts.append(' %s' % option.metavar)

		return "".join(opts)

	def _find_term_columns():
		# Determine the terminal width from $COLUMNS or 'stty size' so
		# help output can use the whole console; None if undeterminable.
		columns = os.environ.get('COLUMNS', None)
		if columns:
			return int(columns)

		try:
			sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			out,err = sp.communicate()
			return int(out.split()[1])
		except:
			pass
		return None

	max_width = 80
	max_help_position = 80

	# No need to wrap help messages if we're on a wide console
	columns = _find_term_columns()
	if columns: max_width = columns

	fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
	fmt.format_option_strings = _format_option_string

	# conflict_handler='resolve' lets later options reuse short flags
	# (e.g. -v is both --version and, later, --verbose).
	kw = {
		'version'   : __version__,
		'formatter' : fmt,
		'usage' : '%prog [options] url [url...]',
		'conflict_handler' : 'resolve',
	}

	parser = optparse.OptionParser(**kw)

	# option groups
	general        = optparse.OptionGroup(parser, 'General Options')
	selection      = optparse.OptionGroup(parser, 'Video Selection')
	authentication = optparse.OptionGroup(parser, 'Authentication Options')
	video_format   = optparse.OptionGroup(parser, 'Video Format Options')
	postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
	filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
	verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

	general.add_option('-h', '--help',
			action='help', help='print this help text and exit')
	general.add_option('-v', '--version',
			action='version', help='print program version and exit')
	general.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest version')
	general.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
	general.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
	general.add_option('-R', '--retries',
			dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
	general.add_option('--dump-user-agent',
			action='store_true', dest='dump_user_agent',
			help='display the current browser identification', default=False)
	general.add_option('--list-extractors',
			action='store_true', dest='list_extractors',
			help='List all supported extractors and the URLs they would handle', default=False)

	selection.add_option('--playlist-start',
			dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
	selection.add_option('--playlist-end',
			dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
	selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
	selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
	selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)

	authentication.add_option('-u', '--username',
			dest='username', metavar='USERNAME', help='account username')
	authentication.add_option('-p', '--password',
			dest='password', metavar='PASSWORD', help='account password')
	authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


	video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FORMAT', help='video format code')
	video_format.add_option('--all-formats',
			action='store_const', dest='format', help='download all available video formats', const='all')
	video_format.add_option('--prefer-free-formats',
			action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
	video_format.add_option('--max-quality',
			action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
	video_format.add_option('-F', '--list-formats',
			action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
	video_format.add_option('--write-srt',
			action='store_true', dest='writesubtitles',
			help='write video closed captions to a .srt file (currently youtube only)', default=False)
	video_format.add_option('--srt-lang',
			action='store', dest='subtitleslang', metavar='LANG',
			help='language of the closed captions to download (optional) use IETF language tags like \'en\'')


	verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
	verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
	verbosity.add_option('--skip-download',
			action='store_true', dest='skip_download', help='do not download the video', default=False)
	verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
	verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
	verbosity.add_option('--get-thumbnail',
			action='store_true', dest='getthumbnail',
			help='simulate, quiet but print thumbnail URL', default=False)
	verbosity.add_option('--get-description',
			action='store_true', dest='getdescription',
			help='simulate, quiet but print video description', default=False)
	verbosity.add_option('--get-filename',
			action='store_true', dest='getfilename',
			help='simulate, quiet but print output filename', default=False)
	verbosity.add_option('--get-format',
			action='store_true', dest='getformat',
			help='simulate, quiet but print output format', default=False)
	verbosity.add_option('--no-progress',
			action='store_true', dest='noprogress', help='do not print progress bar', default=False)
	verbosity.add_option('--console-title',
			action='store_true', dest='consoletitle',
			help='display progress in console titlebar', default=False)
	verbosity.add_option('-v', '--verbose',
			action='store_true', dest='verbose', help='print various debugging information', default=False)


	filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
	filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
	filesystem.add_option('-A', '--auto-number',
			action='store_true', dest='autonumber',
			help='number downloaded files starting from 00000', default=False)
	filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
	filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
	filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
	filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
	filesystem.add_option('--no-continue',
			action='store_false', dest='continue_dl',
			help='do not resume partially downloaded files (restart from beginning)')
	filesystem.add_option('--cookies',
			dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
	filesystem.add_option('--no-part',
			action='store_true', dest='nopart', help='do not use .part files', default=False)
	filesystem.add_option('--no-mtime',
			action='store_false', dest='updatetime',
			help='do not use the Last-modified header to set the file modification time', default=True)
	filesystem.add_option('--write-description',
			action='store_true', dest='writedescription',
			help='write video description to a .description file', default=False)
	filesystem.add_option('--write-info-json',
			action='store_true', dest='writeinfojson',
			help='write video metadata to a .info.json file', default=False)


	postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
			help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
	postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
			help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
	postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
			help='ffmpeg audio bitrate specification, 128k by default')
	postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
			help='keeps the video file on disk after the post-processing; the video is erased by default')


	parser.add_option_group(general)
	parser.add_option_group(selection)
	parser.add_option_group(filesystem)
	parser.add_option_group(verbosity)
	parser.add_option_group(video_format)
	parser.add_option_group(authentication)
	parser.add_option_group(postproc)

	# Configuration-file options come before argv, so command-line
	# arguments override both the system-wide and the user config file.
	xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
	if xdg_config_home:
		userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
	else:
		userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
	argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
	opts, args = parser.parse_args(argv)

	return parser, opts, args
4494
def gen_extractors():
	"""Build the list of every supported InfoExtractor instance, in
	matching-priority order: the first extractor whose suitable() accepts
	a URL is the one that handles it, so the generic fallback comes last.
	"""
	# A few extractors are shared because their search/playlist/user
	# variants delegate the actual extraction to the plain one.
	yt = YoutubeIE()
	goog = GoogleIE()
	yah = YahooIE()
	return [
		YoutubePlaylistIE(yt),
		YoutubeUserIE(yt),
		YoutubeSearchIE(yt),
		yt,
		MetacafeIE(yt),
		DailymotionIE(),
		goog,
		GoogleSearchIE(goog),
		PhotobucketIE(),
		yah,
		YahooSearchIE(yah),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
		CollegeHumorIE(),
		XVideosIE(),
		SoundcloudIE(),
		InfoQIE(),
		MixcloudIE(),
		StanfordOpenClassroomIE(),
		MTVIE(),

		GenericIE()
	]
4531
def _real_main():
	"""Actual program body: parse options, validate them, configure the
	global urllib2 opener, build the FileDownloader and run it. Always
	terminates the process via sys.exit()/parser.error()."""
	parser, opts, args = parseOpts()

	# Open appropriate CookieJar
	if opts.cookiefile is None:
		jar = cookielib.CookieJar()
	else:
		try:
			jar = cookielib.MozillaCookieJar(opts.cookiefile)
			# Only load if the file already exists and is readable; a
			# fresh cookie file will be created on save.
			if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
				jar.load()
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to open cookie file')

	# Dump user agent
	if opts.dump_user_agent:
		print std_headers['User-Agent']
		sys.exit(0)

	# Batch file verification
	batchurls = []
	if opts.batchfile is not None:
		try:
			if opts.batchfile == '-':
				batchfd = sys.stdin
			else:
				batchfd = open(opts.batchfile, 'r')
			batchurls = batchfd.readlines()
			batchurls = [x.strip() for x in batchurls]
			# Skip blank lines and lines starting with #, / or ; (comments).
			batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
		except IOError:
			sys.exit(u'ERROR: batch file could not be read')
	all_urls = batchurls + args

	# General configuration
	cookie_processor = urllib2.HTTPCookieProcessor(jar)
	proxy_handler = urllib2.ProxyHandler()
	opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
	urllib2.install_opener(opener)
	socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

	if opts.verbose:
		print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))

	extractors = gen_extractors()

	# --list-extractors: print each extractor and the given URLs it would
	# handle (each URL is listed only under the first matching extractor).
	if opts.list_extractors:
		for ie in extractors:
			print(ie.IE_NAME)
			matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
			all_urls = filter(lambda url: url not in matchedUrls, all_urls)
			for mu in matchedUrls:
				print(u'  ' + mu)
		sys.exit(0)

	# Conflicting, missing and erroneous options
	if opts.usenetrc and (opts.username is not None or opts.password is not None):
		parser.error(u'using .netrc conflicts with giving username/password')
	if opts.password is not None and opts.username is None:
		parser.error(u'account username missing')
	if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
		parser.error(u'using output template conflicts with using title, literal title or auto number')
	if opts.usetitle and opts.useliteral:
		parser.error(u'using title conflicts with using literal title')
	if opts.username is not None and opts.password is None:
		opts.password = getpass.getpass(u'Type account password and press return:')
	if opts.ratelimit is not None:
		numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
		if numeric_limit is None:
			parser.error(u'invalid rate limit specified')
		opts.ratelimit = numeric_limit
	if opts.retries is not None:
		try:
			opts.retries = long(opts.retries)
		except (TypeError, ValueError), err:
			parser.error(u'invalid retry count specified')
	try:
		opts.playliststart = int(opts.playliststart)
		if opts.playliststart <= 0:
			raise ValueError(u'Playlist start must be positive')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist start number specified')
	try:
		opts.playlistend = int(opts.playlistend)
		# -1 is the sentinel for "until the end of the playlist".
		if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
			raise ValueError(u'Playlist end must be greater than playlist start')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist end number specified')
	if opts.extractaudio:
		if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
			parser.error(u'invalid audio format specified')
	# NOTE(review): opts.max_downloads is passed on below without int
	# conversion — confirm FileDownloader handles the string form.

	# File downloader
	fd = FileDownloader({
		'usenetrc': opts.usenetrc,
		'username': opts.username,
		'password': opts.password,
		'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
		'forceurl': opts.geturl,
		'forcetitle': opts.gettitle,
		'forcethumbnail': opts.getthumbnail,
		'forcedescription': opts.getdescription,
		'forcefilename': opts.getfilename,
		'forceformat': opts.getformat,
		'simulate': opts.simulate,
		'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
		'format': opts.format,
		'format_limit': opts.format_limit,
		'listformats': opts.listformats,
		# Pick the first applicable template: explicit -o, then the
		# format/title/literal/autonumber combinations, then the id-only
		# fallback.
		'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
			or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
			or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
			or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
			or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
			or u'%(id)s.%(ext)s'),
		'ignoreerrors': opts.ignoreerrors,
		'ratelimit': opts.ratelimit,
		'nooverwrites': opts.nooverwrites,
		'retries': opts.retries,
		'continuedl': opts.continue_dl,
		'noprogress': opts.noprogress,
		'playliststart': opts.playliststart,
		'playlistend': opts.playlistend,
		# Writing the video to stdout means progress must go to stderr.
		'logtostderr': opts.outtmpl == '-',
		'consoletitle': opts.consoletitle,
		'nopart': opts.nopart,
		'updatetime': opts.updatetime,
		'writedescription': opts.writedescription,
		'writeinfojson': opts.writeinfojson,
		'writesubtitles': opts.writesubtitles,
		'subtitleslang': opts.subtitleslang,
		'matchtitle': opts.matchtitle,
		'rejecttitle': opts.rejecttitle,
		'max_downloads': opts.max_downloads,
		'prefer_free_formats': opts.prefer_free_formats,
		'verbose': opts.verbose,
		})
	for extractor in extractors:
		fd.add_info_extractor(extractor)

	# PostProcessors
	if opts.extractaudio:
		fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

	# Update version
	if opts.update_self:
		updateSelf(fd, sys.argv[0])

	# Maybe do nothing
	if len(all_urls) < 1:
		if not opts.update_self:
			parser.error(u'you must provide at least one URL')
		else:
			sys.exit()

	try:
		retcode = fd.download(all_urls)
	except MaxDownloadsReached:
		fd.to_screen(u'--max-download limit reached, aborting.')
		retcode = 101

	# Dump cookie jar if requested
	if opts.cookiefile is not None:
		try:
			jar.save()
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to save cookie jar')

	sys.exit(retcode)
4705
def main():
	"""Console entry point: run _real_main() and turn the known
	top-level exceptions into appropriate exit statuses/messages."""
	try:
		_real_main()
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except DownloadError:
		sys.exit(1)
4715
# Allow this module to be executed directly as a script.
if __name__ == '__main__':
	main()

# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: