OpenClassRoom videos (#234)
[youtube-dl] / youtube_dl / __init__.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
# Contributors, as credited in the project history
__author__  = (
	'Ricardo Garcia Gonzalez',
	'Danny Colligan',
	'Benjamin Johnson',
	'Vasyl\' Vavrychuk',
	'Witold Baryluk',
	'Paweł Paprota',
	'Gergely Imreh',
	'Rogério Brito',
	'Philipp Hagemeister',
	'Sören Schulze',
	'Kevin Ngo',
	'Ori Avtalion',
	'shizeeg',
	)

__license__ = 'Public Domain'
__version__ = '2011.11.23'

# Latest released single-file script; presumably fetched by the self-update
# code — confirm in the update routine (not visible in this chunk)
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
24
25 import cookielib
26 import datetime
27 import gzip
28 import htmlentitydefs
29 import HTMLParser
30 import httplib
31 import locale
32 import math
33 import netrc
34 import os
35 import os.path
36 import re
37 import socket
38 import string
39 import subprocess
40 import sys
41 import time
42 import urllib
43 import urllib2
44 import warnings
45 import zlib
46
47 if os.name == 'nt':
48         import ctypes
49
50 try:
51         import email.utils
52 except ImportError: # Python 2.4
53         import email.Utils
54 try:
55         import cStringIO as StringIO
56 except ImportError:
57         import StringIO
58
59 # parse_qs was moved from the cgi module to the urlparse module recently.
60 try:
61         from urlparse import parse_qs
62 except ImportError:
63         from cgi import parse_qs
64
65 try:
66         import lxml.etree
67 except ImportError:
68         pass # Handled below
69
70 try:
71         import xml.etree.ElementTree
72 except ImportError: # Python<2.5: Not officially supported, but let it slip
73         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
74
# HTTP headers attached to every request (see YoutubeDLHandler.http_request,
# which forces these onto each outgoing request) so servers see an ordinary
# browser rather than a script.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Encoding': 'gzip, deflate',
	'Accept-Language': 'en-us,en;q=0.5',
}
82
# json entered the standard library in Python 2.6; on older interpreters
# fall back to trivialjson, a tiny pure-Python parser.
try:
	import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
	import re
	class json(object):
		"""Minimal drop-in for the stdlib json module; only loads() is provided."""
		@staticmethod
		def loads(s):
			# Recursive-descent parser over unicode string s with an integer
			# cursor i; every helper returns a (new_cursor, value) tuple.
			s = s.decode('UTF-8')
			def raiseError(msg, i):
				# Include position and remaining input to ease debugging
				raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
			def skipSpace(i, expectMore=True):
				# Advance the cursor past JSON whitespace
				while i < len(s) and s[i] in ' \t\r\n':
					i += 1
				if expectMore:
					# Whitespace must be followed by more input here
					if i >= len(s):
						raiseError('Premature end', i)
				return i
			def decodeEscape(match):
				# Translate one backslash escape (match.group(1)) into its character
				esc = match.group(1)
				_STATIC = {
					'"': '"',
					'\\': '\\',
					'/': '/',
					'b': unichr(0x8),
					'f': unichr(0xc),
					'n': '\n',
					'r': '\r',
					't': '\t',
				}
				if esc in _STATIC:
					return _STATIC[esc]
				if esc[0] == 'u':
					if len(esc) == 1+4:
						# Simple \uXXXX escape
						return unichr(int(esc[1:5], 16))
					# Surrogate pair \uXXXX\uXXXX combined into one code point
					if len(esc) == 5+6 and esc[5:7] == '\\u':
						hi = int(esc[1:5], 16)
						low = int(esc[7:11], 16)
						return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
				raise ValueError('Unknown escape ' + str(esc))
			def parseString(i):
				i += 1
				e = i
				# Find the closing quote, skipping quotes preceded by an odd
				# number of backslashes (i.e. escaped quotes)
				while True:
					e = s.index('"', e)
					bslashes = 0
					while s[e-bslashes-1] == '\\':
						bslashes += 1
					if bslashes % 2 == 1:
						e += 1
						continue
					break
				# Alternatives: surrogate pair, plain \uXXXX, single-char escape
				rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
				stri = rexp.sub(decodeEscape, s[i:e])
				return (e+1,stri)
			def parseObj(i):
				i += 1
				res = {}
				i = skipSpace(i)
				if s[i] == '}': # Empty dictionary
					return (i+1,res)
				while True:
					if s[i] != '"':
						raiseError('Expected a string object key', i)
					i,key = parseString(i)
					i = skipSpace(i)
					if i >= len(s) or s[i] != ':':
						raiseError('Expected a colon', i)
					i,val = parse(i+1)
					res[key] = val
					i = skipSpace(i)
					if s[i] == '}':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected comma or closing curly brace', i)
					i = skipSpace(i+1)
			def parseArray(i):
				res = []
				i = skipSpace(i+1)
				if s[i] == ']': # Empty array
					return (i+1,res)
				while True:
					i,val = parse(i)
					res.append(val)
					i = skipSpace(i) # Raise exception if premature end
					if s[i] == ']':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected a comma or closing bracket', i)
					i = skipSpace(i+1)
			def parseDiscrete(i):
				# The three JSON literals: true / false / null
				for k,v in {'true': True, 'false': False, 'null': None}.items():
					if s.startswith(k, i):
						return (i+len(k), v)
				raiseError('Not a boolean (or null)', i)
			def parseNumber(i):
				mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
				if mobj is None:
					raiseError('Not a number', i)
				nums = mobj.group(1)
				# A fraction or exponent makes it a float, otherwise an int
				if '.' in nums or 'e' in nums or 'E' in nums:
					return (i+len(nums), float(nums))
				return (i+len(nums), int(nums))
			# Dispatch on the first significant character; numbers are the default
			CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
			def parse(i):
				i = skipSpace(i)
				i,res = CHARMAP.get(s[i], parseNumber)(i)
				i = skipSpace(i, False)
				return (i,res)
			i,res = parse(0)
			# Trailing garbage after the top-level value is an error
			if i < len(s):
				raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
			return res
195
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	# The original wrapped this in a one-shot generator and called .next()
	# on it — needless indirection (and .next() is Python-2-only); a plain
	# computation is equivalent.
	try:
		pref = locale.getpreferredencoding()
		# Verify the codec exists and can actually encode text
		u'TEST'.encode(pref)
	except:
		pref = 'UTF-8'
	return pref
211
212
213 def htmlentity_transform(matchobj):
214         """Transforms an HTML entity to a Unicode character.
215
216         This function receives a match object and is intended to be used with
217         the re.sub() function.
218         """
219         entity = matchobj.group(1)
220
221         # Known non-numeric HTML entity
222         if entity in htmlentitydefs.name2codepoint:
223                 return unichr(htmlentitydefs.name2codepoint[entity])
224
225         # Unicode character
226         mobj = re.match(ur'(?u)#(x?\d+)', entity)
227         if mobj is not None:
228                 numstr = mobj.group(1)
229                 if numstr.startswith(u'x'):
230                         base = 16
231                         numstr = u'0%s' % numstr
232                 else:
233                         base = 10
234                 return unichr(long(numstr, base))
235
236         # Unknown entity in name, return its literal representation
237         return (u'&%s;' % entity)
238
239
240 def sanitize_title(utitle):
241         """Sanitizes a video title so it could be used as part of a filename."""
242         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
243         return utitle.replace(unicode(os.sep), u'%')
244
245
def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	function.

	It returns the tuple (stream, definitive_file_name).
	"""
	try:
		if filename == u'-':
			# '-' means "write to stdout"; on Windows, switch stdout to
			# binary mode so video data is not mangled by CRLF translation
			if sys.platform == 'win32':
				import msvcrt
				msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
			return (sys.stdout, filename)
		stream = open(filename, open_mode)
		return (stream, filename)
	except (IOError, OSError), err:
		# In case of error, try to remove win32 forbidden chars
		filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

		# An exception here should be caught in the caller
		stream = open(filename, open_mode)
		return (stream, filename)
271
272
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp"""
	# parsedate_tz returns None for unparseable input; propagate that
	parsed = email.utils.parsedate_tz(timestr)
	if parsed is None:
		return None
	return email.utils.mktime_tz(parsed)
280
281 def _simplify_title(title):
282         expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
283         return expr.sub(u'_', title).strip(u'_')
284
class DownloadError(Exception):
	"""Raised by FileDownloader objects on download problems.

	Thrown only when the downloader is not configured to continue on
	errors; carries the appropriate error message.
	"""
293
294
class SameFileError(Exception):
	"""Raised when multiple downloads would map to one file on disk.

	FileDownloader objects throw this when they detect that several
	requested files share the same output filename.
	"""
302
303
class PostProcessingError(Exception):
	"""Raised by a PostProcessor's .run() to signal a failed task."""
311
312
class UnavailableVideoError(Exception):
	"""Raised when a video is requested in a format it is not offered in."""
320
321
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	This exception may be raised by FileDownloader objects when a file they
	download is too small for what the server announced first, indicating
	the connection was probably interrupted.
	"""
	# Both in bytes: how much arrived vs. what the server announced
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.downloaded = downloaded
		self.expected = expected
336
337
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.

	Part of this code was copied from:

	http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	public domain.
	"""

	@staticmethod
	def deflate(data):
		# Servers sometimes send raw deflate streams (no zlib header);
		# try that first, then fall back to a zlib-wrapped stream
		try:
			return zlib.decompress(data, -zlib.MAX_WBITS)
		except zlib.error:
			return zlib.decompress(data)

	@staticmethod
	def addinfourl_wrapper(stream, headers, url, code):
		# Older urllib2.addinfourl has no getcode()/code support; on those
		# versions construct it without the code and set the attribute by hand
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)
		ret.code = code
		return ret

	def http_request(self, req):
		# Force our standard headers, replacing any header of the same name
		# already present on the request
		for h in std_headers:
			if h in req.headers:
				del req.headers[h]
			req.add_header(h, std_headers[h])
		# The marker header opts a request out of compression; strip it
		# (and the Accept-encoding it suppresses) before sending
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']
		return req

	def http_response(self, req, resp):
		old_resp = resp
		# gzip: re-wrap the body in a decompressing file object
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# deflate: decompress eagerly, then re-wrap
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		return resp
395
396
397 class FileDownloader(object):
398         """File Downloader class.
399
400         File downloader objects are the ones responsible of downloading the
401         actual video file and writing it to disk if the user has requested
402         it, among some other tasks. In most cases there should be one per
403         program. As, given a video URL, the downloader doesn't know how to
404         extract all the needed information, task that InfoExtractors do, it
405         has to pass the URL to one of them.
406
407         For this, file downloader objects have a method that allows
408         InfoExtractors to be registered in a given order. When it is passed
409         a URL, the file downloader handles it to the first InfoExtractor it
410         finds that reports being able to handle it. The InfoExtractor extracts
411         all the information about the video or videos the URL refers to, and
412         asks the FileDownloader to process the video information, possibly
413         downloading the video.
414
415         File downloaders accept a lot of parameters. In order not to saturate
416         the object constructor with arguments, it receives a dictionary of
417         options instead. These options are available through the params
418         attribute for the InfoExtractors to use. The FileDownloader also
419         registers itself as the downloader in charge for the InfoExtractors
420         that are added to it, so this is a "mutual registration".
421
422         Available options:
423
424         username:         Username for authentication purposes.
425         password:         Password for authentication purposes.
426         usenetrc:         Use netrc for authentication instead.
427         quiet:            Do not print messages to stdout.
428         forceurl:         Force printing final URL.
429         forcetitle:       Force printing title.
430         forcethumbnail:   Force printing thumbnail URL.
431         forcedescription: Force printing description.
432         forcefilename:    Force printing final filename.
433         simulate:         Do not download the video files.
434         format:           Video format code.
435         format_limit:     Highest quality format to try.
436         outtmpl:          Template for output names.
437         ignoreerrors:     Do not stop on download errors.
438         ratelimit:        Download speed limit, in bytes/sec.
439         nooverwrites:     Prevent overwriting files.
440         retries:          Number of times to retry for HTTP error 5xx
441         continuedl:       Try to continue downloads if possible.
442         noprogress:       Do not print the progress bar.
443         playliststart:    Playlist item to start at.
444         playlistend:      Playlist item to end at.
445         matchtitle:       Download only matching titles.
446         rejecttitle:      Reject downloads for matching titles.
447         logtostderr:      Log messages to stderr instead of stdout.
448         consoletitle:     Display progress in console window's titlebar.
449         nopart:           Do not use temporary .part files.
450         updatetime:       Use the Last-modified header to set output file timestamps.
451         writedescription: Write the video description to a .description file
452         writeinfojson:    Write the video description to a .info.json file
453         """
454
	params = None  # Option dictionary passed to __init__ (see class docstring)
	_ies = []  # Registered InfoExtractor objects, in order
	_pps = []  # Registered PostProcessor objects, in order
	_download_retcode = None  # Exit code; trouble() sets it to 1 on errors
	_num_downloads = None  # Ordinal of the current download (%(autonumber)s)
	_screen_file = None  # Where to_screen() writes (stdout or stderr)
461
462         def __init__(self, params):
463                 """Create a FileDownloader object with the given options."""
464                 self._ies = []
465                 self._pps = []
466                 self._download_retcode = 0
467                 self._num_downloads = 0
468                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
469                 self.params = params
470
471         @staticmethod
472         def format_bytes(bytes):
473                 if bytes is None:
474                         return 'N/A'
475                 if type(bytes) is str:
476                         bytes = float(bytes)
477                 if bytes == 0.0:
478                         exponent = 0
479                 else:
480                         exponent = long(math.log(bytes, 1024.0))
481                 suffix = 'bkMGTPEZY'[exponent]
482                 converted = float(bytes) / float(1024 ** exponent)
483                 return '%.2f%s' % (converted, suffix)
484
485         @staticmethod
486         def calc_percent(byte_counter, data_len):
487                 if data_len is None:
488                         return '---.-%'
489                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
490
491         @staticmethod
492         def calc_eta(start, now, total, current):
493                 if total is None:
494                         return '--:--'
495                 dif = now - start
496                 if current == 0 or dif < 0.001: # One millisecond
497                         return '--:--'
498                 rate = float(current) / dif
499                 eta = long((float(total) - float(current)) / rate)
500                 (eta_mins, eta_secs) = divmod(eta, 60)
501                 if eta_mins > 99:
502                         return '--:--'
503                 return '%02d:%02d' % (eta_mins, eta_secs)
504
505         @staticmethod
506         def calc_speed(start, now, bytes):
507                 dif = now - start
508                 if bytes == 0 or dif < 0.001: # One millisecond
509                         return '%10s' % '---b/s'
510                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
511
512         @staticmethod
513         def best_block_size(elapsed_time, bytes):
514                 new_min = max(bytes / 2.0, 1.0)
515                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
516                 if elapsed_time < 0.001:
517                         return long(new_max)
518                 rate = bytes / elapsed_time
519                 if rate > new_max:
520                         return long(new_max)
521                 if rate < new_min:
522                         return long(new_min)
523                 return long(rate)
524
525         @staticmethod
526         def parse_bytes(bytestr):
527                 """Parse a string indicating a byte quantity into a long integer."""
528                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
529                 if matchobj is None:
530                         return None
531                 number = float(matchobj.group(1))
532                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
533                 return long(round(number * multiplier))
534
	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list.

		Also registers this downloader on the extractor — the "mutual
		registration" described in the class docstring.
		"""
		self._ies.append(ie)
		ie.set_downloader(self)
539
	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain.

		The processor is given a back-reference to this downloader.
		"""
		self._pps.append(pp)
		pp.set_downloader(self)
544
	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode.

		skip_eol suppresses the trailing newline (used for progress lines);
		ignore_encoding_errors swallows UnicodeEncodeError instead of
		re-raising it.
		"""
		try:
			if not self.params.get('quiet', False):
				terminator = [u'\n', u''][skip_eol]
				print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
			# NOTE: the flush happens even in quiet mode
			self._screen_file.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise
555
	def to_stderr(self, message):
		"""Print message to stderr (always, regardless of 'quiet')."""
		print >>sys.stderr, message.encode(preferredencoding())
559
	def to_cons_title(self, message):
		"""Set console/terminal window title to message.

		No-op unless the 'consoletitle' option is set.
		"""
		if not self.params.get('consoletitle', False):
			return
		if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
			# c_wchar_p() might not be necessary if `message` is
			# already of type unicode()
			ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
		elif 'TERM' in os.environ:
			# xterm-style OSC escape: ESC ] 0 ; <title> BEL
			sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
570
	def fixed_template(self):
		"""Checks if the output template is fixed.

		"Fixed" means it contains no %(field)s placeholders, i.e. every
		download would be written to the very same filename.
		"""
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
574
	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		"""
		if message is not None:
			self.to_stderr(message)
		if not self.params.get('ignoreerrors', False):
			raise DownloadError(message)
		# Only reached when errors are ignored: remember the failure so the
		# process can still exit non-zero
		self._download_retcode = 1
587
588         def slow_down(self, start_time, byte_counter):
589                 """Sleep if the download speed is over the rate limit."""
590                 rate_limit = self.params.get('ratelimit', None)
591                 if rate_limit is None or byte_counter == 0:
592                         return
593                 now = time.time()
594                 elapsed = now - start_time
595                 if elapsed <= 0.0:
596                         return
597                 speed = float(byte_counter) / elapsed
598                 if speed > rate_limit:
599                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
600
601         def temp_name(self, filename):
602                 """Returns a temporary filename for the given filename."""
603                 if self.params.get('nopart', False) or filename == u'-' or \
604                                 (os.path.exists(filename) and not os.path.isfile(filename)):
605                         return filename
606                 return filename + u'.part'
607
608         def undo_temp_name(self, filename):
609                 if filename.endswith(u'.part'):
610                         return filename[:-len(u'.part')]
611                 return filename
612
613         def try_rename(self, old_filename, new_filename):
614                 try:
615                         if old_filename == new_filename:
616                                 return
617                         os.rename(old_filename, new_filename)
618                 except (IOError, OSError), err:
619                         self.trouble(u'ERROR: unable to rename file')
620
621         def try_utime(self, filename, last_modified_hdr):
622                 """Try to set the last-modified time of the given file."""
623                 if last_modified_hdr is None:
624                         return
625                 if not os.path.isfile(filename):
626                         return
627                 timestr = last_modified_hdr
628                 if timestr is None:
629                         return
630                 filetime = timeconvert(timestr)
631                 if filetime is None:
632                         return filetime
633                 try:
634                         os.utime(filename, (time.time(), filetime))
635                 except:
636                         pass
637                 return filetime
638
	def report_writedescription(self, descfn):
		""" Report that the description file is being written """
		self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
642
	def report_writeinfojson(self, infofn):
		""" Report that the .info.json metadata file is being written """
		self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
646
	def report_destination(self, filename):
		"""Report destination filename.

		Encoding errors are swallowed so an odd filename cannot abort
		the download just for the sake of a status line.
		"""
		self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
650
651         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
652                 """Report download progress."""
653                 if self.params.get('noprogress', False):
654                         return
655                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
656                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
657                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
658                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
659
	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
663
	def report_retry(self, count, retries):
		"""Report retry in case of HTTP error 5xx"""
		self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
667
	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_screen(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# Fall back to a filename-free message if the name is unprintable
			self.to_screen(u'[download] The file has already been downloaded')
674
	def report_unable_to_resume(self):
		"""Report it was impossible to resume download."""
		self.to_screen(u'[download] Unable to resume')
678
679         def report_finish(self):
680                 """Report download finished."""
681                 if self.params.get('noprogress', False):
682                         self.to_screen(u'[download] Download completed')
683                 else:
684                         self.to_screen(u'')
685
686         def increment_downloads(self):
687                 """Increment the ordinal that assigns a number to each file."""
688                 self._num_downloads += 1
689
690         def prepare_filename(self, info_dict):
691                 """Generate the output filename."""
692                 try:
693                         template_dict = dict(info_dict)
694                         template_dict['epoch'] = unicode(long(time.time()))
695                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
696                         filename = self.params['outtmpl'] % template_dict
697                         return filename
698                 except (ValueError, KeyError), err:
699                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
700                         return None
701
702         def _match_entry(self, info_dict):
703                 """ Returns None iff the file should be downloaded """
704
705                 title = info_dict['title']
706                 matchtitle = self.params.get('matchtitle', False)
707                 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
708                         return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
709                 rejecttitle = self.params.get('rejecttitle', False)
710                 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
711                         return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
712                 return None
713
714         def process_dict(self, info_dict):
715                 """ Download and handle the extracted information.
716                 For details on the specification of the various types of content, refer to the _process_* functions. """
717                 if info_dict['type'] == 'playlist':
718                         self._process_playlist(info_dict)
719                 elif info_dict['type'] == 'legacy-video':
720                         self.process_info(info_dict)
721                 else:
722                         raise ValueError('Invalid item type')
723
724         def _process_playlist(self, info_dict):
725                 assert info_dict['type'] == 'playlist'
726                 assert 'title' in info_dict
727                 assert 'stitle' in info_dict
728                 entries = info_dict['list']
729
730                 for e in entries:
731                         self.process_dict(e)
732
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor.

		Applies the title match/reject filters and the max-downloads cap,
		honours the forced-print and simulate options, optionally writes
		the .description / .info.json side files, downloads the video
		data and runs the postprocessing chain."""

		# Title filters: a non-None reason means "skip this entry".
		reason = self._match_entry(info_dict)
		if reason is not None:
			self.to_screen(u'[download] ' + reason)
			return

		# NOTE(review): this compares the counter as-is; presumably
		# increment_downloads() has already been called for this entry
		# by the extractor — TODO confirm there is no off-by-one here.
		max_downloads = self.params.get('max_downloads')
		if max_downloads is not None:
			if self._num_downloads > int(max_downloads):
				self.to_screen(u'[download] Maximum number of downloads reached. Skipping ' + info_dict['title'])
				return

		filename = self.prepare_filename(info_dict)

		# Forced printings: raw fields on stdout for scripting consumers.
		if self.params.get('forcetitle', False):
			print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceurl', False):
			print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
			print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcedescription', False) and 'description' in info_dict:
			print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcefilename', False) and filename is not None:
			print filename.encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceformat', False):
			print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			return

		# prepare_filename() returns None on template errors.
		if filename is None:
			return

		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			return

		# Create the destination directory on demand.
		try:
			dn = os.path.dirname(filename)
			if dn != '' and not os.path.exists(dn):
				os.makedirs(dn)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directory ' + unicode(err))
			return

		# Optional sidecar file with the plain-text description.
		if self.params.get('writedescription', False):
			try:
				descfn = filename + '.description'
				self.report_writedescription(descfn)
				descfile = open(descfn, 'wb')
				try:
					descfile.write(info_dict['description'].encode('utf-8'))
				finally:
					descfile.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write description file ' + descfn)
				return

		# Optional sidecar file with the full metadata as JSON.
		if self.params.get('writeinfojson', False):
			infofn = filename + '.info.json'
			self.report_writeinfojson(infofn)
			# Probe for a usable json module: 'json' may be undefined
			# (Python < 2.6 without a backport) or lack dump().
			try:
				json.dump
			except (NameError,AttributeError):
				self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
				return
			try:
				infof = open(infofn, 'wb')
				try:
					# Drop 'urlhandle' (an open connection object used
					# by _do_download) — it is not JSON-serializable.
					json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
					json.dump(json_info_dict, infof)
				finally:
					infof.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
				return

		if not self.params.get('skip_download', False):
			try:
				success = self._do_download(filename, info_dict)
			except (OSError, IOError), err:
				raise UnavailableVideoError
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self.trouble(u'ERROR: unable to download video data: %s' % str(err))
				return
			except (ContentTooShortError, ), err:
				self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
				return

			# Run postprocessors only after a successful download.
			if success:
				try:
					self.post_process(filename, info_dict)
				except (PostProcessingError), err:
					self.trouble(u'ERROR: postprocessing: %s' % str(err))
					return
832
833         def download(self, url_list):
834                 """Download a given list of URLs."""
835                 if len(url_list) > 1 and self.fixed_template():
836                         raise SameFileError(self.params['outtmpl'])
837
838                 for url in url_list:
839                         suitable_found = False
840                         for ie in self._ies:
841                                 # Go to next InfoExtractor if not suitable
842                                 if not ie.suitable(url):
843                                         continue
844
845                                 # Suitable InfoExtractor found
846                                 suitable_found = True
847
848                                 # Extract information from URL and process it
849                                 ie.extract(url)
850
851                                 # Suitable InfoExtractor had been found; go to next URL
852                                 break
853
854                         if not suitable_found:
855                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
856
857                 return self._download_retcode
858
859         def post_process(self, filename, ie_info):
860                 """Run the postprocessing chain on the given file."""
861                 info = dict(ie_info)
862                 info['filepath'] = filename
863                 for pp in self._pps:
864                         info = pp.run(info)
865                         if info is None:
866                                 break
867
	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an RTMP stream by shelling out to the external
		'rtmpdump' program; returns True on success, False otherwise."""
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check that rtmpdump is available by probing its help output.
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrumpted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		# The [[], opts][flag] idiom selects opts only when flag is true.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(tmpfilename)
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			# Resume (-e); '-k 1' is added when the previous run exited 1.
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(tmpfilename)
			# No progress and exit code 1: give up and report failure below.
			if prevsize == cursize and retval == 1:
				break
			 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
			if prevsize == cursize and retval == 2 and cursize > 1024:
				self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
				retval = 0
				break
		if retval == 0:
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
			self.try_rename(tmpfilename, filename)
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False
904
	def _do_download(self, filename, info_dict):
		"""Download info_dict['url'] to filename, handling resume, HTTP
		retries, rate limiting and progress reporting; RTMP URLs are
		delegated to rtmpdump. Returns True on success, False on failure."""
		url = info_dict['url']
		player_url = info_dict.get('player_url', None)

		# Check file already present
		if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
			self.report_file_already_downloaded(filename)
			return True

		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		tmpfilename = self.temp_name(filename)
		stream = None

		# Do not include the Accept-Encoding header
		headers = {'Youtubedl-no-compression': 'True'}
		# basic_request stays without the Range header so it can be used
		# to re-probe the full content length after a 416 response.
		basic_request = urllib2.Request(url, None, headers)
		request = urllib2.Request(url, None, headers)

		# Establish possible resume length
		if os.path.isfile(tmpfilename):
			resume_len = os.path.getsize(tmpfilename)
		else:
			resume_len = 0

		open_mode = 'wb'
		if resume_len != 0:
			if self.params.get('continuedl', False):
				self.report_resuming_byte(resume_len)
				request.add_header('Range','bytes=%d-' % resume_len)
				open_mode = 'ab'
			else:
				resume_len = 0

		count = 0
		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			try:
				if count == 0 and 'urlhandle' in info_dict:
					# NOTE(review): this cached handle is immediately
					# overwritten by the urlopen() call below, so it is
					# never actually used — looks like a bug.
					data = info_dict['urlhandle']
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
					raise
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					try:
						# Open the connection again without the range header
						data = urllib2.urlopen(basic_request)
						content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
							raise
					else:
						# Examine the reported length
						if (content_length is not None and
								(resume_len - 100 < long(content_length) < resume_len + 100)):
							# The file had already been fully downloaded.
							# Explanation to the above condition: in issue #175 it was revealed that
							# YouTube sometimes adds or removes a few bytes from the end of the file,
							# changing the file size slightly and causing problems for some users. So
							# I decided to implement a suggested change and consider the file
							# completely downloaded if the file size differs less than 100 bytes from
							# the one in the hard drive.
							self.report_file_already_downloaded(filename)
							self.try_rename(tmpfilename, filename)
							return True
						else:
							# The length does not match, we start the download over
							self.report_unable_to_resume()
							open_mode = 'wb'
							break
			# Retry
			count += 1
			if count <= retries:
				self.report_retry(count, retries)

		if count > retries:
			self.trouble(u'ERROR: giving up after %s retries' % retries)
			return False

		data_len = data.info().get('Content-length', None)
		if data_len is not None:
			# Server reports the remaining length; add what we already have.
			data_len = long(data_len) + resume_len
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0 + resume_len
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			if len(data_block) == 0:
				break
			byte_counter += len(data_block)

			# Open file just in time
			if stream is None:
				try:
					(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
					assert stream is not None
					filename = self.undo_temp_name(tmpfilename)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
				return False
			# Adapt the block size to the observed transfer speed.
			block_size = self.best_block_size(after - before, len(data_block))

			# Progress message
			speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
			if data_len is None:
				self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
			else:
				percent_str = self.calc_percent(byte_counter, data_len)
				eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
				self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter - resume_len)

		if stream is None:
			self.trouble(u'\nERROR: Did not get any data blocks')
			return False
		stream.close()
		self.report_finish()
		if data_len is not None and byte_counter != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		self.try_rename(tmpfilename, filename)

		# Update file modification time
		if self.params.get('updatetime', True):
			info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))

		return True
1050
1051
class InfoExtractor(object):
	"""Information Extractor class.

	An information extractor takes a URL and pulls out everything needed
	to download the video(s) it points to: the real video URL, the title,
	a simplified title, the uploader and so on. The result is a dictionary
	handed to the FileDownloader, which may then download the video to the
	file system, among other possible outcomes. Each dictionary must
	include the following fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	The following fields are optional. Their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3.  They are only used when their respective
	forced printing functions are called:

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should override _real_initialize() and _real_extract()
	and define a _VALID_URL regexp. They should usually also be added
	to the list of extractors.
	"""

	# Whether _real_initialize() has already run for this instance.
	_ready = False
	# The FileDownloader this extractor reports to (may be None).
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Receives a URL and returns True if suitable for this IE."""
		match = re.match(self._VALID_URL, url)
		return match is not None

	def initialize(self):
		"""Initializes an instance (authentication, etc) on first use."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1120
1121
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Matches watch pages, youtu.be short links, /v/, /embed/ and /e/
	# URLs on youtube.com and youtube-nocookie.com (playlist and artist
	# pages are excluded); _real_extract reads the video id from group 2.
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
	# URL that switches the interface language to English (hl=en).
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name used to look up credentials in ~/.netrc.
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
	# Format code -> container/extension.
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'44': 'webm',
		'45': 'webm',
	}
	# Format code -> video dimensions (used by _print_formats).
	_video_dimensions = {
		'5': '240x400',
		'6': '???',
		'13': '???',
		'17': '144x176',
		'18': '360x640',
		'22': '720x1280',
		'34': '360x640',
		'35': '480x854',
		'37': '1080x1920',
		'38': '3072x4096',
		'43': '360x640',
		'44': '480x854',
		'45': '720x1280',
	}
	IE_NAME = u'youtube'
1159
1160         def report_lang(self):
1161                 """Report attempt to set language."""
1162                 self._downloader.to_screen(u'[youtube] Setting language')
1163
1164         def report_login(self):
1165                 """Report attempt to log in."""
1166                 self._downloader.to_screen(u'[youtube] Logging in')
1167
1168         def report_age_confirmation(self):
1169                 """Report attempt to confirm age."""
1170                 self._downloader.to_screen(u'[youtube] Confirming age')
1171
1172         def report_video_webpage_download(self, video_id):
1173                 """Report attempt to download video webpage."""
1174                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1175
1176         def report_video_info_webpage_download(self, video_id):
1177                 """Report attempt to download video info webpage."""
1178                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1179
1180         def report_information_extraction(self, video_id):
1181                 """Report attempt to extract video information."""
1182                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1183
1184         def report_unavailable_format(self, video_id, format):
1185                 """Report extracted video URL."""
1186                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1187
1188         def report_rtmp_download(self):
1189                 """Indicate the download will use the RTMP protocol."""
1190                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1191
1192         def _print_formats(self, formats):
1193                 print 'Available formats:'
1194                 for x in formats:
1195                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1196
	def _real_initialize(self):
		"""Set the site language to English and, when credentials are
		available (via options or ~/.netrc), log in and confirm age.
		All failures are reported through the downloader and swallowed."""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':         '/',
				'action_login': 'Log In',
				'username':     username,
				'password':     password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the response still contains the login form, the
			# credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':             '/',
				'action_confirm':       'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			# The response body is not inspected; only the POST matters.
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
1265
	def _real_extract(self, url):
		"""Extract metadata for a single YouTube video and queue the download(s).

		Pipeline: match the URL, fetch the watch page, query the
		get_video_info endpoint (trying several '&el=' page variants until
		one yields a 'token'), pick the format(s) requested via the
		'format' downloader param, and call process_info once per chosen
		format.
		"""
		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		# group(2) is assumed to capture the video id; _VALID_URL is defined
		# earlier in the class, outside this view -- TODO confirm.
		video_id = mobj.group(2)

		# Get video webpage
		self.report_video_webpage_download(video_id)
		# gl/hl pin region and language; has_verified=1 requests the page
		# variant shown to age-verified users.
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
		try:
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Attempt to extract SWF player URL; it is optional and passed
		# through to the downloader as 'player_url'.
		mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			# The URL appears JS-escaped in the page; strip the backslash escapes.
			player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
		else:
			player_url = None

		# Get video info: try several 'el' page variants in turn, stopping at
		# the first response that carries a 'token' parameter.
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					% (video_id, el_type))
			request = urllib2.Request(video_info_url)
			try:
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
					break
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
		if 'token' not in video_info:
			# Prefer YouTube's own failure reason when it supplies one.
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
			else:
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# simplified title
		simple_title = _simplify_title(video_title)

		# thumbnail image: optional, so only warn on absence
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# upload date: scraped from the watch page, normalized to YYYYMMDD
		upload_date = u'NA'
		mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
		if mobj is not None:
			# Collapse separators ('/', ',', '-') and whitespace to single spaces.
			upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
			format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
			# NOTE(review): there is no break after a successful parse; the
			# first matching expression effectively wins, because subsequent
			# strptime attempts on the already-reformatted value raise and
			# are swallowed by the bare except.
			for expression in format_expressions:
				try:
					upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
				except:
					pass

		# description: lxml is optional (its import at the top of the file is
		# guarded), hence the NameError probe; without it, fall back to the
		# <meta name="description"> tag, and only when the description is
		# actually wanted.
		try:
			lxml.etree
		except NameError:
			video_description = u'No description available.'
			if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
				mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
				if mobj is not None:
					video_description = mobj.group(1).decode('utf-8')
		else:
			html_parser = lxml.etree.HTMLParser(encoding='utf-8')
			vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
			video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
			# TODO use another parser

		# token
		# NOTE(review): video_token is extracted but never used below.
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		req_format = self._downloader.params.get('format', None)

		if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			# RTMP stream: a single (format, url) pair with no itag.
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]
		elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
			# Build itag -> url map from the comma-separated stream map.
			url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
			url_data = [parse_qs(uds) for uds in url_data_strs]
			url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
			url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

			# 'format_limit' caps quality: keep only formats at or below it.
			# _available_formats is assumed ordered best-first -- defined
			# elsewhere in the class, outside this view.
			format_limit = self._downloader.params.get('format_limit', None)
			if format_limit is not None and format_limit in self._available_formats:
				format_list = self._available_formats[self._available_formats.index(format_limit):]
			else:
				format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if self._downloader.params.get('listformats', None):
				self._print_formats(existing_formats)
				return
			if req_format is None or req_format == 'best':
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == 'worst':
				video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
			elif req_format in ('-1', 'all'):
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			else:
				# Specific formats. We pick the first in a slash-delimeted sequence.
				# For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
				req_formats = req_format.split('/')
				video_url_list = None
				for rf in req_formats:
					if rf in url_map:
						video_url_list = [(rf, url_map[rf])]
						break
				if video_url_list is None:
					self._downloader.trouble(u'ERROR: requested format not available')
					return
		else:
			self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
			return

		for format_param, video_real_url in video_url_list:
			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			try:
				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'upload_date':	upload_date,
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
					# Old-style conditional: u'NA' for the RTMP case (itag is None).
					'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail':	video_thumbnail.decode('utf-8'),
					'description':	video_description,
					'player_url':	player_url,
				})
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
1438
1439
1440 class MetacafeIE(InfoExtractor):
1441         """Information Extractor for metacafe.com."""
1442
1443         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1444         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1445         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1446         _youtube_ie = None
1447         IE_NAME = u'metacafe'
1448
1449         def __init__(self, youtube_ie, downloader=None):
1450                 InfoExtractor.__init__(self, downloader)
1451                 self._youtube_ie = youtube_ie
1452
1453         def report_disclaimer(self):
1454                 """Report disclaimer retrieval."""
1455                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1456
1457         def report_age_confirmation(self):
1458                 """Report attempt to confirm age."""
1459                 self._downloader.to_screen(u'[metacafe] Confirming age')
1460
1461         def report_download_webpage(self, video_id):
1462                 """Report webpage download."""
1463                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1464
1465         def report_extraction(self, video_id):
1466                 """Report information extraction."""
1467                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1468
1469         def _real_initialize(self):
1470                 # Retrieve disclaimer
1471                 request = urllib2.Request(self._DISCLAIMER)
1472                 try:
1473                         self.report_disclaimer()
1474                         disclaimer = urllib2.urlopen(request).read()
1475                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1476                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1477                         return
1478
1479                 # Confirm age
1480                 disclaimer_form = {
1481                         'filters': '0',
1482                         'submit': "Continue - I'm over 18",
1483                         }
1484                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1485                 try:
1486                         self.report_age_confirmation()
1487                         disclaimer = urllib2.urlopen(request).read()
1488                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1489                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1490                         return
1491
1492         def _real_extract(self, url):
1493                 # Extract id and simplified title from URL
1494                 mobj = re.match(self._VALID_URL, url)
1495                 if mobj is None:
1496                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1497                         return
1498
1499                 video_id = mobj.group(1)
1500
1501                 # Check if video comes from YouTube
1502                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1503                 if mobj2 is not None:
1504                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1505                         return
1506
1507                 # At this point we have a new video
1508                 self._downloader.increment_downloads()
1509
1510                 simple_title = mobj.group(2).decode('utf-8')
1511
1512                 # Retrieve video webpage to extract further information
1513                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1514                 try:
1515                         self.report_download_webpage(video_id)
1516                         webpage = urllib2.urlopen(request).read()
1517                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1518                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1519                         return
1520
1521                 # Extract URL, uploader and title from webpage
1522                 self.report_extraction(video_id)
1523                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1524                 if mobj is not None:
1525                         mediaURL = urllib.unquote(mobj.group(1))
1526                         video_extension = mediaURL[-3:]
1527
1528                         # Extract gdaKey if available
1529                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1530                         if mobj is None:
1531                                 video_url = mediaURL
1532                         else:
1533                                 gdaKey = mobj.group(1)
1534                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1535                 else:
1536                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1537                         if mobj is None:
1538                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1539                                 return
1540                         vardict = parse_qs(mobj.group(1))
1541                         if 'mediaData' not in vardict:
1542                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1543                                 return
1544                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1545                         if mobj is None:
1546                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1547                                 return
1548                         mediaURL = mobj.group(1).replace('\\/', '/')
1549                         video_extension = mediaURL[-3:]
1550                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1551
1552                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1553                 if mobj is None:
1554                         self._downloader.trouble(u'ERROR: unable to extract title')
1555                         return
1556                 video_title = mobj.group(1).decode('utf-8')
1557                 video_title = sanitize_title(video_title)
1558
1559                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1560                 if mobj is None:
1561                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1562                         return
1563                 video_uploader = mobj.group(1)
1564
1565                 try:
1566                         # Process video information
1567                         self._downloader.process_info({
1568                                 'id':           video_id.decode('utf-8'),
1569                                 'url':          video_url.decode('utf-8'),
1570                                 'uploader':     video_uploader.decode('utf-8'),
1571                                 'upload_date':  u'NA',
1572                                 'title':        video_title,
1573                                 'stitle':       simple_title,
1574                                 'ext':          video_extension.decode('utf-8'),
1575                                 'format':       u'NA',
1576                                 'player_url':   None,
1577                         })
1578                 except UnavailableVideoError:
1579                         self._downloader.trouble(u'\nERROR: unable to download video')
1580
1581
1582 class DailymotionIE(InfoExtractor):
1583         """Information Extractor for Dailymotion"""
1584
1585         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1586         IE_NAME = u'dailymotion'
1587
1588         def __init__(self, downloader=None):
1589                 InfoExtractor.__init__(self, downloader)
1590
1591         def report_download_webpage(self, video_id):
1592                 """Report webpage download."""
1593                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1594
1595         def report_extraction(self, video_id):
1596                 """Report information extraction."""
1597                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1598
1599         def _real_extract(self, url):
1600                 # Extract id and simplified title from URL
1601                 mobj = re.match(self._VALID_URL, url)
1602                 if mobj is None:
1603                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1604                         return
1605
1606                 # At this point we have a new video
1607                 self._downloader.increment_downloads()
1608                 video_id = mobj.group(1)
1609
1610                 simple_title = mobj.group(2).decode('utf-8')
1611                 video_extension = 'flv'
1612
1613                 # Retrieve video webpage to extract further information
1614                 request = urllib2.Request(url)
1615                 request.add_header('Cookie', 'family_filter=off')
1616                 try:
1617                         self.report_download_webpage(video_id)
1618                         webpage = urllib2.urlopen(request).read()
1619                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1620                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1621                         return
1622
1623                 # Extract URL, uploader and title from webpage
1624                 self.report_extraction(video_id)
1625                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1626                 if mobj is None:
1627                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1628                         return
1629                 sequence = urllib.unquote(mobj.group(1))
1630                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1631                 if mobj is None:
1632                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1633                         return
1634                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1635
1636                 # if needed add http://www.dailymotion.com/ if relative URL
1637
1638                 video_url = mediaURL
1639
1640                 mobj = re.search(r'(?im)<title>\s*(.+)\s*-\s*Video\s+Dailymotion</title>', webpage)
1641                 if mobj is None:
1642                         self._downloader.trouble(u'ERROR: unable to extract title')
1643                         return
1644                 video_title = mobj.group(1).decode('utf-8')
1645                 video_title = sanitize_title(video_title)
1646
1647                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1648                 if mobj is None:
1649                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1650                         return
1651                 video_uploader = mobj.group(1)
1652
1653                 try:
1654                         # Process video information
1655                         self._downloader.process_info({
1656                                 'id':           video_id.decode('utf-8'),
1657                                 'url':          video_url.decode('utf-8'),
1658                                 'uploader':     video_uploader.decode('utf-8'),
1659                                 'upload_date':  u'NA',
1660                                 'title':        video_title,
1661                                 'stitle':       simple_title,
1662                                 'ext':          video_extension.decode('utf-8'),
1663                                 'format':       u'NA',
1664                                 'player_url':   None,
1665                         })
1666                 except UnavailableVideoError:
1667                         self._downloader.trouble(u'\nERROR: unable to download video')
1668
1669
1670 class GoogleIE(InfoExtractor):
1671         """Information extractor for video.google.com."""
1672
1673         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1674         IE_NAME = u'video.google'
1675
1676         def __init__(self, downloader=None):
1677                 InfoExtractor.__init__(self, downloader)
1678
1679         def report_download_webpage(self, video_id):
1680                 """Report webpage download."""
1681                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1682
1683         def report_extraction(self, video_id):
1684                 """Report information extraction."""
1685                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1686
1687         def _real_extract(self, url):
1688                 # Extract id from URL
1689                 mobj = re.match(self._VALID_URL, url)
1690                 if mobj is None:
1691                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1692                         return
1693
1694                 # At this point we have a new video
1695                 self._downloader.increment_downloads()
1696                 video_id = mobj.group(1)
1697
1698                 video_extension = 'mp4'
1699
1700                 # Retrieve video webpage to extract further information
1701                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1702                 try:
1703                         self.report_download_webpage(video_id)
1704                         webpage = urllib2.urlopen(request).read()
1705                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1706                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1707                         return
1708
1709                 # Extract URL, uploader, and title from webpage
1710                 self.report_extraction(video_id)
1711                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1712                 if mobj is None:
1713                         video_extension = 'flv'
1714                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1715                 if mobj is None:
1716                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1717                         return
1718                 mediaURL = urllib.unquote(mobj.group(1))
1719                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1720                 mediaURL = mediaURL.replace('\\x26', '\x26')
1721
1722                 video_url = mediaURL
1723
1724                 mobj = re.search(r'<title>(.*)</title>', webpage)
1725                 if mobj is None:
1726                         self._downloader.trouble(u'ERROR: unable to extract title')
1727                         return
1728                 video_title = mobj.group(1).decode('utf-8')
1729                 video_title = sanitize_title(video_title)
1730                 simple_title = _simplify_title(video_title)
1731
1732                 # Extract video description
1733                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1734                 if mobj is None:
1735                         self._downloader.trouble(u'ERROR: unable to extract video description')
1736                         return
1737                 video_description = mobj.group(1).decode('utf-8')
1738                 if not video_description:
1739                         video_description = 'No description available.'
1740
1741                 # Extract video thumbnail
1742                 if self._downloader.params.get('forcethumbnail', False):
1743                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1744                         try:
1745                                 webpage = urllib2.urlopen(request).read()
1746                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1747                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1748                                 return
1749                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1750                         if mobj is None:
1751                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1752                                 return
1753                         video_thumbnail = mobj.group(1)
1754                 else:   # we need something to pass to process_info
1755                         video_thumbnail = ''
1756
1757                 try:
1758                         # Process video information
1759                         self._downloader.process_info({
1760                                 'id':           video_id.decode('utf-8'),
1761                                 'url':          video_url.decode('utf-8'),
1762                                 'uploader':     u'NA',
1763                                 'upload_date':  u'NA',
1764                                 'title':        video_title,
1765                                 'stitle':       simple_title,
1766                                 'ext':          video_extension.decode('utf-8'),
1767                                 'format':       u'NA',
1768                                 'player_url':   None,
1769                         })
1770                 except UnavailableVideoError:
1771                         self._downloader.trouble(u'\nERROR: unable to download video')
1772
1773
1774 class PhotobucketIE(InfoExtractor):
1775         """Information extractor for photobucket.com."""
1776
1777         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1778         IE_NAME = u'photobucket'
1779
1780         def __init__(self, downloader=None):
1781                 InfoExtractor.__init__(self, downloader)
1782
1783         def report_download_webpage(self, video_id):
1784                 """Report webpage download."""
1785                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1786
1787         def report_extraction(self, video_id):
1788                 """Report information extraction."""
1789                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1790
1791         def _real_extract(self, url):
1792                 # Extract id from URL
1793                 mobj = re.match(self._VALID_URL, url)
1794                 if mobj is None:
1795                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1796                         return
1797
1798                 # At this point we have a new video
1799                 self._downloader.increment_downloads()
1800                 video_id = mobj.group(1)
1801
1802                 video_extension = 'flv'
1803
1804                 # Retrieve video webpage to extract further information
1805                 request = urllib2.Request(url)
1806                 try:
1807                         self.report_download_webpage(video_id)
1808                         webpage = urllib2.urlopen(request).read()
1809                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1810                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1811                         return
1812
1813                 # Extract URL, uploader, and title from webpage
1814                 self.report_extraction(video_id)
1815                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1816                 if mobj is None:
1817                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1818                         return
1819                 mediaURL = urllib.unquote(mobj.group(1))
1820
1821                 video_url = mediaURL
1822
1823                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1824                 if mobj is None:
1825                         self._downloader.trouble(u'ERROR: unable to extract title')
1826                         return
1827                 video_title = mobj.group(1).decode('utf-8')
1828                 video_title = sanitize_title(video_title)
1829                 simple_title = _simplify_title(vide_title)
1830
1831                 video_uploader = mobj.group(2).decode('utf-8')
1832
1833                 try:
1834                         # Process video information
1835                         self._downloader.process_info({
1836                                 'id':           video_id.decode('utf-8'),
1837                                 'url':          video_url.decode('utf-8'),
1838                                 'uploader':     video_uploader,
1839                                 'upload_date':  u'NA',
1840                                 'title':        video_title,
1841                                 'stitle':       simple_title,
1842                                 'ext':          video_extension.decode('utf-8'),
1843                                 'format':       u'NA',
1844                                 'player_url':   None,
1845                         })
1846                 except UnavailableVideoError:
1847                         self._downloader.trouble(u'\nERROR: unable to download video')
1848
1849
1850 class YahooIE(InfoExtractor):
1851         """Information extractor for video.yahoo.com."""
1852
1853         # _VALID_URL matches all Yahoo! Video URLs
1854         # _VPAGE_URL matches only the extractable '/watch/' URLs
1855         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1856         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1857         IE_NAME = u'video.yahoo'
1858
1859         def __init__(self, downloader=None):
1860                 InfoExtractor.__init__(self, downloader)
1861
1862         def report_download_webpage(self, video_id):
1863                 """Report webpage download."""
1864                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1865
1866         def report_extraction(self, video_id):
1867                 """Report information extraction."""
1868                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1869
1870         def _real_extract(self, url, new_video=True):
1871                 # Extract ID from URL
1872                 mobj = re.match(self._VALID_URL, url)
1873                 if mobj is None:
1874                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1875                         return
1876
1877                 # At this point we have a new video
1878                 self._downloader.increment_downloads()
1879                 video_id = mobj.group(2)
1880                 video_extension = 'flv'
1881
1882                 # Rewrite valid but non-extractable URLs as
1883                 # extractable English language /watch/ URLs
1884                 if re.match(self._VPAGE_URL, url) is None:
1885                         request = urllib2.Request(url)
1886                         try:
1887                                 webpage = urllib2.urlopen(request).read()
1888                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1889                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1890                                 return
1891
1892                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1893                         if mobj is None:
1894                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1895                                 return
1896                         yahoo_id = mobj.group(1)
1897
1898                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1899                         if mobj is None:
1900                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1901                                 return
1902                         yahoo_vid = mobj.group(1)
1903
1904                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1905                         return self._real_extract(url, new_video=False)
1906
1907                 # Retrieve video webpage to extract further information
1908                 request = urllib2.Request(url)
1909                 try:
1910                         self.report_download_webpage(video_id)
1911                         webpage = urllib2.urlopen(request).read()
1912                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1913                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1914                         return
1915
1916                 # Extract uploader and title from webpage
1917                 self.report_extraction(video_id)
1918                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1919                 if mobj is None:
1920                         self._downloader.trouble(u'ERROR: unable to extract video title')
1921                         return
1922                 video_title = mobj.group(1).decode('utf-8')
1923                 simple_title = _simplify_title(video_title)
1924
1925                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1926                 if mobj is None:
1927                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1928                         return
1929                 video_uploader = mobj.group(1).decode('utf-8')
1930
1931                 # Extract video thumbnail
1932                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1933                 if mobj is None:
1934                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1935                         return
1936                 video_thumbnail = mobj.group(1).decode('utf-8')
1937
1938                 # Extract video description
1939                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1940                 if mobj is None:
1941                         self._downloader.trouble(u'ERROR: unable to extract video description')
1942                         return
1943                 video_description = mobj.group(1).decode('utf-8')
1944                 if not video_description:
1945                         video_description = 'No description available.'
1946
1947                 # Extract video height and width
1948                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1949                 if mobj is None:
1950                         self._downloader.trouble(u'ERROR: unable to extract video height')
1951                         return
1952                 yv_video_height = mobj.group(1)
1953
1954                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1955                 if mobj is None:
1956                         self._downloader.trouble(u'ERROR: unable to extract video width')
1957                         return
1958                 yv_video_width = mobj.group(1)
1959
1960                 # Retrieve video playlist to extract media URL
1961                 # I'm not completely sure what all these options are, but we
1962                 # seem to need most of them, otherwise the server sends a 401.
1963                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1964                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1965                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1966                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1967                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1968                 try:
1969                         self.report_download_webpage(video_id)
1970                         webpage = urllib2.urlopen(request).read()
1971                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1972                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1973                         return
1974
1975                 # Extract media URL from playlist XML
1976                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1977                 if mobj is None:
1978                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1979                         return
1980                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1981                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1982
1983                 try:
1984                         # Process video information
1985                         self._downloader.process_info({
1986                                 'id':           video_id.decode('utf-8'),
1987                                 'url':          video_url,
1988                                 'uploader':     video_uploader,
1989                                 'upload_date':  u'NA',
1990                                 'title':        video_title,
1991                                 'stitle':       simple_title,
1992                                 'ext':          video_extension.decode('utf-8'),
1993                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1994                                 'description':  video_description,
1995                                 'thumbnail':    video_thumbnail,
1996                                 'player_url':   None,
1997                         })
1998                 except UnavailableVideoError:
1999                         self._downloader.trouble(u'\nERROR: unable to download video')
2000
2001
2002 class VimeoIE(InfoExtractor):
2003         """Information extractor for vimeo.com."""
2004
2005         # _VALID_URL matches Vimeo URLs
2006         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2007         IE_NAME = u'vimeo'
2008
2009         def __init__(self, downloader=None):
2010                 InfoExtractor.__init__(self, downloader)
2011
2012         def report_download_webpage(self, video_id):
2013                 """Report webpage download."""
2014                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2015
2016         def report_extraction(self, video_id):
2017                 """Report information extraction."""
2018                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2019
2020         def _real_extract(self, url, new_video=True):
2021                 # Extract ID from URL
2022                 mobj = re.match(self._VALID_URL, url)
2023                 if mobj is None:
2024                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2025                         return
2026
2027                 # At this point we have a new video
2028                 self._downloader.increment_downloads()
2029                 video_id = mobj.group(1)
2030
2031                 # Retrieve video webpage to extract further information
2032                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2033                 try:
2034                         self.report_download_webpage(video_id)
2035                         webpage = urllib2.urlopen(request).read()
2036                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2037                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2038                         return
2039
2040                 # Now we begin extracting as much information as we can from what we
2041                 # retrieved. First we extract the information common to all extractors,
2042                 # and latter we extract those that are Vimeo specific.
2043                 self.report_extraction(video_id)
2044
2045                 # Extract title
2046                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2047                 if mobj is None:
2048                         self._downloader.trouble(u'ERROR: unable to extract video title')
2049                         return
2050                 video_title = mobj.group(1).decode('utf-8')
2051                 simple_title = _simplify_title(video_title)
2052
2053                 # Extract uploader
2054                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2055                 if mobj is None:
2056                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
2057                         return
2058                 video_uploader = mobj.group(1).decode('utf-8')
2059
2060                 # Extract video thumbnail
2061                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2062                 if mobj is None:
2063                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2064                         return
2065                 video_thumbnail = mobj.group(1).decode('utf-8')
2066
2067                 # # Extract video description
2068                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2069                 # if mobj is None:
2070                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
2071                 #       return
2072                 # video_description = mobj.group(1).decode('utf-8')
2073                 # if not video_description: video_description = 'No description available.'
2074                 video_description = 'Foo.'
2075
2076                 # Vimeo specific: extract request signature
2077                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2078                 if mobj is None:
2079                         self._downloader.trouble(u'ERROR: unable to extract request signature')
2080                         return
2081                 sig = mobj.group(1).decode('utf-8')
2082
2083                 # Vimeo specific: extract video quality information
2084                 mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2085                 if mobj is None:
2086                         self._downloader.trouble(u'ERROR: unable to extract video quality information')
2087                         return
2088                 quality = mobj.group(1).decode('utf-8')
2089
2090                 if int(quality) == 1:
2091                         quality = 'hd'
2092                 else:
2093                         quality = 'sd'
2094
2095                 # Vimeo specific: Extract request signature expiration
2096                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2097                 if mobj is None:
2098                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2099                         return
2100                 sig_exp = mobj.group(1).decode('utf-8')
2101
2102                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2103
2104                 try:
2105                         # Process video information
2106                         self._downloader.process_info({
2107                                 'id':           video_id.decode('utf-8'),
2108                                 'url':          video_url,
2109                                 'uploader':     video_uploader,
2110                                 'upload_date':  u'NA',
2111                                 'title':        video_title,
2112                                 'stitle':       simple_title,
2113                                 'ext':          u'mp4',
2114                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2115                                 'description':  video_description,
2116                                 'thumbnail':    video_thumbnail,
2117                                 'description':  video_description,
2118                                 'player_url':   None,
2119                         })
2120                 except UnavailableVideoError:
2121                         self._downloader.trouble(u'ERROR: unable to download video')
2122
2123
2124 class GenericIE(InfoExtractor):
2125         """Generic last-resort information extractor."""
2126
2127         _VALID_URL = r'.*'
2128         IE_NAME = u'generic'
2129
2130         def __init__(self, downloader=None):
2131                 InfoExtractor.__init__(self, downloader)
2132
2133         def report_download_webpage(self, video_id):
2134                 """Report webpage download."""
2135                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2136                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2137
2138         def report_extraction(self, video_id):
2139                 """Report information extraction."""
2140                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2141
2142         def _real_extract(self, url):
2143                 # At this point we have a new video
2144                 self._downloader.increment_downloads()
2145
2146                 video_id = url.split('/')[-1]
2147                 request = urllib2.Request(url)
2148                 try:
2149                         self.report_download_webpage(video_id)
2150                         webpage = urllib2.urlopen(request).read()
2151                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2152                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2153                         return
2154                 except ValueError, err:
2155                         # since this is the last-resort InfoExtractor, if
2156                         # this error is thrown, it'll be thrown here
2157                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2158                         return
2159
2160                 self.report_extraction(video_id)
2161                 # Start with something easy: JW Player in SWFObject
2162                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2163                 if mobj is None:
2164                         # Broaden the search a little bit
2165                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2166                 if mobj is None:
2167                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2168                         return
2169
2170                 # It's possible that one of the regexes
2171                 # matched, but returned an empty group:
2172                 if mobj.group(1) is None:
2173                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2174                         return
2175
2176                 video_url = urllib.unquote(mobj.group(1))
2177                 video_id = os.path.basename(video_url)
2178
2179                 # here's a fun little line of code for you:
2180                 video_extension = os.path.splitext(video_id)[1][1:]
2181                 video_id = os.path.splitext(video_id)[0]
2182
2183                 # it's tempting to parse this further, but you would
2184                 # have to take into account all the variations like
2185                 #   Video Title - Site Name
2186                 #   Site Name | Video Title
2187                 #   Video Title - Tagline | Site Name
2188                 # and so on and so forth; it's just not practical
2189                 mobj = re.search(r'<title>(.*)</title>', webpage)
2190                 if mobj is None:
2191                         self._downloader.trouble(u'ERROR: unable to extract title')
2192                         return
2193                 video_title = mobj.group(1).decode('utf-8')
2194                 video_title = sanitize_title(video_title)
2195                 simple_title = _simplify_title(video_title)
2196
2197                 # video uploader is domain name
2198                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2199                 if mobj is None:
2200                         self._downloader.trouble(u'ERROR: unable to extract title')
2201                         return
2202                 video_uploader = mobj.group(1).decode('utf-8')
2203
2204                 try:
2205                         # Process video information
2206                         self._downloader.process_info({
2207                                 'id':           video_id.decode('utf-8'),
2208                                 'url':          video_url.decode('utf-8'),
2209                                 'uploader':     video_uploader,
2210                                 'upload_date':  u'NA',
2211                                 'title':        video_title,
2212                                 'stitle':       simple_title,
2213                                 'ext':          video_extension.decode('utf-8'),
2214                                 'format':       u'NA',
2215                                 'player_url':   None,
2216                         })
2217                 except UnavailableVideoError, err:
2218                         self._downloader.trouble(u'\nERROR: unable to download video')
2219
2220
2221 class YoutubeSearchIE(InfoExtractor):
2222         """Information Extractor for YouTube search queries."""
2223         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2224         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2225         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2226         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2227         _youtube_ie = None
2228         _max_youtube_results = 1000
2229         IE_NAME = u'youtube:search'
2230
2231         def __init__(self, youtube_ie, downloader=None):
2232                 InfoExtractor.__init__(self, downloader)
2233                 self._youtube_ie = youtube_ie
2234
2235         def report_download_page(self, query, pagenum):
2236                 """Report attempt to download playlist page with given number."""
2237                 query = query.decode(preferredencoding())
2238                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2239
2240         def _real_initialize(self):
2241                 self._youtube_ie.initialize()
2242
2243         def _real_extract(self, query):
2244                 mobj = re.match(self._VALID_URL, query)
2245                 if mobj is None:
2246                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2247                         return
2248
2249                 prefix, query = query.split(':')
2250                 prefix = prefix[8:]
2251                 query = query.encode('utf-8')
2252                 if prefix == '':
2253                         self._download_n_results(query, 1)
2254                         return
2255                 elif prefix == 'all':
2256                         self._download_n_results(query, self._max_youtube_results)
2257                         return
2258                 else:
2259                         try:
2260                                 n = long(prefix)
2261                                 if n <= 0:
2262                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2263                                         return
2264                                 elif n > self._max_youtube_results:
2265                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2266                                         n = self._max_youtube_results
2267                                 self._download_n_results(query, n)
2268                                 return
2269                         except ValueError: # parsing prefix as integer fails
2270                                 self._download_n_results(query, 1)
2271                                 return
2272
2273         def _download_n_results(self, query, n):
2274                 """Downloads a specified number of results for a query"""
2275
2276                 video_ids = []
2277                 already_seen = set()
2278                 pagenum = 1
2279
2280                 while True:
2281                         self.report_download_page(query, pagenum)
2282                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2283                         request = urllib2.Request(result_url)
2284                         try:
2285                                 page = urllib2.urlopen(request).read()
2286                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2287                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2288                                 return
2289
2290                         # Extract video identifiers
2291                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2292                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2293                                 if video_id not in already_seen:
2294                                         video_ids.append(video_id)
2295                                         already_seen.add(video_id)
2296                                         if len(video_ids) == n:
2297                                                 # Specified n videos reached
2298                                                 for id in video_ids:
2299                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2300                                                 return
2301
2302                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2303                                 for id in video_ids:
2304                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2305                                 return
2306
2307                         pagenum = pagenum + 1
2308
2309
2310 class GoogleSearchIE(InfoExtractor):
2311         """Information Extractor for Google Video search queries."""
2312         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2313         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2314         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2315         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2316         _google_ie = None
2317         _max_google_results = 1000
2318         IE_NAME = u'video.google:search'
2319
2320         def __init__(self, google_ie, downloader=None):
2321                 InfoExtractor.__init__(self, downloader)
2322                 self._google_ie = google_ie
2323
2324         def report_download_page(self, query, pagenum):
2325                 """Report attempt to download playlist page with given number."""
2326                 query = query.decode(preferredencoding())
2327                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2328
2329         def _real_initialize(self):
2330                 self._google_ie.initialize()
2331
2332         def _real_extract(self, query):
2333                 mobj = re.match(self._VALID_URL, query)
2334                 if mobj is None:
2335                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2336                         return
2337
2338                 prefix, query = query.split(':')
2339                 prefix = prefix[8:]
2340                 query = query.encode('utf-8')
2341                 if prefix == '':
2342                         self._download_n_results(query, 1)
2343                         return
2344                 elif prefix == 'all':
2345                         self._download_n_results(query, self._max_google_results)
2346                         return
2347                 else:
2348                         try:
2349                                 n = long(prefix)
2350                                 if n <= 0:
2351                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2352                                         return
2353                                 elif n > self._max_google_results:
2354                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2355                                         n = self._max_google_results
2356                                 self._download_n_results(query, n)
2357                                 return
2358                         except ValueError: # parsing prefix as integer fails
2359                                 self._download_n_results(query, 1)
2360                                 return
2361
2362         def _download_n_results(self, query, n):
2363                 """Downloads a specified number of results for a query"""
2364
2365                 video_ids = []
2366                 already_seen = set()
2367                 pagenum = 1
2368
2369                 while True:
2370                         self.report_download_page(query, pagenum)
2371                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2372                         request = urllib2.Request(result_url)
2373                         try:
2374                                 page = urllib2.urlopen(request).read()
2375                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2376                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2377                                 return
2378
2379                         # Extract video identifiers
2380                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2381                                 video_id = mobj.group(1)
2382                                 if video_id not in already_seen:
2383                                         video_ids.append(video_id)
2384                                         already_seen.add(video_id)
2385                                         if len(video_ids) == n:
2386                                                 # Specified n videos reached
2387                                                 for id in video_ids:
2388                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2389                                                 return
2390
2391                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2392                                 for id in video_ids:
2393                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2394                                 return
2395
2396                         pagenum = pagenum + 1
2397
2398
2399 class YahooSearchIE(InfoExtractor):
2400         """Information Extractor for Yahoo! Video search queries."""
2401         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2402         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2403         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2404         _MORE_PAGES_INDICATOR = r'\s*Next'
2405         _yahoo_ie = None
2406         _max_yahoo_results = 1000
2407         IE_NAME = u'video.yahoo:search'
2408
2409         def __init__(self, yahoo_ie, downloader=None):
2410                 InfoExtractor.__init__(self, downloader)
2411                 self._yahoo_ie = yahoo_ie
2412
2413         def report_download_page(self, query, pagenum):
2414                 """Report attempt to download playlist page with given number."""
2415                 query = query.decode(preferredencoding())
2416                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2417
2418         def _real_initialize(self):
2419                 self._yahoo_ie.initialize()
2420
2421         def _real_extract(self, query):
2422                 mobj = re.match(self._VALID_URL, query)
2423                 if mobj is None:
2424                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2425                         return
2426
2427                 prefix, query = query.split(':')
2428                 prefix = prefix[8:]
2429                 query = query.encode('utf-8')
2430                 if prefix == '':
2431                         self._download_n_results(query, 1)
2432                         return
2433                 elif prefix == 'all':
2434                         self._download_n_results(query, self._max_yahoo_results)
2435                         return
2436                 else:
2437                         try:
2438                                 n = long(prefix)
2439                                 if n <= 0:
2440                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2441                                         return
2442                                 elif n > self._max_yahoo_results:
2443                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2444                                         n = self._max_yahoo_results
2445                                 self._download_n_results(query, n)
2446                                 return
2447                         except ValueError: # parsing prefix as integer fails
2448                                 self._download_n_results(query, 1)
2449                                 return
2450
2451         def _download_n_results(self, query, n):
2452                 """Downloads a specified number of results for a query"""
2453
2454                 video_ids = []
2455                 already_seen = set()
2456                 pagenum = 1
2457
2458                 while True:
2459                         self.report_download_page(query, pagenum)
2460                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2461                         request = urllib2.Request(result_url)
2462                         try:
2463                                 page = urllib2.urlopen(request).read()
2464                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2465                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2466                                 return
2467
2468                         # Extract video identifiers
2469                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2470                                 video_id = mobj.group(1)
2471                                 if video_id not in already_seen:
2472                                         video_ids.append(video_id)
2473                                         already_seen.add(video_id)
2474                                         if len(video_ids) == n:
2475                                                 # Specified n videos reached
2476                                                 for id in video_ids:
2477                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2478                                                 return
2479
2480                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2481                                 for id in video_ids:
2482                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2483                                 return
2484
2485                         pagenum = pagenum + 1
2486
2487
2488 class YoutubePlaylistIE(InfoExtractor):
2489         """Information Extractor for YouTube playlists."""
2490
2491         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2492         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2493         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2494         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2495         _youtube_ie = None
2496         IE_NAME = u'youtube:playlist'
2497
2498         def __init__(self, youtube_ie, downloader=None):
2499                 InfoExtractor.__init__(self, downloader)
2500                 self._youtube_ie = youtube_ie
2501
2502         def report_download_page(self, playlist_id, pagenum):
2503                 """Report attempt to download playlist page with given number."""
2504                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2505
2506         def _real_initialize(self):
2507                 self._youtube_ie.initialize()
2508
2509         def _real_extract(self, url):
2510                 # Extract playlist id
2511                 mobj = re.match(self._VALID_URL, url)
2512                 if mobj is None:
2513                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2514                         return
2515
2516                 # Single video case
2517                 if mobj.group(3) is not None:
2518                         self._youtube_ie.extract(mobj.group(3))
2519                         return
2520
2521                 # Download playlist pages
2522                 # prefix is 'p' as default for playlists but there are other types that need extra care
2523                 playlist_prefix = mobj.group(1)
2524                 if playlist_prefix == 'a':
2525                         playlist_access = 'artist'
2526                 else:
2527                         playlist_prefix = 'p'
2528                         playlist_access = 'view_play_list'
2529                 playlist_id = mobj.group(2)
2530                 video_ids = []
2531                 pagenum = 1
2532
2533                 while True:
2534                         self.report_download_page(playlist_id, pagenum)
2535                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2536                         request = urllib2.Request(url)
2537                         try:
2538                                 page = urllib2.urlopen(request).read()
2539                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2540                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2541                                 return
2542
2543                         # Extract video identifiers
2544                         ids_in_page = []
2545                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2546                                 if mobj.group(1) not in ids_in_page:
2547                                         ids_in_page.append(mobj.group(1))
2548                         video_ids.extend(ids_in_page)
2549
2550                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2551                                 break
2552                         pagenum = pagenum + 1
2553
2554                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2555                 playlistend = self._downloader.params.get('playlistend', -1)
2556                 video_ids = video_ids[playliststart:playlistend]
2557
2558                 for id in video_ids:
2559                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2560                 return
2561
2562
2563 class YoutubeUserIE(InfoExtractor):
2564         """Information Extractor for YouTube users."""
2565
2566         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2567         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2568         _GDATA_PAGE_SIZE = 50
2569         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2570         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2571         _youtube_ie = None
2572         IE_NAME = u'youtube:user'
2573
2574         def __init__(self, youtube_ie, downloader=None):
2575                 InfoExtractor.__init__(self, downloader)
2576                 self._youtube_ie = youtube_ie
2577
2578         def report_download_page(self, username, start_index):
2579                 """Report attempt to download user page."""
2580                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2581                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2582
2583         def _real_initialize(self):
2584                 self._youtube_ie.initialize()
2585
2586         def _real_extract(self, url):
2587                 # Extract username
2588                 mobj = re.match(self._VALID_URL, url)
2589                 if mobj is None:
2590                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2591                         return
2592
2593                 username = mobj.group(1)
2594
2595                 # Download video ids using YouTube Data API. Result size per
2596                 # query is limited (currently to 50 videos) so we need to query
2597                 # page by page until there are no video ids - it means we got
2598                 # all of them.
2599
2600                 video_ids = []
2601                 pagenum = 0
2602
2603                 while True:
2604                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2605                         self.report_download_page(username, start_index)
2606
2607                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2608
2609                         try:
2610                                 page = urllib2.urlopen(request).read()
2611                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2612                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2613                                 return
2614
2615                         # Extract video identifiers
2616                         ids_in_page = []
2617
2618                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2619                                 if mobj.group(1) not in ids_in_page:
2620                                         ids_in_page.append(mobj.group(1))
2621
2622                         video_ids.extend(ids_in_page)
2623
2624                         # A little optimization - if current page is not
2625                         # "full", ie. does not contain PAGE_SIZE video ids then
2626                         # we can assume that this page is the last one - there
2627                         # are no more ids on further pages - no need to query
2628                         # again.
2629
2630                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2631                                 break
2632
2633                         pagenum += 1
2634
2635                 all_ids_count = len(video_ids)
2636                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2637                 playlistend = self._downloader.params.get('playlistend', -1)
2638
2639                 if playlistend == -1:
2640                         video_ids = video_ids[playliststart:]
2641                 else:
2642                         video_ids = video_ids[playliststart:playlistend]
2643
2644                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2645                                 (username, all_ids_count, len(video_ids)))
2646
2647                 for video_id in video_ids:
2648                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2649
2650
2651 class DepositFilesIE(InfoExtractor):
2652         """Information extractor for depositfiles.com"""
2653
2654         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2655         IE_NAME = u'DepositFiles'
2656
2657         def __init__(self, downloader=None):
2658                 InfoExtractor.__init__(self, downloader)
2659
2660         def report_download_webpage(self, file_id):
2661                 """Report webpage download."""
2662                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2663
2664         def report_extraction(self, file_id):
2665                 """Report information extraction."""
2666                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2667
2668         def _real_extract(self, url):
2669                 # At this point we have a new file
2670                 self._downloader.increment_downloads()
2671
2672                 file_id = url.split('/')[-1]
2673                 # Rebuild url in english locale
2674                 url = 'http://depositfiles.com/en/files/' + file_id
2675
2676                 # Retrieve file webpage with 'Free download' button pressed
2677                 free_download_indication = { 'gateway_result' : '1' }
2678                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2679                 try:
2680                         self.report_download_webpage(file_id)
2681                         webpage = urllib2.urlopen(request).read()
2682                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2683                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2684                         return
2685
2686                 # Search for the real file URL
2687                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2688                 if (mobj is None) or (mobj.group(1) is None):
2689                         # Try to figure out reason of the error.
2690                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2691                         if (mobj is not None) and (mobj.group(1) is not None):
2692                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2693                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2694                         else:
2695                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2696                         return
2697
2698                 file_url = mobj.group(1)
2699                 file_extension = os.path.splitext(file_url)[1][1:]
2700
2701                 # Search for file title
2702                 mobj = re.search(r'<b title="(.*?)">', webpage)
2703                 if mobj is None:
2704                         self._downloader.trouble(u'ERROR: unable to extract title')
2705                         return
2706                 file_title = mobj.group(1).decode('utf-8')
2707
2708                 try:
2709                         # Process file information
2710                         self._downloader.process_info({
2711                                 'id':           file_id.decode('utf-8'),
2712                                 'url':          file_url.decode('utf-8'),
2713                                 'uploader':     u'NA',
2714                                 'upload_date':  u'NA',
2715                                 'title':        file_title,
2716                                 'stitle':       file_title,
2717                                 'ext':          file_extension.decode('utf-8'),
2718                                 'format':       u'NA',
2719                                 'player_url':   None,
2720                         })
2721                 except UnavailableVideoError, err:
2722                         self._downloader.trouble(u'ERROR: unable to download file')
2723
2724
2725 class FacebookIE(InfoExtractor):
2726         """Information Extractor for Facebook"""
2727
2728         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2729         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2730         _NETRC_MACHINE = 'facebook'
2731         _available_formats = ['video', 'highqual', 'lowqual']
2732         _video_extensions = {
2733                 'video': 'mp4',
2734                 'highqual': 'mp4',
2735                 'lowqual': 'mp4',
2736         }
2737         IE_NAME = u'facebook'
2738
2739         def __init__(self, downloader=None):
2740                 InfoExtractor.__init__(self, downloader)
2741
2742         def _reporter(self, message):
2743                 """Add header and report message."""
2744                 self._downloader.to_screen(u'[facebook] %s' % message)
2745
2746         def report_login(self):
2747                 """Report attempt to log in."""
2748                 self._reporter(u'Logging in')
2749
2750         def report_video_webpage_download(self, video_id):
2751                 """Report attempt to download video webpage."""
2752                 self._reporter(u'%s: Downloading video webpage' % video_id)
2753
2754         def report_information_extraction(self, video_id):
2755                 """Report attempt to extract video information."""
2756                 self._reporter(u'%s: Extracting video information' % video_id)
2757
2758         def _parse_page(self, video_webpage):
2759                 """Extract video information from page"""
2760                 # General data
2761                 data = {'title': r'\("video_title", "(.*?)"\)',
2762                         'description': r'<div class="datawrap">(.*?)</div>',
2763                         'owner': r'\("video_owner_name", "(.*?)"\)',
2764                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2765                         }
2766                 video_info = {}
2767                 for piece in data.keys():
2768                         mobj = re.search(data[piece], video_webpage)
2769                         if mobj is not None:
2770                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2771
2772                 # Video urls
2773                 video_urls = {}
2774                 for fmt in self._available_formats:
2775                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2776                         if mobj is not None:
2777                                 # URL is in a Javascript segment inside an escaped Unicode format within
2778                                 # the generally utf-8 page
2779                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2780                 video_info['video_urls'] = video_urls
2781
2782                 return video_info
2783
2784         def _real_initialize(self):
2785                 if self._downloader is None:
2786                         return
2787
2788                 useremail = None
2789                 password = None
2790                 downloader_params = self._downloader.params
2791
2792                 # Attempt to use provided username and password or .netrc data
2793                 if downloader_params.get('username', None) is not None:
2794                         useremail = downloader_params['username']
2795                         password = downloader_params['password']
2796                 elif downloader_params.get('usenetrc', False):
2797                         try:
2798                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2799                                 if info is not None:
2800                                         useremail = info[0]
2801                                         password = info[2]
2802                                 else:
2803                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2804                         except (IOError, netrc.NetrcParseError), err:
2805                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2806                                 return
2807
2808                 if useremail is None:
2809                         return
2810
2811                 # Log in
2812                 login_form = {
2813                         'email': useremail,
2814                         'pass': password,
2815                         'login': 'Log+In'
2816                         }
2817                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2818                 try:
2819                         self.report_login()
2820                         login_results = urllib2.urlopen(request).read()
2821                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2822                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2823                                 return
2824                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2825                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2826                         return
2827
2828         def _real_extract(self, url):
2829                 mobj = re.match(self._VALID_URL, url)
2830                 if mobj is None:
2831                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2832                         return
2833                 video_id = mobj.group('ID')
2834
2835                 # Get video webpage
2836                 self.report_video_webpage_download(video_id)
2837                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2838                 try:
2839                         page = urllib2.urlopen(request)
2840                         video_webpage = page.read()
2841                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2842                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2843                         return
2844
2845                 # Start extracting information
2846                 self.report_information_extraction(video_id)
2847
2848                 # Extract information
2849                 video_info = self._parse_page(video_webpage)
2850
2851                 # uploader
2852                 if 'owner' not in video_info:
2853                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2854                         return
2855                 video_uploader = video_info['owner']
2856
2857                 # title
2858                 if 'title' not in video_info:
2859                         self._downloader.trouble(u'ERROR: unable to extract video title')
2860                         return
2861                 video_title = video_info['title']
2862                 video_title = video_title.decode('utf-8')
2863                 video_title = sanitize_title(video_title)
2864
2865                 simple_title = _simplify_title(video_title)
2866
2867                 # thumbnail image
2868                 if 'thumbnail' not in video_info:
2869                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2870                         video_thumbnail = ''
2871                 else:
2872                         video_thumbnail = video_info['thumbnail']
2873
2874                 # upload date
2875                 upload_date = u'NA'
2876                 if 'upload_date' in video_info:
2877                         upload_time = video_info['upload_date']
2878                         timetuple = email.utils.parsedate_tz(upload_time)
2879                         if timetuple is not None:
2880                                 try:
2881                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2882                                 except:
2883                                         pass
2884
2885                 # description
2886                 video_description = video_info.get('description', 'No description available.')
2887
2888                 url_map = video_info['video_urls']
2889                 if len(url_map.keys()) > 0:
2890                         # Decide which formats to download
2891                         req_format = self._downloader.params.get('format', None)
2892                         format_limit = self._downloader.params.get('format_limit', None)
2893
2894                         if format_limit is not None and format_limit in self._available_formats:
2895                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2896                         else:
2897                                 format_list = self._available_formats
2898                         existing_formats = [x for x in format_list if x in url_map]
2899                         if len(existing_formats) == 0:
2900                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2901                                 return
2902                         if req_format is None:
2903                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2904                         elif req_format == 'worst':
2905                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2906                         elif req_format == '-1':
2907                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2908                         else:
2909                                 # Specific format
2910                                 if req_format not in url_map:
2911                                         self._downloader.trouble(u'ERROR: requested format not available')
2912                                         return
2913                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2914
2915                 for format_param, video_real_url in video_url_list:
2916
2917                         # At this point we have a new video
2918                         self._downloader.increment_downloads()
2919
2920                         # Extension
2921                         video_extension = self._video_extensions.get(format_param, 'mp4')
2922
2923                         try:
2924                                 # Process video information
2925                                 self._downloader.process_info({
2926                                         'id':           video_id.decode('utf-8'),
2927                                         'url':          video_real_url.decode('utf-8'),
2928                                         'uploader':     video_uploader.decode('utf-8'),
2929                                         'upload_date':  upload_date,
2930                                         'title':        video_title,
2931                                         'stitle':       simple_title,
2932                                         'ext':          video_extension.decode('utf-8'),
2933                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2934                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2935                                         'description':  video_description.decode('utf-8'),
2936                                         'player_url':   None,
2937                                 })
2938                         except UnavailableVideoError, err:
2939                                 self._downloader.trouble(u'\nERROR: unable to download video')
2940
2941 class BlipTVIE(InfoExtractor):
2942         """Information extractor for blip.tv"""
2943
2944         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2945         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2946         IE_NAME = u'blip.tv'
2947
2948         def report_extraction(self, file_id):
2949                 """Report information extraction."""
2950                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2951
	def report_direct_download(self, title):
		"""Report that the URL points directly at a media file (copy-pasted
		docstring previously said "Report information extraction.")."""
		self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2955
2956         def _real_extract(self, url):
2957                 mobj = re.match(self._VALID_URL, url)
2958                 if mobj is None:
2959                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2960                         return
2961
2962                 if '?' in url:
2963                         cchar = '&'
2964                 else:
2965                         cchar = '?'
2966                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2967                 request = urllib2.Request(json_url)
2968                 self.report_extraction(mobj.group(1))
2969                 info = None
2970                 try:
2971                         urlh = urllib2.urlopen(request)
2972                         if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2973                                 basename = url.split('/')[-1]
2974                                 title,ext = os.path.splitext(basename)
2975                                 title = title.decode('UTF-8')
2976                                 ext = ext.replace('.', '')
2977                                 self.report_direct_download(title)
2978                                 info = {
2979                                         'id': title,
2980                                         'url': url,
2981                                         'title': title,
2982                                         'stitle': _simplify_title(title),
2983                                         'ext': ext,
2984                                         'urlhandle': urlh
2985                                 }
2986                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2987                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2988                         return
2989                 if info is None: # Regular URL
2990                         try:
2991                                 json_code = urlh.read()
2992                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2993                                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2994                                 return
2995
2996                         try:
2997                                 json_data = json.loads(json_code)
2998                                 if 'Post' in json_data:
2999                                         data = json_data['Post']
3000                                 else:
3001                                         data = json_data
3002         
3003                                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
3004                                 video_url = data['media']['url']
3005                                 umobj = re.match(self._URL_EXT, video_url)
3006                                 if umobj is None:
3007                                         raise ValueError('Can not determine filename extension')
3008                                 ext = umobj.group(1)
3009         
3010                                 info = {
3011                                         'id': data['item_id'],
3012                                         'url': video_url,
3013                                         'uploader': data['display_name'],
3014                                         'upload_date': upload_date,
3015                                         'title': data['title'],
3016                                         'stitle': _simplify_title(data['title']),
3017                                         'ext': ext,
3018                                         'format': data['media']['mimeType'],
3019                                         'thumbnail': data['thumbnailUrl'],
3020                                         'description': data['description'],
3021                                         'player_url': data['embedUrl']
3022                                 }
3023                         except (ValueError,KeyError), err:
3024                                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3025                                 return
3026
3027                 self._downloader.increment_downloads()
3028
3029                 try:
3030                         self._downloader.process_info(info)
3031                 except UnavailableVideoError, err:
3032                         self._downloader.trouble(u'\nERROR: unable to download video')
3033
3034
3035 class MyVideoIE(InfoExtractor):
3036         """Information Extractor for myvideo.de."""
3037
3038         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3039         IE_NAME = u'myvideo'
3040
3041         def __init__(self, downloader=None):
3042                 InfoExtractor.__init__(self, downloader)
3043         
3044         def report_download_webpage(self, video_id):
3045                 """Report webpage download."""
3046                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3047
3048         def report_extraction(self, video_id):
3049                 """Report information extraction."""
3050                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3051
3052         def _real_extract(self,url):
3053                 mobj = re.match(self._VALID_URL, url)
3054                 if mobj is None:
3055                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
3056                         return
3057
3058                 video_id = mobj.group(1)
3059
3060                 # Get video webpage
3061                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3062                 try:
3063                         self.report_download_webpage(video_id)
3064                         webpage = urllib2.urlopen(request).read()
3065                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3066                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3067                         return
3068
3069                 self.report_extraction(video_id)
3070                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3071                                  webpage)
3072                 if mobj is None:
3073                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3074                         return
3075                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3076
3077                 mobj = re.search('<title>([^<]+)</title>', webpage)
3078                 if mobj is None:
3079                         self._downloader.trouble(u'ERROR: unable to extract title')
3080                         return
3081
3082                 video_title = mobj.group(1)
3083                 video_title = sanitize_title(video_title)
3084
3085                 simple_title = _simplify_title(video_title)
3086
3087                 try:
3088                         self._downloader.process_info({
3089                                 'id':           video_id,
3090                                 'url':          video_url,
3091                                 'uploader':     u'NA',
3092                                 'upload_date':  u'NA',
3093                                 'title':        video_title,
3094                                 'stitle':       simple_title,
3095                                 'ext':          u'flv',
3096                                 'format':       u'NA',
3097                                 'player_url':   None,
3098                         })
3099                 except UnavailableVideoError:
3100                         self._downloader.trouble(u'\nERROR: Unable to download video')
3101
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Accepts either a ":shortname" shortcut (e.g. ":tds") or a
	# full-episodes URL on thedailyshow.com / colbertnation.com.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report that information extraction has started."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report that the mediaGen configuration XML is being downloaded."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report that the episode's MRSS index feed is being downloaded."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report that the player URL is being resolved."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Expand a ":shortname" shortcut into the show's full-episodes URL
		# and re-match so the named groups below are populated.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = u'http://www.thedailyshow.com/full-episodes/'
			else:
				url = u'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# No explicit episode means "download the newest": the bare
		# full-episodes URL redirects to the latest episode page.
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# Follow the redirect and re-parse the final URL to learn
			# which episode we actually landed on.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# The Flash embed carries both the player URL (group 0) and the
		# mtvnservices URI identifying the episode (group 1).
		mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			# Resolve any redirects to obtain the final player URL.
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		# The MRSS feed lists one <item> per media segment; each is
		# downloaded independently (a failed segment is skipped, not fatal).
		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			# Each <rendition> is one (bitrate, stream URL) variant.
			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + u'-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'stitle': _simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
				continue
3236
3237
3238 class EscapistIE(InfoExtractor):
3239         """Information extractor for The Escapist """
3240
3241         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3242         IE_NAME = u'escapist'
3243
3244         def report_extraction(self, showName):
3245                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3246
3247         def report_config_download(self, showName):
3248                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3249
3250         def _real_extract(self, url):
3251                 htmlParser = HTMLParser.HTMLParser()
3252
3253                 mobj = re.match(self._VALID_URL, url)
3254                 if mobj is None:
3255                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3256                         return
3257                 showName = mobj.group('showname')
3258                 videoId = mobj.group('episode')
3259
3260                 self.report_extraction(showName)
3261                 try:
3262                         webPage = urllib2.urlopen(url).read()
3263                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3264                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3265                         return
3266
3267                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3268                 description = htmlParser.unescape(descMatch.group(1))
3269                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3270                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3271                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3272                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3273                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3274                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3275
3276                 self.report_config_download(showName)
3277                 try:
3278                         configJSON = urllib2.urlopen(configUrl).read()
3279                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3280                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3281                         return
3282
3283                 # Technically, it's JavaScript, not JSON
3284                 configJSON = configJSON.replace("'", '"')
3285
3286                 try:
3287                         config = json.loads(configJSON)
3288                 except (ValueError,), err:
3289                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3290                         return
3291
3292                 playlist = config['playlist']
3293                 videoUrl = playlist[1]['url']
3294
3295                 self._downloader.increment_downloads()
3296                 info = {
3297                         'id': videoId,
3298                         'url': videoUrl,
3299                         'uploader': showName,
3300                         'upload_date': None,
3301                         'title': showName,
3302                         'stitle': _simplify_title(showName),
3303                         'ext': 'flv',
3304                         'format': 'flv',
3305                         'thumbnail': imgUrl,
3306                         'description': description,
3307                         'player_url': playerUrl,
3308                 }
3309
3310                 try:
3311                         self._downloader.process_info(info)
3312                 except UnavailableVideoError, err:
3313                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3314
3315
3316 class CollegeHumorIE(InfoExtractor):
3317         """Information extractor for collegehumor.com"""
3318
3319         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3320         IE_NAME = u'collegehumor'
3321
3322         def report_webpage(self, video_id):
3323                 """Report information extraction."""
3324                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3325
3326         def report_extraction(self, video_id):
3327                 """Report information extraction."""
3328                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3329
3330         def _real_extract(self, url):
3331                 htmlParser = HTMLParser.HTMLParser()
3332
3333                 mobj = re.match(self._VALID_URL, url)
3334                 if mobj is None:
3335                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3336                         return
3337                 video_id = mobj.group('videoid')
3338
3339                 self.report_webpage(video_id)
3340                 request = urllib2.Request(url)
3341                 try:
3342                         webpage = urllib2.urlopen(request).read()
3343                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3344                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3345                         return
3346
3347                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3348                 if m is None:
3349                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3350                         return
3351                 internal_video_id = m.group('internalvideoid')
3352
3353                 info = {
3354                         'id': video_id,
3355                         'internal_id': internal_video_id,
3356                 }
3357
3358                 self.report_extraction(video_id)
3359                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3360                 try:
3361                         metaXml = urllib2.urlopen(xmlUrl).read()
3362                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3363                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3364                         return
3365
3366                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3367                 try:
3368                         videoNode = mdoc.findall('./video')[0]
3369                         info['description'] = videoNode.findall('./description')[0].text
3370                         info['title'] = videoNode.findall('./caption')[0].text
3371                         info['stitle'] = _simplify_title(info['title'])
3372                         info['url'] = videoNode.findall('./file')[0].text
3373                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3374                         info['ext'] = info['url'].rpartition('.')[2]
3375                         info['format'] = info['ext']
3376                 except IndexError:
3377                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3378                         return
3379
3380                 self._downloader.increment_downloads()
3381
3382                 try:
3383                         self._downloader.process_info(info)
3384                 except UnavailableVideoError, err:
3385                         self._downloader.trouble(u'\nERROR: unable to download video')
3386
3387
3388 class XVideosIE(InfoExtractor):
3389         """Information extractor for xvideos.com"""
3390
3391         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3392         IE_NAME = u'xvideos'
3393
3394         def report_webpage(self, video_id):
3395                 """Report information extraction."""
3396                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3397
3398         def report_extraction(self, video_id):
3399                 """Report information extraction."""
3400                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3401
3402         def _real_extract(self, url):
3403                 htmlParser = HTMLParser.HTMLParser()
3404
3405                 mobj = re.match(self._VALID_URL, url)
3406                 if mobj is None:
3407                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3408                         return
3409                 video_id = mobj.group(1).decode('utf-8')
3410
3411                 self.report_webpage(video_id)
3412
3413                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3414                 try:
3415                         webpage = urllib2.urlopen(request).read()
3416                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3417                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3418                         return
3419
3420                 self.report_extraction(video_id)
3421
3422
3423                 # Extract video URL
3424                 mobj = re.search(r'flv_url=(.+?)&', webpage)
3425                 if mobj is None:
3426                         self._downloader.trouble(u'ERROR: unable to extract video url')
3427                         return
3428                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3429
3430
3431                 # Extract title
3432                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3433                 if mobj is None:
3434                         self._downloader.trouble(u'ERROR: unable to extract video title')
3435                         return
3436                 video_title = mobj.group(1).decode('utf-8')
3437
3438
3439                 # Extract video thumbnail
3440                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3441                 if mobj is None:
3442                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3443                         return
3444                 video_thumbnail = mobj.group(1).decode('utf-8')
3445
3446
3447
3448                 self._downloader.increment_downloads()
3449                 info = {
3450                         'id': video_id,
3451                         'url': video_url,
3452                         'uploader': None,
3453                         'upload_date': None,
3454                         'title': video_title,
3455                         'stitle': _simplify_title(video_title),
3456                         'ext': 'flv',
3457                         'format': 'flv',
3458                         'thumbnail': video_thumbnail,
3459                         'description': None,
3460                         'player_url': None,
3461                 }
3462
3463                 try:
3464                         self._downloader.process_info(info)
3465                 except UnavailableVideoError, err:
3466                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3467
3468
3469 class SoundcloudIE(InfoExtractor):
3470         """Information extractor for soundcloud.com
3471            To access the media, the uid of the song and a stream token
3472            must be extracted from the page source and the script must make
3473            a request to media.soundcloud.com/crossdomain.xml. Then
3474            the media can be grabbed by requesting from an url composed
3475            of the stream token and uid
3476          """
3477
3478         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3479         IE_NAME = u'soundcloud'
3480
3481         def __init__(self, downloader=None):
3482                 InfoExtractor.__init__(self, downloader)
3483
3484         def report_webpage(self, video_id):
3485                 """Report information extraction."""
3486                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3487
3488         def report_extraction(self, video_id):
3489                 """Report information extraction."""
3490                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3491
3492         def _real_extract(self, url):
3493                 htmlParser = HTMLParser.HTMLParser()
3494
3495                 mobj = re.match(self._VALID_URL, url)
3496                 if mobj is None:
3497                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3498                         return
3499
3500                 # extract uploader (which is in the url)
3501                 uploader = mobj.group(1).decode('utf-8')
3502                 # extract simple title (uploader + slug of song title)
3503                 slug_title =  mobj.group(2).decode('utf-8')
3504                 simple_title = uploader + '-' + slug_title
3505
3506                 self.report_webpage('%s/%s' % (uploader, slug_title))
3507
3508                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3509                 try:
3510                         webpage = urllib2.urlopen(request).read()
3511                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3512                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3513                         return
3514
3515                 self.report_extraction('%s/%s' % (uploader, slug_title))
3516
3517                 # extract uid and stream token that soundcloud hands out for access
3518                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3519                 if mobj:
3520                         video_id = mobj.group(1)
3521                         stream_token = mobj.group(2)
3522
3523                 # extract unsimplified title
3524                 mobj = re.search('"title":"(.*?)",', webpage)
3525                 if mobj:
3526                         title = mobj.group(1)
3527
3528                 # construct media url (with uid/token)
3529                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3530                 mediaURL = mediaURL % (video_id, stream_token)
3531
3532                 # description
3533                 description = u'No description available'
3534                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3535                 if mobj:
3536                         description = mobj.group(1)
3537                 
3538                 # upload date
3539                 upload_date = None
3540                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3541                 if mobj:
3542                         try:
3543                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3544                         except Exception, e:
3545                                 print str(e)
3546
3547                 # for soundcloud, a request to a cross domain is required for cookies
3548                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3549
3550                 try:
3551                         self._downloader.process_info({
3552                                 'id':           video_id.decode('utf-8'),
3553                                 'url':          mediaURL,
3554                                 'uploader':     uploader.decode('utf-8'),
3555                                 'upload_date':  upload_date,
3556                                 'title':        simple_title.decode('utf-8'),
3557                                 'stitle':       simple_title.decode('utf-8'),
3558                                 'ext':          u'mp3',
3559                                 'format':       u'NA',
3560                                 'player_url':   None,
3561                                 'description': description.decode('utf-8')
3562                         })
3563                 except UnavailableVideoError:
3564                         self._downloader.trouble(u'\nERROR: unable to download video')
3565
3566
3567 class InfoQIE(InfoExtractor):
3568         """Information extractor for infoq.com"""
3569
3570         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3571         IE_NAME = u'infoq'
3572
3573         def report_webpage(self, video_id):
3574                 """Report information extraction."""
3575                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3576
3577         def report_extraction(self, video_id):
3578                 """Report information extraction."""
3579                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3580
3581         def _real_extract(self, url):
3582                 htmlParser = HTMLParser.HTMLParser()
3583
3584                 mobj = re.match(self._VALID_URL, url)
3585                 if mobj is None:
3586                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3587                         return
3588
3589                 self.report_webpage(url)
3590
3591                 request = urllib2.Request(url)
3592                 try:
3593                         webpage = urllib2.urlopen(request).read()
3594                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3595                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3596                         return
3597
3598                 self.report_extraction(url)
3599
3600
3601                 # Extract video URL
3602                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3603                 if mobj is None:
3604                         self._downloader.trouble(u'ERROR: unable to extract video url')
3605                         return
3606                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3607
3608
3609                 # Extract title
3610                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3611                 if mobj is None:
3612                         self._downloader.trouble(u'ERROR: unable to extract video title')
3613                         return
3614                 video_title = mobj.group(1).decode('utf-8')
3615
3616                 # Extract description
3617                 video_description = u'No description available.'
3618                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3619                 if mobj is not None:
3620                         video_description = mobj.group(1).decode('utf-8')
3621
3622                 video_filename = video_url.split('/')[-1]
3623                 video_id, extension = video_filename.split('.')
3624
3625                 self._downloader.increment_downloads()
3626                 info = {
3627                         'id': video_id,
3628                         'url': video_url,
3629                         'uploader': None,
3630                         'upload_date': None,
3631                         'title': video_title,
3632                         'stitle': _simplify_title(video_title),
3633                         'ext': extension,
3634                         'format': extension, # Extension is always(?) mp4, but seems to be flv
3635                         'thumbnail': None,
3636                         'description': video_description,
3637                         'player_url': None,
3638                 }
3639
3640                 try:
3641                         self._downloader.process_info(info)
3642                 except UnavailableVideoError, err:
3643                         self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3644
3645 class MixcloudIE(InfoExtractor):
3646         """Information extractor for www.mixcloud.com"""
3647         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3648         IE_NAME = u'mixcloud'
3649
3650         def __init__(self, downloader=None):
3651                 InfoExtractor.__init__(self, downloader)
3652
3653         def report_download_json(self, file_id):
3654                 """Report JSON download."""
3655                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3656
3657         def report_extraction(self, file_id):
3658                 """Report information extraction."""
3659                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3660
3661         def get_urls(self, jsonData, fmt, bitrate='best'):
3662                 """Get urls from 'audio_formats' section in json"""
3663                 file_url = None
3664                 try:
3665                         bitrate_list = jsonData[fmt]
3666                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3667                                 bitrate = max(bitrate_list) # select highest
3668
3669                         url_list = jsonData[fmt][bitrate]
3670                 except TypeError: # we have no bitrate info.
3671                         url_list = jsonData[fmt]
3672                                 
3673                 return url_list
3674
3675         def check_urls(self, url_list):
3676                 """Returns 1st active url from list"""
3677                 for url in url_list:
3678                         try:
3679                                 urllib2.urlopen(url)
3680                                 return url
3681                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3682                                 url = None
3683
3684                 return None
3685
3686         def _print_formats(self, formats):
3687                 print 'Available formats:'
3688                 for fmt in formats.keys():
3689                         for b in formats[fmt]:
3690                                 try:
3691                                         ext = formats[fmt][b][0]
3692                                         print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3693                                 except TypeError: # we have no bitrate info
3694                                         ext = formats[fmt][0]
3695                                         print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3696                                         break
3697
3698         def _real_extract(self, url):
3699                 mobj = re.match(self._VALID_URL, url)
3700                 if mobj is None:
3701                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3702                         return
3703                 # extract uploader & filename from url
3704                 uploader = mobj.group(1).decode('utf-8')
3705                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3706
3707                 # construct API request
3708                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3709                 # retrieve .json file with links to files
3710                 request = urllib2.Request(file_url)
3711                 try:
3712                         self.report_download_json(file_url)
3713                         jsonData = urllib2.urlopen(request).read()
3714                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3715                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3716                         return
3717
3718                 # parse JSON
3719                 json_data = json.loads(jsonData)
3720                 player_url = json_data['player_swf_url']
3721                 formats = dict(json_data['audio_formats'])
3722
3723                 req_format = self._downloader.params.get('format', None)
3724                 bitrate = None
3725
3726                 if self._downloader.params.get('listformats', None):
3727                         self._print_formats(formats)
3728                         return
3729
3730                 if req_format is None or req_format == 'best':
3731                         for format_param in formats.keys():
3732                                 url_list = self.get_urls(formats, format_param)
3733                                 # check urls
3734                                 file_url = self.check_urls(url_list)
3735                                 if file_url is not None:
3736                                         break # got it!
3737                 else:
3738                         if req_format not in formats.keys():
3739                                 self._downloader.trouble(u'ERROR: format is not available')
3740                                 return
3741
3742                         url_list = self.get_urls(formats, req_format)
3743                         file_url = self.check_urls(url_list)
3744                         format_param = req_format
3745
3746                 # We have audio
3747                 self._downloader.increment_downloads()
3748                 try:
3749                         # Process file information
3750                         self._downloader.process_info({
3751                                 'id':           file_id.decode('utf-8'),
3752                                 'url':          file_url.decode('utf-8'),
3753                                 'uploader':     uploader.decode('utf-8'),
3754                                 'upload_date':  u'NA',
3755                                 'title':        json_data['name'],
3756                                 'stitle':       _simplify_title(json_data['name']),
3757                                 'ext':          file_url.split('.')[-1].decode('utf-8'),
3758                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
3759                                 'thumbnail':    json_data['thumbnail_url'],
3760                                 'description':  json_data['description'],
3761                                 'player_url':   player_url.decode('utf-8'),
3762                         })
3763                 except UnavailableVideoError, err:
3764                         self._downloader.trouble(u'ERROR: unable to download file')
3765
3766 class StanfordOpenClassroomIE(InfoExtractor):
3767         """Information extractor for Stanford's Open ClassRoom"""
3768
3769         _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3770         IE_NAME = u'stanfordoc'
3771
3772         def report_extraction(self, video_id):
3773                 """Report information extraction."""
3774                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3775
3776         def _real_extract(self, url):
3777                 mobj = re.match(self._VALID_URL, url)
3778                 if mobj is None:
3779                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3780                         return
3781
3782                 if mobj.group('course') and mobj.group('video'): # A specific video
3783                         course = mobj.group('course')
3784                         video = mobj.group('video')
3785                         info = {
3786                                 'id': _simplify_title(course + '_' + video),
3787                         }
3788         
3789                         self.report_extraction(info['id'])
3790                         baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3791                         xmlUrl = baseUrl + video + '.xml'
3792                         try:
3793                                 metaXml = urllib2.urlopen(xmlUrl).read()
3794                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3795                                 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3796                                 return
3797                         mdoc = xml.etree.ElementTree.fromstring(metaXml)
3798                         try:
3799                                 info['title'] = mdoc.findall('./title')[0].text
3800                                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3801                         except IndexError:
3802                                 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3803                                 return
3804                         info['stitle'] = _simplify_title(info['title'])
3805                         info['ext'] = info['url'].rpartition('.')[2]
3806                         info['format'] = info['ext']
3807                         self._downloader.increment_downloads()
3808                         try:
3809                                 self._downloader.process_info(info)
3810                         except UnavailableVideoError, err:
3811                                 self._downloader.trouble(u'\nERROR: unable to download video')
3812                 else:
3813                         print('TODO: Not yet implemented')
3814                         1/0
3815
3816
3817
3818
3819
3820
class PostProcessor(object):
	"""Base class for steps that run after a successful download.

	A PostProcessor is registered on a downloader via its
	add_post_processor() method. Once a download completes, the
	downloader walks its registered chain, calling run() on each
	processor: the first one receives the initial information
	dictionary, and each subsequent one receives whatever the previous
	run() returned. A None return value stops the chain early.

	Registration is mutual, mirroring how InfoExtractor objects attach
	themselves to a downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach *downloader* to this post processor."""
		self._downloader = downloader

	def run(self, information):
		"""Execute this post-processing step.

		*information* is an InfoExtractor-style dictionary, extended
		with a 'filepath' key naming the downloaded file on disk.

		Returning None aborts the remainder of the chain; returning a
		(possibly modified) dictionary passes it along to the next
		processor. Implementations may also raise PostProcessingError,
		which the calling downloader handles.

		The default implementation is a no-op pass-through.
		"""
		return information
3866
3867
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that extracts the audio track of a downloaded
	video into a standalone audio file, using ffprobe to detect the
	source codec and ffmpeg to copy or transcode it."""

	def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec       # 'best', 'aac', 'mp3' or 'vorbis'
		self._preferredquality = preferredquality   # ffmpeg bitrate spec, e.g. '128K', or None
		self._keepvideo = keepvideo                 # keep the source video after extraction

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name of the file at *path* as reported
		by ffprobe, or None if it cannot be determined."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
			output = handle.communicate()[0]
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			return None
		audio_codec = None
		for line in output.split('\n'):
			# Within a stream section, codec_name appears before
			# codec_type, so remember the last codec_name seen and
			# report it once an audio stream is confirmed.
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Run ffmpeg to write the audio of *path* into *out_path* with
		the given codec and extra options. Return True on success."""
		try:
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		"""Extract audio from information['filepath'].

		On success, updates 'filepath' to the new audio file (removing
		the source video unless keepvideo was requested) and returns the
		dictionary; returns None on any failure, stopping the chain.
		"""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec in ['aac', 'mp3', 'vorbis']:
				# Lossless if possible: stream-copy the audio track.
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
				if filecodec == 'vorbis':
					extension = 'ogg'
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = []
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'vorbis': 'libvorbis'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = []
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']
			if self._preferredcodec == 'vorbis':
				extension = 'ogg'

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(new_path, (time.time(), information['filetime']))
			except Exception:
				# Narrowed from a bare except so that SystemExit and
				# KeyboardInterrupt still propagate; this remains a
				# best-effort step.
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		if not self._keepvideo:
			try:
				os.remove(path)
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
				return None

		information['filepath'] = new_path
		return information
3967
3968
3969 def updateSelf(downloader, filename):
3970         ''' Update the program file with the latest version from the repository '''
3971         # Note: downloader only used for options
3972         if not os.access(filename, os.W_OK):
3973                 sys.exit('ERROR: no write permissions on %s' % filename)
3974
3975         downloader.to_screen('Updating to latest version...')
3976
3977         try:
3978                 try:
3979                         urlh = urllib.urlopen(UPDATE_URL)
3980                         newcontent = urlh.read()
3981                         
3982                         vmatch = re.search("__version__ = '([^']+)'", newcontent)
3983                         if vmatch is not None and vmatch.group(1) == __version__:
3984                                 downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
3985                                 return
3986                 finally:
3987                         urlh.close()
3988         except (IOError, OSError), err:
3989                 sys.exit('ERROR: unable to download latest version')
3990
3991         try:
3992                 outf = open(filename, 'wb')
3993                 try:
3994                         outf.write(newcontent)
3995                 finally:
3996                         outf.close()
3997         except (IOError, OSError), err:
3998                 sys.exit('ERROR: unable to overwrite current version')
3999
4000         downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
4001
4002 def parseOpts():
4003         # Deferred imports
4004         import getpass
4005         import optparse
4006         import shlex
4007
4008         def _readOptions(filename):
4009                 try:
4010                         optionf = open(filename)
4011                 except IOError:
4012                         return [] # silently skip if file is not present
4013                 try:
4014                         res = []
4015                         for l in optionf:
4016                                 res += shlex.split(l, comments=True)
4017                 finally:
4018                         optionf.close()
4019                 return res
4020
4021         def _format_option_string(option):
4022                 ''' ('-o', '--option') -> -o, --format METAVAR'''
4023
4024                 opts = []
4025
4026                 if option._short_opts: opts.append(option._short_opts[0])
4027                 if option._long_opts: opts.append(option._long_opts[0])
4028                 if len(opts) > 1: opts.insert(1, ', ')
4029
4030                 if option.takes_value(): opts.append(' %s' % option.metavar)
4031
4032                 return "".join(opts)
4033
4034         def _find_term_columns():
4035                 columns = os.environ.get('COLUMNS', None)
4036                 if columns:
4037                         return int(columns)
4038
4039                 try:
4040                         sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4041                         out,err = sp.communicate()
4042                         return int(out.split()[1])
4043                 except:
4044                         pass
4045                 return None
4046
4047         max_width = 80
4048         max_help_position = 80
4049
4050         # No need to wrap help messages if we're on a wide console
4051         columns = _find_term_columns()
4052         if columns: max_width = columns
4053
4054         fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
4055         fmt.format_option_strings = _format_option_string
4056
4057         kw = {
4058                 'version'   : __version__,
4059                 'formatter' : fmt,
4060                 'usage' : '%prog [options] url [url...]',
4061                 'conflict_handler' : 'resolve',
4062         }
4063
4064         parser = optparse.OptionParser(**kw)
4065
4066         # option groups
4067         general        = optparse.OptionGroup(parser, 'General Options')
4068         selection      = optparse.OptionGroup(parser, 'Video Selection')
4069         authentication = optparse.OptionGroup(parser, 'Authentication Options')
4070         video_format   = optparse.OptionGroup(parser, 'Video Format Options')
4071         postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
4072         filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
4073         verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
4074
4075         general.add_option('-h', '--help',
4076                         action='help', help='print this help text and exit')
4077         general.add_option('-v', '--version',
4078                         action='version', help='print program version and exit')
4079         general.add_option('-U', '--update',
4080                         action='store_true', dest='update_self', help='update this program to latest version')
4081         general.add_option('-i', '--ignore-errors',
4082                         action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
4083         general.add_option('-r', '--rate-limit',
4084                         dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
4085         general.add_option('-R', '--retries',
4086                         dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4087         general.add_option('--dump-user-agent',
4088                         action='store_true', dest='dump_user_agent',
4089                         help='display the current browser identification', default=False)
4090         general.add_option('--list-extractors',
4091                         action='store_true', dest='list_extractors',
4092                         help='List all supported extractors and the URLs they would handle', default=False)
4093
4094         selection.add_option('--playlist-start',
4095                         dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
4096         selection.add_option('--playlist-end',
4097                         dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
4098         selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
4099         selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
4100         selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
4101
4102         authentication.add_option('-u', '--username',
4103                         dest='username', metavar='USERNAME', help='account username')
4104         authentication.add_option('-p', '--password',
4105                         dest='password', metavar='PASSWORD', help='account password')
4106         authentication.add_option('-n', '--netrc',
4107                         action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
4108
4109
4110         video_format.add_option('-f', '--format',
4111                         action='store', dest='format', metavar='FORMAT', help='video format code')
4112         video_format.add_option('--all-formats',
4113                         action='store_const', dest='format', help='download all available video formats', const='all')
4114         video_format.add_option('--max-quality',
4115                         action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
4116         video_format.add_option('-F', '--list-formats',
4117                         action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
4118
4119
4120         verbosity.add_option('-q', '--quiet',
4121                         action='store_true', dest='quiet', help='activates quiet mode', default=False)
4122         verbosity.add_option('-s', '--simulate',
4123                         action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
4124         verbosity.add_option('--skip-download',
4125                         action='store_true', dest='skip_download', help='do not download the video', default=False)
4126         verbosity.add_option('-g', '--get-url',
4127                         action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
4128         verbosity.add_option('-e', '--get-title',
4129                         action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
4130         verbosity.add_option('--get-thumbnail',
4131                         action='store_true', dest='getthumbnail',
4132                         help='simulate, quiet but print thumbnail URL', default=False)
4133         verbosity.add_option('--get-description',
4134                         action='store_true', dest='getdescription',
4135                         help='simulate, quiet but print video description', default=False)
4136         verbosity.add_option('--get-filename',
4137                         action='store_true', dest='getfilename',
4138                         help='simulate, quiet but print output filename', default=False)
4139         verbosity.add_option('--get-format',
4140                         action='store_true', dest='getformat',
4141                         help='simulate, quiet but print output format', default=False)
4142         verbosity.add_option('--no-progress',
4143                         action='store_true', dest='noprogress', help='do not print progress bar', default=False)
4144         verbosity.add_option('--console-title',
4145                         action='store_true', dest='consoletitle',
4146                         help='display progress in console titlebar', default=False)
4147
4148
4149         filesystem.add_option('-t', '--title',
4150                         action='store_true', dest='usetitle', help='use title in file name', default=False)
4151         filesystem.add_option('-l', '--literal',
4152                         action='store_true', dest='useliteral', help='use literal title in file name', default=False)
4153         filesystem.add_option('-A', '--auto-number',
4154                         action='store_true', dest='autonumber',
4155                         help='number downloaded files starting from 00000', default=False)
4156         filesystem.add_option('-o', '--output',
4157                         dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
4158         filesystem.add_option('-a', '--batch-file',
4159                         dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
4160         filesystem.add_option('-w', '--no-overwrites',
4161                         action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
4162         filesystem.add_option('-c', '--continue',
4163                         action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
4164         filesystem.add_option('--no-continue',
4165                         action='store_false', dest='continue_dl',
4166                         help='do not resume partially downloaded files (restart from beginning)')
4167         filesystem.add_option('--cookies',
4168                         dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4169         filesystem.add_option('--no-part',
4170                         action='store_true', dest='nopart', help='do not use .part files', default=False)
4171         filesystem.add_option('--no-mtime',
4172                         action='store_false', dest='updatetime',
4173                         help='do not use the Last-modified header to set the file modification time', default=True)
4174         filesystem.add_option('--write-description',
4175                         action='store_true', dest='writedescription',
4176                         help='write video description to a .description file', default=False)
4177         filesystem.add_option('--write-info-json',
4178                         action='store_true', dest='writeinfojson',
4179                         help='write video metadata to a .info.json file', default=False)
4180
4181
4182         postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
4183                         help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
4184         postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
4185                         help='"best", "aac", "vorbis" or "mp3"; best by default')
4186         postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
4187                         help='ffmpeg audio bitrate specification, 128k by default')
4188         postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
4189                         help='keeps the video file on disk after the post-processing; the video is erased by default')
4190
4191
4192         parser.add_option_group(general)
4193         parser.add_option_group(selection)
4194         parser.add_option_group(filesystem)
4195         parser.add_option_group(verbosity)
4196         parser.add_option_group(video_format)
4197         parser.add_option_group(authentication)
4198         parser.add_option_group(postproc)
4199
4200         xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
4201         if xdg_config_home:
4202                 userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
4203         else:
4204                 userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
4205         argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
4206         opts, args = parser.parse_args(argv)
4207
4208         return parser, opts, args
4209
def gen_extractors():
	"""Return a fresh list holding one instance of every supported extractor.

	The order matters: the first extractor whose suitable() accepts a URL
	is the one that handles it, so the catch-all GenericIE must stay last.
	Shared instances (youtube/google/yahoo) are passed to their dependent
	playlist/user/search extractors.
	"""
	yt = YoutubeIE()
	goog = GoogleIE()
	yah = YahooIE()
	return [
		YoutubePlaylistIE(yt),
		YoutubeUserIE(yt),
		YoutubeSearchIE(yt),
		yt,
		MetacafeIE(yt),
		DailymotionIE(),
		goog,
		GoogleSearchIE(goog),
		PhotobucketIE(),
		yah,
		YahooSearchIE(yah),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
		CollegeHumorIE(),
		XVideosIE(),
		SoundcloudIE(),
		InfoQIE(),
		MixcloudIE(),
		StanfordOpenClassroomIE(),
		GenericIE(),
	]
4245
4246 def _real_main():
4247         parser, opts, args = parseOpts()
4248
4249         # Open appropriate CookieJar
4250         if opts.cookiefile is None:
4251                 jar = cookielib.CookieJar()
4252         else:
4253                 try:
4254                         jar = cookielib.MozillaCookieJar(opts.cookiefile)
4255                         if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4256                                 jar.load()
4257                 except (IOError, OSError), err:
4258                         sys.exit(u'ERROR: unable to open cookie file')
4259
4260         # Dump user agent
4261         if opts.dump_user_agent:
4262                 print std_headers['User-Agent']
4263                 sys.exit(0)
4264
4265         # Batch file verification
4266         batchurls = []
4267         if opts.batchfile is not None:
4268                 try:
4269                         if opts.batchfile == '-':
4270                                 batchfd = sys.stdin
4271                         else:
4272                                 batchfd = open(opts.batchfile, 'r')
4273                         batchurls = batchfd.readlines()
4274                         batchurls = [x.strip() for x in batchurls]
4275                         batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4276                 except IOError:
4277                         sys.exit(u'ERROR: batch file could not be read')
4278         all_urls = batchurls + args
4279
4280         # General configuration
4281         cookie_processor = urllib2.HTTPCookieProcessor(jar)
4282         opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
4283         urllib2.install_opener(opener)
4284         socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4285
4286         extractors = gen_extractors()
4287
4288         if opts.list_extractors:
4289                 for ie in extractors:
4290                         print(ie.IE_NAME)
4291                         matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4292                         all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4293                         for mu in matchedUrls:
4294                                 print(u'  ' + mu)
4295                 sys.exit(0)
4296
4297         # Conflicting, missing and erroneous options
4298         if opts.usenetrc and (opts.username is not None or opts.password is not None):
4299                 parser.error(u'using .netrc conflicts with giving username/password')
4300         if opts.password is not None and opts.username is None:
4301                 parser.error(u'account username missing')
4302         if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4303                 parser.error(u'using output template conflicts with using title, literal title or auto number')
4304         if opts.usetitle and opts.useliteral:
4305                 parser.error(u'using title conflicts with using literal title')
4306         if opts.username is not None and opts.password is None:
4307                 opts.password = getpass.getpass(u'Type account password and press return:')
4308         if opts.ratelimit is not None:
4309                 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4310                 if numeric_limit is None:
4311                         parser.error(u'invalid rate limit specified')
4312                 opts.ratelimit = numeric_limit
4313         if opts.retries is not None:
4314                 try:
4315                         opts.retries = long(opts.retries)
4316                 except (TypeError, ValueError), err:
4317                         parser.error(u'invalid retry count specified')
4318         try:
4319                 opts.playliststart = int(opts.playliststart)
4320                 if opts.playliststart <= 0:
4321                         raise ValueError(u'Playlist start must be positive')
4322         except (TypeError, ValueError), err:
4323                 parser.error(u'invalid playlist start number specified')
4324         try:
4325                 opts.playlistend = int(opts.playlistend)
4326                 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4327                         raise ValueError(u'Playlist end must be greater than playlist start')
4328         except (TypeError, ValueError), err:
4329                 parser.error(u'invalid playlist end number specified')
4330         if opts.extractaudio:
4331                 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis']:
4332                         parser.error(u'invalid audio format specified')
4333
4334         # File downloader
4335         fd = FileDownloader({
4336                 'usenetrc': opts.usenetrc,
4337                 'username': opts.username,
4338                 'password': opts.password,
4339                 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4340                 'forceurl': opts.geturl,
4341                 'forcetitle': opts.gettitle,
4342                 'forcethumbnail': opts.getthumbnail,
4343                 'forcedescription': opts.getdescription,
4344                 'forcefilename': opts.getfilename,
4345                 'forceformat': opts.getformat,
4346                 'simulate': opts.simulate,
4347                 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4348                 'format': opts.format,
4349                 'format_limit': opts.format_limit,
4350                 'listformats': opts.listformats,
4351                 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4352                         or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4353                         or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4354                         or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4355                         or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4356                         or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4357                         or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4358                         or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4359                         or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4360                         or u'%(id)s.%(ext)s'),
4361                 'ignoreerrors': opts.ignoreerrors,
4362                 'ratelimit': opts.ratelimit,
4363                 'nooverwrites': opts.nooverwrites,
4364                 'retries': opts.retries,
4365                 'continuedl': opts.continue_dl,
4366                 'noprogress': opts.noprogress,
4367                 'playliststart': opts.playliststart,
4368                 'playlistend': opts.playlistend,
4369                 'logtostderr': opts.outtmpl == '-',
4370                 'consoletitle': opts.consoletitle,
4371                 'nopart': opts.nopart,
4372                 'updatetime': opts.updatetime,
4373                 'writedescription': opts.writedescription,
4374                 'writeinfojson': opts.writeinfojson,
4375                 'matchtitle': opts.matchtitle,
4376                 'rejecttitle': opts.rejecttitle,
4377                 'max_downloads': opts.max_downloads,
4378                 })
4379         for extractor in extractors:
4380                 fd.add_info_extractor(extractor)
4381
4382         # PostProcessors
4383         if opts.extractaudio:
4384                 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
4385
4386         # Update version
4387         if opts.update_self:
4388                 updateSelf(fd, sys.argv[0])
4389
4390         # Maybe do nothing
4391         if len(all_urls) < 1:
4392                 if not opts.update_self:
4393                         parser.error(u'you must provide at least one URL')
4394                 else:
4395                         sys.exit()
4396         retcode = fd.download(all_urls)
4397
4398         # Dump cookie jar if requested
4399         if opts.cookiefile is not None:
4400                 try:
4401                         jar.save()
4402                 except (IOError, OSError), err:
4403                         sys.exit(u'ERROR: unable to save cookie jar')
4404
4405         sys.exit(retcode)
4406
def main():
	"""Public entry point: run _real_main and map the known terminal
	conditions to appropriate process exit codes/messages.
	"""
	try:
		_real_main()
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except DownloadError:
		sys.exit(1)
4416
# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
	main()

# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: