Use None on missing required info_dict fields
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import datetime
5 import HTMLParser
6 import httplib
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import urllib
13 import urllib2
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 from urlparse import parse_qs
19
20 try:
21         import cStringIO as StringIO
22 except ImportError:
23         import StringIO
24
25 from utils import *
26
27
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to.  This
    information includes the real video URL, the video title, author and
    others.  The information is stored in a dictionary which is then
    passed to the FileDownloader.  The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    uploader:       Nickname of the video uploader, unescaped.
    upload_date:    Video upload date (YYYYMMDD).
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib2.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Class-level defaults; instances shadow these as needed.
    _ready = False          # flips to True after _real_initialize() has run once
    _downloader = None      # FileDownloader attached through set_downloader()
    _WORKING = True         # subclasses set False when the extractor is known-broken

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is supplied by the concrete subclass.
        return bool(re.match(self._VALID_URL, url))

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc). Runs only once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
110
111
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regex (matched with re.VERBOSE, see suitable()).  Group 1 is
    # everything preceding the ID; group 2 is the video ID itself — see
    # _real_extract(), which reads mobj.group(2).
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> file extension; anything missing falls back to 'flv' via .get()
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> dimension string shown by --list-formats.
    # NOTE(review): values read as height x width (e.g. '720x1280') — confirm
    # intended order before relying on them programmatically.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overrides the base class to pass re.VERBOSE, required by _VALID_URL.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # missing dur attribute: assume 4 seconds
            start = float(start)
            end = start + float(dur)
            # SRT timestamp format: HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _print_formats(self, formats):
        """Print one line per available itag: code, extension, dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, optionally log in, and confirm age on youtube.com.

        Credentials come from downloader params or ~/.netrc; every failure
        is reported as a warning and initialization simply stops.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language (response body is irrelevant; the cookie side effect matters)
        request = urllib2.Request(self._LANG_URL)
        try:
            self.report_lang()
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':         '/',
                'action_login': 'Log In',
                'username':     username,
                'password':     password,
                }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # The login form reappearing in the response means login failed
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':             '/',
                'action_confirm':       'Confirm',
                }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Download metadata for one video and return a list of info dicts.

        One dictionary is produced per selected format; errors are routed
        through self._downloader.trouble() and yield an implicit None.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Attempt to extract SWF player URL (optional; used for rtmpdump)
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Undo the JavaScript escaping (\/ -> /)
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several "el" contexts until one yields a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            try:
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            # NOTE(review): no break after a successful parse; once reformatted
            # to YYYYMMDD the remaining patterns raise and are swallowed, so the
            # value survives — fragile but functional.  If no pattern matches,
            # upload_date remains the raw scraped string, not None.
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # closed captions (best-effort: failures raise Trouble, caught below)
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            try:
                self.report_video_subtitles_download(video_id)
                request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                try:
                    srt_list = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                # Build a language-code -> track-name map from the track list
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    srt_lang = 'en'
                else:
                    srt_lang = srt_lang_list.keys()[0]  # arbitrary first language
                if not srt_lang in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                try:
                    srt_xml = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                if not srt_xml:
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                # Subtitles are optional: report the warning and carry on
                self._downloader.trouble(trouble[0])

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = urllib.unquote_plus(video_info['length_seconds'][0])

        # token
        # NOTE(review): video_token is never referenced again in this method.
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            # NOTE(review): 'sig' is read unconditionally; a stream entry
            # without a signature parameter would raise KeyError here.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            # NOTE(review): auto-numbered '{}' fields require Python >= 2.7.
            # format_param is None for RTMP streams, hence the fallback.
            video_format = '{} - {}'.format(format_param.decode('utf-8') if format_param else video_extension.decode('utf-8'),
                                                self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':           video_id.decode('utf-8'),
                'url':          video_real_url.decode('utf-8'),
                'uploader':     video_uploader.decode('utf-8'),
                'upload_date':  upload_date,
                'title':        video_title,
                'ext':          video_extension.decode('utf-8'),
                'format':       video_format,
                'thumbnail':    video_thumbnail.decode('utf-8'),
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
511
512
513 class MetacafeIE(InfoExtractor):
514         """Information Extractor for metacafe.com."""
515
516         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
517         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
518         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
519         IE_NAME = u'metacafe'
520
521         def __init__(self, downloader=None):
522                 InfoExtractor.__init__(self, downloader)
523
524         def report_disclaimer(self):
525                 """Report disclaimer retrieval."""
526                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
527
528         def report_age_confirmation(self):
529                 """Report attempt to confirm age."""
530                 self._downloader.to_screen(u'[metacafe] Confirming age')
531
532         def report_download_webpage(self, video_id):
533                 """Report webpage download."""
534                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
535
536         def report_extraction(self, video_id):
537                 """Report information extraction."""
538                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
539
    def _real_initialize(self):
        """Fetch the family-filter disclaimer page, then POST the age
        confirmation so later requests see unfiltered content.

        Both steps are performed for their cookie side effects; the response
        bodies are read but never inspected.  Failures are reported through
        the downloader and abort initialization.
        """
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()  # body intentionally unused
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()  # body intentionally unused
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return
562
563         def _real_extract(self, url):
564                 # Extract id and simplified title from URL
565                 mobj = re.match(self._VALID_URL, url)
566                 if mobj is None:
567                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
568                         return
569
570                 video_id = mobj.group(1)
571
572                 # Check if video comes from YouTube
573                 mobj2 = re.match(r'^yt-(.*)$', video_id)
574                 if mobj2 is not None:
575                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
576                         return
577
578                 # Retrieve video webpage to extract further information
579                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
580                 try:
581                         self.report_download_webpage(video_id)
582                         webpage = urllib2.urlopen(request).read()
583                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
584                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
585                         return
586
587                 # Extract URL, uploader and title from webpage
588                 self.report_extraction(video_id)
589                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
590                 if mobj is not None:
591                         mediaURL = urllib.unquote(mobj.group(1))
592                         video_extension = mediaURL[-3:]
593
594                         # Extract gdaKey if available
595                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
596                         if mobj is None:
597                                 video_url = mediaURL
598                         else:
599                                 gdaKey = mobj.group(1)
600                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
601                 else:
602                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
603                         if mobj is None:
604                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
605                                 return
606                         vardict = parse_qs(mobj.group(1))
607                         if 'mediaData' not in vardict:
608                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
609                                 return
610                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
611                         if mobj is None:
612                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
613                                 return
614                         mediaURL = mobj.group(1).replace('\\/', '/')
615                         video_extension = mediaURL[-3:]
616                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
617
618                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
619                 if mobj is None:
620                         self._downloader.trouble(u'ERROR: unable to extract title')
621                         return
622                 video_title = mobj.group(1).decode('utf-8')
623
624                 mobj = re.search(r'submitter=(.*?);', webpage)
625                 if mobj is None:
626                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
627                         return
628                 video_uploader = mobj.group(1)
629
630                 return [{
631                         'id':           video_id.decode('utf-8'),
632                         'url':          video_url.decode('utf-8'),
633                         'uploader':     video_uploader.decode('utf-8'),
634                         'upload_date':  None,
635                         'title':        video_title,
636                         'ext':          video_extension.decode('utf-8'),
637                 }]
638
639
640 class DailymotionIE(InfoExtractor):
641         """Information Extractor for Dailymotion"""
642
643         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
644         IE_NAME = u'dailymotion'
645
646         def __init__(self, downloader=None):
647                 InfoExtractor.__init__(self, downloader)
648
649         def report_download_webpage(self, video_id):
650                 """Report webpage download."""
651                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
652
653         def report_extraction(self, video_id):
654                 """Report information extraction."""
655                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
656
657         def _real_extract(self, url):
658                 # Extract id and simplified title from URL
659                 mobj = re.match(self._VALID_URL, url)
660                 if mobj is None:
661                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
662                         return
663
664                 video_id = mobj.group(1).split('_')[0].split('?')[0]
665
666                 video_extension = 'mp4'
667
668                 # Retrieve video webpage to extract further information
669                 request = urllib2.Request(url)
670                 request.add_header('Cookie', 'family_filter=off')
671                 try:
672                         self.report_download_webpage(video_id)
673                         webpage = urllib2.urlopen(request).read()
674                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
675                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
676                         return
677
678                 # Extract URL, uploader and title from webpage
679                 self.report_extraction(video_id)
680                 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
681                 if mobj is None:
682                         self._downloader.trouble(u'ERROR: unable to extract media URL')
683                         return
684                 flashvars = urllib.unquote(mobj.group(1))
685
686                 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
687                         if key in flashvars:
688                                 max_quality = key
689                                 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
690                                 break
691                 else:
692                         self._downloader.trouble(u'ERROR: unable to extract video URL')
693                         return
694
695                 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
696                 if mobj is None:
697                         self._downloader.trouble(u'ERROR: unable to extract video URL')
698                         return
699
700                 video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')
701
702                 # TODO: support choosing qualities
703
704                 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
705                 if mobj is None:
706                         self._downloader.trouble(u'ERROR: unable to extract title')
707                         return
708                 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
709
710                 video_uploader = None
711                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
712                 if mobj is None:
713                         # lookin for official user
714                         mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
715                         if mobj_official is None:
716                                 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
717                         else:
718                                 video_uploader = mobj_official.group(1)
719                 else:
720                         video_uploader = mobj.group(1)
721
722                 video_upload_date = None
723                 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
724                 if mobj is not None:
725                         video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
726
727                 return [{
728                         'id':           video_id.decode('utf-8'),
729                         'url':          video_url.decode('utf-8'),
730                         'uploader':     video_uploader.decode('utf-8'),
731                         'upload_date':  video_upload_date,
732                         'title':        video_title,
733                         'ext':          video_extension.decode('utf-8'),
734                 }]
735
736
737 class GoogleIE(InfoExtractor):
738         """Information extractor for video.google.com."""
739
740         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
741         IE_NAME = u'video.google'
742
743         def __init__(self, downloader=None):
744                 InfoExtractor.__init__(self, downloader)
745
746         def report_download_webpage(self, video_id):
747                 """Report webpage download."""
748                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
749
750         def report_extraction(self, video_id):
751                 """Report information extraction."""
752                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
753
754         def _real_extract(self, url):
755                 # Extract id from URL
756                 mobj = re.match(self._VALID_URL, url)
757                 if mobj is None:
758                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
759                         return
760
761                 video_id = mobj.group(1)
762
763                 video_extension = 'mp4'
764
765                 # Retrieve video webpage to extract further information
766                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
767                 try:
768                         self.report_download_webpage(video_id)
769                         webpage = urllib2.urlopen(request).read()
770                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
771                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
772                         return
773
774                 # Extract URL, uploader, and title from webpage
775                 self.report_extraction(video_id)
776                 mobj = re.search(r"download_url:'([^']+)'", webpage)
777                 if mobj is None:
778                         video_extension = 'flv'
779                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
780                 if mobj is None:
781                         self._downloader.trouble(u'ERROR: unable to extract media URL')
782                         return
783                 mediaURL = urllib.unquote(mobj.group(1))
784                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
785                 mediaURL = mediaURL.replace('\\x26', '\x26')
786
787                 video_url = mediaURL
788
789                 mobj = re.search(r'<title>(.*)</title>', webpage)
790                 if mobj is None:
791                         self._downloader.trouble(u'ERROR: unable to extract title')
792                         return
793                 video_title = mobj.group(1).decode('utf-8')
794
795                 # Extract video description
796                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
797                 if mobj is None:
798                         self._downloader.trouble(u'ERROR: unable to extract video description')
799                         return
800                 video_description = mobj.group(1).decode('utf-8')
801                 if not video_description:
802                         video_description = 'No description available.'
803
804                 # Extract video thumbnail
805                 if self._downloader.params.get('forcethumbnail', False):
806                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
807                         try:
808                                 webpage = urllib2.urlopen(request).read()
809                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
810                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
811                                 return
812                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
813                         if mobj is None:
814                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
815                                 return
816                         video_thumbnail = mobj.group(1)
817                 else:   # we need something to pass to process_info
818                         video_thumbnail = ''
819
820                 return [{
821                         'id':           video_id.decode('utf-8'),
822                         'url':          video_url.decode('utf-8'),
823                         'uploader':     None,
824                         'upload_date':  None,
825                         'title':        video_title,
826                         'ext':          video_extension.decode('utf-8'),
827                 }]
828
829
830 class PhotobucketIE(InfoExtractor):
831         """Information extractor for photobucket.com."""
832
833         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
834         IE_NAME = u'photobucket'
835
836         def __init__(self, downloader=None):
837                 InfoExtractor.__init__(self, downloader)
838
839         def report_download_webpage(self, video_id):
840                 """Report webpage download."""
841                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
842
843         def report_extraction(self, video_id):
844                 """Report information extraction."""
845                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
846
847         def _real_extract(self, url):
848                 # Extract id from URL
849                 mobj = re.match(self._VALID_URL, url)
850                 if mobj is None:
851                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
852                         return
853
854                 video_id = mobj.group(1)
855
856                 video_extension = 'flv'
857
858                 # Retrieve video webpage to extract further information
859                 request = urllib2.Request(url)
860                 try:
861                         self.report_download_webpage(video_id)
862                         webpage = urllib2.urlopen(request).read()
863                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
864                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
865                         return
866
867                 # Extract URL, uploader, and title from webpage
868                 self.report_extraction(video_id)
869                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
870                 if mobj is None:
871                         self._downloader.trouble(u'ERROR: unable to extract media URL')
872                         return
873                 mediaURL = urllib.unquote(mobj.group(1))
874
875                 video_url = mediaURL
876
877                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
878                 if mobj is None:
879                         self._downloader.trouble(u'ERROR: unable to extract title')
880                         return
881                 video_title = mobj.group(1).decode('utf-8')
882
883                 video_uploader = mobj.group(2).decode('utf-8')
884
885                 return [{
886                         'id':           video_id.decode('utf-8'),
887                         'url':          video_url.decode('utf-8'),
888                         'uploader':     video_uploader,
889                         'upload_date':  None,
890                         'title':        video_title,
891                         'ext':          video_extension.decode('utf-8'),
892                 }]
893
894
895 class YahooIE(InfoExtractor):
896         """Information extractor for video.yahoo.com."""
897
898         # _VALID_URL matches all Yahoo! Video URLs
899         # _VPAGE_URL matches only the extractable '/watch/' URLs
900         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
901         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
902         IE_NAME = u'video.yahoo'
903
904         def __init__(self, downloader=None):
905                 InfoExtractor.__init__(self, downloader)
906
907         def report_download_webpage(self, video_id):
908                 """Report webpage download."""
909                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
910
911         def report_extraction(self, video_id):
912                 """Report information extraction."""
913                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
914
915         def _real_extract(self, url, new_video=True):
916                 # Extract ID from URL
917                 mobj = re.match(self._VALID_URL, url)
918                 if mobj is None:
919                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
920                         return
921
922                 video_id = mobj.group(2)
923                 video_extension = 'flv'
924
925                 # Rewrite valid but non-extractable URLs as
926                 # extractable English language /watch/ URLs
927                 if re.match(self._VPAGE_URL, url) is None:
928                         request = urllib2.Request(url)
929                         try:
930                                 webpage = urllib2.urlopen(request).read()
931                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
932                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
933                                 return
934
935                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
936                         if mobj is None:
937                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
938                                 return
939                         yahoo_id = mobj.group(1)
940
941                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
942                         if mobj is None:
943                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
944                                 return
945                         yahoo_vid = mobj.group(1)
946
947                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
948                         return self._real_extract(url, new_video=False)
949
950                 # Retrieve video webpage to extract further information
951                 request = urllib2.Request(url)
952                 try:
953                         self.report_download_webpage(video_id)
954                         webpage = urllib2.urlopen(request).read()
955                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
956                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
957                         return
958
959                 # Extract uploader and title from webpage
960                 self.report_extraction(video_id)
961                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
962                 if mobj is None:
963                         self._downloader.trouble(u'ERROR: unable to extract video title')
964                         return
965                 video_title = mobj.group(1).decode('utf-8')
966
967                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
968                 if mobj is None:
969                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
970                         return
971                 video_uploader = mobj.group(1).decode('utf-8')
972
973                 # Extract video thumbnail
974                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
975                 if mobj is None:
976                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
977                         return
978                 video_thumbnail = mobj.group(1).decode('utf-8')
979
980                 # Extract video description
981                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
982                 if mobj is None:
983                         self._downloader.trouble(u'ERROR: unable to extract video description')
984                         return
985                 video_description = mobj.group(1).decode('utf-8')
986                 if not video_description:
987                         video_description = 'No description available.'
988
989                 # Extract video height and width
990                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
991                 if mobj is None:
992                         self._downloader.trouble(u'ERROR: unable to extract video height')
993                         return
994                 yv_video_height = mobj.group(1)
995
996                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
997                 if mobj is None:
998                         self._downloader.trouble(u'ERROR: unable to extract video width')
999                         return
1000                 yv_video_width = mobj.group(1)
1001
1002                 # Retrieve video playlist to extract media URL
1003                 # I'm not completely sure what all these options are, but we
1004                 # seem to need most of them, otherwise the server sends a 401.
1005                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1006                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1007                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1008                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1009                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1010                 try:
1011                         self.report_download_webpage(video_id)
1012                         webpage = urllib2.urlopen(request).read()
1013                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1014                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1015                         return
1016
1017                 # Extract media URL from playlist XML
1018                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1019                 if mobj is None:
1020                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1021                         return
1022                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1023                 video_url = unescapeHTML(video_url)
1024
1025                 return [{
1026                         'id':           video_id.decode('utf-8'),
1027                         'url':          video_url,
1028                         'uploader':     video_uploader,
1029                         'upload_date':  None,
1030                         'title':        video_title,
1031                         'ext':          video_extension.decode('utf-8'),
1032                         'thumbnail':    video_thumbnail.decode('utf-8'),
1033                         'description':  video_description,
1034                 }]
1035
1036
1037 class VimeoIE(InfoExtractor):
1038         """Information extractor for vimeo.com."""
1039
1040         # _VALID_URL matches Vimeo URLs
1041         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
1042         IE_NAME = u'vimeo'
1043
1044         def __init__(self, downloader=None):
1045                 InfoExtractor.__init__(self, downloader)
1046
1047         def report_download_webpage(self, video_id):
1048                 """Report webpage download."""
1049                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1050
1051         def report_extraction(self, video_id):
1052                 """Report information extraction."""
1053                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1054
1055         def _real_extract(self, url, new_video=True):
1056                 # Extract ID from URL
1057                 mobj = re.match(self._VALID_URL, url)
1058                 if mobj is None:
1059                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1060                         return
1061
1062                 video_id = mobj.group(1)
1063
1064                 # Retrieve video webpage to extract further information
1065                 request = urllib2.Request(url, None, std_headers)
1066                 try:
1067                         self.report_download_webpage(video_id)
1068                         webpage = urllib2.urlopen(request).read()
1069                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1070                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1071                         return
1072
1073                 # Now we begin extracting as much information as we can from what we
1074                 # retrieved. First we extract the information common to all extractors,
1075                 # and latter we extract those that are Vimeo specific.
1076                 self.report_extraction(video_id)
1077
1078                 # Extract the config JSON
1079                 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1080                 try:
1081                         config = json.loads(config)
1082                 except:
1083                         self._downloader.trouble(u'ERROR: unable to extract info section')
1084                         return
1085                 
1086                 # Extract title
1087                 video_title = config["video"]["title"]
1088
1089                 # Extract uploader
1090                 video_uploader = config["video"]["owner"]["name"]
1091
1092                 # Extract video thumbnail
1093                 video_thumbnail = config["video"]["thumbnail"]
1094
1095                 # Extract video description
1096                 video_description = get_element_by_id("description", webpage.decode('utf8'))
1097                 if video_description: video_description = clean_html(video_description)
1098                 else: video_description = ''
1099
1100                 # Extract upload date
1101                 video_upload_date = None
1102                 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1103                 if mobj is not None:
1104                         video_upload_date = mobj.group(1)
1105
1106                 # Vimeo specific: extract request signature and timestamp
1107                 sig = config['request']['signature']
1108                 timestamp = config['request']['timestamp']
1109
1110                 # Vimeo specific: extract video codec and quality information
1111                 # First consider quality, then codecs, then take everything
1112                 # TODO bind to format param
1113                 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1114                 files = { 'hd': [], 'sd': [], 'other': []}
1115                 for codec_name, codec_extension in codecs:
1116                         if codec_name in config["video"]["files"]:
1117                                 if 'hd' in config["video"]["files"][codec_name]:
1118                                         files['hd'].append((codec_name, codec_extension, 'hd'))
1119                                 elif 'sd' in config["video"]["files"][codec_name]:
1120                                         files['sd'].append((codec_name, codec_extension, 'sd'))
1121                                 else:
1122                                         files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1123
1124                 for quality in ('hd', 'sd', 'other'):
1125                         if len(files[quality]) > 0:
1126                                 video_quality = files[quality][0][2]
1127                                 video_codec = files[quality][0][0]
1128                                 video_extension = files[quality][0][1]
1129                                 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1130                                 break
1131                 else:
1132                         self._downloader.trouble(u'ERROR: no known codec found')
1133                         return
1134
1135                 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1136                                         %(video_id, sig, timestamp, video_quality, video_codec.upper())
1137
1138                 return [{
1139                         'id':           video_id,
1140                         'url':          video_url,
1141                         'uploader':     video_uploader,
1142                         'upload_date':  video_upload_date,
1143                         'title':        video_title,
1144                         'ext':          video_extension,
1145                         'thumbnail':    video_thumbnail,
1146                         'description':  video_description,
1147                 }]
1148
1149
1150 class GenericIE(InfoExtractor):
1151         """Generic last-resort information extractor."""
1152
1153         _VALID_URL = r'.*'
1154         IE_NAME = u'generic'
1155
1156         def __init__(self, downloader=None):
1157                 InfoExtractor.__init__(self, downloader)
1158
1159         def report_download_webpage(self, video_id):
1160                 """Report webpage download."""
1161                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1162                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1163
1164         def report_extraction(self, video_id):
1165                 """Report information extraction."""
1166                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1167
1168         def report_following_redirect(self, new_url):
1169                 """Report information extraction."""
1170                 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1171                 
1172         def _test_redirect(self, url):
1173                 """Check if it is a redirect, like url shorteners, in case restart chain."""
1174                 class HeadRequest(urllib2.Request):
1175                         def get_method(self):
1176                                 return "HEAD"
1177
1178                 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1179                         """
1180                         Subclass the HTTPRedirectHandler to make it use our 
1181                         HeadRequest also on the redirected URL
1182                         """
1183                         def redirect_request(self, req, fp, code, msg, headers, newurl): 
1184                                 if code in (301, 302, 303, 307):
1185                                         newurl = newurl.replace(' ', '%20') 
1186                                         newheaders = dict((k,v) for k,v in req.headers.items()
1187                                                                           if k.lower() not in ("content-length", "content-type"))
1188                                         return HeadRequest(newurl, 
1189                                                                            headers=newheaders,
1190                                                                            origin_req_host=req.get_origin_req_host(), 
1191                                                                            unverifiable=True) 
1192                                 else: 
1193                                         raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp) 
1194
1195                 class HTTPMethodFallback(urllib2.BaseHandler):
1196                         """
1197                         Fallback to GET if HEAD is not allowed (405 HTTP error)
1198                         """
1199                         def http_error_405(self, req, fp, code, msg, headers): 
1200                                 fp.read()
1201                                 fp.close()
1202
1203                                 newheaders = dict((k,v) for k,v in req.headers.items()
1204                                                                   if k.lower() not in ("content-length", "content-type"))
1205                                 return self.parent.open(urllib2.Request(req.get_full_url(), 
1206                                                                                                  headers=newheaders, 
1207                                                                                                  origin_req_host=req.get_origin_req_host(), 
1208                                                                                                  unverifiable=True))
1209
1210                 # Build our opener
1211                 opener = urllib2.OpenerDirector() 
1212                 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1213                                                 HTTPMethodFallback, HEADRedirectHandler,
1214                                                 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1215                         opener.add_handler(handler())
1216
1217                 response = opener.open(HeadRequest(url))
1218                 new_url = response.geturl()
1219                 
1220                 if url == new_url: return False
1221                 
1222                 self.report_following_redirect(new_url)
1223                 self._downloader.download([new_url])
1224                 return True
1225
1226         def _real_extract(self, url):
1227                 if self._test_redirect(url): return
1228
1229                 video_id = url.split('/')[-1]
1230                 request = urllib2.Request(url)
1231                 try:
1232                         self.report_download_webpage(video_id)
1233                         webpage = urllib2.urlopen(request).read()
1234                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1235                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1236                         return
1237                 except ValueError, err:
1238                         # since this is the last-resort InfoExtractor, if
1239                         # this error is thrown, it'll be thrown here
1240                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1241                         return
1242
1243                 self.report_extraction(video_id)
1244                 # Start with something easy: JW Player in SWFObject
1245                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1246                 if mobj is None:
1247                         # Broaden the search a little bit
1248                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1249                 if mobj is None:
1250                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1251                         return
1252
1253                 # It's possible that one of the regexes
1254                 # matched, but returned an empty group:
1255                 if mobj.group(1) is None:
1256                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1257                         return
1258
1259                 video_url = urllib.unquote(mobj.group(1))
1260                 video_id = os.path.basename(video_url)
1261
1262                 # here's a fun little line of code for you:
1263                 video_extension = os.path.splitext(video_id)[1][1:]
1264                 video_id = os.path.splitext(video_id)[0]
1265
1266                 # it's tempting to parse this further, but you would
1267                 # have to take into account all the variations like
1268                 #   Video Title - Site Name
1269                 #   Site Name | Video Title
1270                 #   Video Title - Tagline | Site Name
1271                 # and so on and so forth; it's just not practical
1272                 mobj = re.search(r'<title>(.*)</title>', webpage)
1273                 if mobj is None:
1274                         self._downloader.trouble(u'ERROR: unable to extract title')
1275                         return
1276                 video_title = mobj.group(1).decode('utf-8')
1277
1278                 # video uploader is domain name
1279                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1280                 if mobj is None:
1281                         self._downloader.trouble(u'ERROR: unable to extract title')
1282                         return
1283                 video_uploader = mobj.group(1).decode('utf-8')
1284
1285                 return [{
1286                         'id':           video_id.decode('utf-8'),
1287                         'url':          video_url.decode('utf-8'),
1288                         'uploader':     video_uploader,
1289                         'upload_date':  None,
1290                         'title':        video_title,
1291                         'ext':          video_extension.decode('utf-8'),
1292                 }]
1293
1294
1295 class YoutubeSearchIE(InfoExtractor):
1296         """Information Extractor for YouTube search queries."""
1297         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1298         _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1299         _max_youtube_results = 1000
1300         IE_NAME = u'youtube:search'
1301
1302         def __init__(self, downloader=None):
1303                 InfoExtractor.__init__(self, downloader)
1304
1305         def report_download_page(self, query, pagenum):
1306                 """Report attempt to download search page with given number."""
1307                 query = query.decode(preferredencoding())
1308                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1309
1310         def _real_extract(self, query):
1311                 mobj = re.match(self._VALID_URL, query)
1312                 if mobj is None:
1313                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1314                         return
1315
1316                 prefix, query = query.split(':')
1317                 prefix = prefix[8:]
1318                 query = query.encode('utf-8')
1319                 if prefix == '':
1320                         self._download_n_results(query, 1)
1321                         return
1322                 elif prefix == 'all':
1323                         self._download_n_results(query, self._max_youtube_results)
1324                         return
1325                 else:
1326                         try:
1327                                 n = long(prefix)
1328                                 if n <= 0:
1329                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1330                                         return
1331                                 elif n > self._max_youtube_results:
1332                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1333                                         n = self._max_youtube_results
1334                                 self._download_n_results(query, n)
1335                                 return
1336                         except ValueError: # parsing prefix as integer fails
1337                                 self._download_n_results(query, 1)
1338                                 return
1339
1340         def _download_n_results(self, query, n):
1341                 """Downloads a specified number of results for a query"""
1342
1343                 video_ids = []
1344                 pagenum = 0
1345                 limit = n
1346
1347                 while (50 * pagenum) < limit:
1348                         self.report_download_page(query, pagenum+1)
1349                         result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1350                         request = urllib2.Request(result_url)
1351                         try:
1352                                 data = urllib2.urlopen(request).read()
1353                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1354                                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1355                                 return
1356                         api_response = json.loads(data)['data']
1357
1358                         new_ids = list(video['id'] for video in api_response['items'])
1359                         video_ids += new_ids
1360
1361                         limit = min(n, api_response['totalItems'])
1362                         pagenum += 1
1363
1364                 if len(video_ids) > n:
1365                         video_ids = video_ids[:n]
1366                 for id in video_ids:
1367                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1368                 return
1369
1370
1371 class GoogleSearchIE(InfoExtractor):
1372         """Information Extractor for Google Video search queries."""
1373         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1374         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1375         _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1376         _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1377         _max_google_results = 1000
1378         IE_NAME = u'video.google:search'
1379
1380         def __init__(self, downloader=None):
1381                 InfoExtractor.__init__(self, downloader)
1382
1383         def report_download_page(self, query, pagenum):
1384                 """Report attempt to download playlist page with given number."""
1385                 query = query.decode(preferredencoding())
1386                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1387
1388         def _real_extract(self, query):
1389                 mobj = re.match(self._VALID_URL, query)
1390                 if mobj is None:
1391                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1392                         return
1393
1394                 prefix, query = query.split(':')
1395                 prefix = prefix[8:]
1396                 query = query.encode('utf-8')
1397                 if prefix == '':
1398                         self._download_n_results(query, 1)
1399                         return
1400                 elif prefix == 'all':
1401                         self._download_n_results(query, self._max_google_results)
1402                         return
1403                 else:
1404                         try:
1405                                 n = long(prefix)
1406                                 if n <= 0:
1407                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1408                                         return
1409                                 elif n > self._max_google_results:
1410                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1411                                         n = self._max_google_results
1412                                 self._download_n_results(query, n)
1413                                 return
1414                         except ValueError: # parsing prefix as integer fails
1415                                 self._download_n_results(query, 1)
1416                                 return
1417
1418         def _download_n_results(self, query, n):
1419                 """Downloads a specified number of results for a query"""
1420
1421                 video_ids = []
1422                 pagenum = 0
1423
1424                 while True:
1425                         self.report_download_page(query, pagenum)
1426                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1427                         request = urllib2.Request(result_url)
1428                         try:
1429                                 page = urllib2.urlopen(request).read()
1430                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1431                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1432                                 return
1433
1434                         # Extract video identifiers
1435                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1436                                 video_id = mobj.group(1)
1437                                 if video_id not in video_ids:
1438                                         video_ids.append(video_id)
1439                                         if len(video_ids) == n:
1440                                                 # Specified n videos reached
1441                                                 for id in video_ids:
1442                                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1443                                                 return
1444
1445                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1446                                 for id in video_ids:
1447                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1448                                 return
1449
1450                         pagenum = pagenum + 1
1451
1452
1453 class YahooSearchIE(InfoExtractor):
1454         """Information Extractor for Yahoo! Video search queries."""
1455         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1456         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1457         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1458         _MORE_PAGES_INDICATOR = r'\s*Next'
1459         _max_yahoo_results = 1000
1460         IE_NAME = u'video.yahoo:search'
1461
1462         def __init__(self, downloader=None):
1463                 InfoExtractor.__init__(self, downloader)
1464
1465         def report_download_page(self, query, pagenum):
1466                 """Report attempt to download playlist page with given number."""
1467                 query = query.decode(preferredencoding())
1468                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1469
1470         def _real_extract(self, query):
1471                 mobj = re.match(self._VALID_URL, query)
1472                 if mobj is None:
1473                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1474                         return
1475
1476                 prefix, query = query.split(':')
1477                 prefix = prefix[8:]
1478                 query = query.encode('utf-8')
1479                 if prefix == '':
1480                         self._download_n_results(query, 1)
1481                         return
1482                 elif prefix == 'all':
1483                         self._download_n_results(query, self._max_yahoo_results)
1484                         return
1485                 else:
1486                         try:
1487                                 n = long(prefix)
1488                                 if n <= 0:
1489                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1490                                         return
1491                                 elif n > self._max_yahoo_results:
1492                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1493                                         n = self._max_yahoo_results
1494                                 self._download_n_results(query, n)
1495                                 return
1496                         except ValueError: # parsing prefix as integer fails
1497                                 self._download_n_results(query, 1)
1498                                 return
1499
1500         def _download_n_results(self, query, n):
1501                 """Downloads a specified number of results for a query"""
1502
1503                 video_ids = []
1504                 already_seen = set()
1505                 pagenum = 1
1506
1507                 while True:
1508                         self.report_download_page(query, pagenum)
1509                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1510                         request = urllib2.Request(result_url)
1511                         try:
1512                                 page = urllib2.urlopen(request).read()
1513                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1514                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1515                                 return
1516
1517                         # Extract video identifiers
1518                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1519                                 video_id = mobj.group(1)
1520                                 if video_id not in already_seen:
1521                                         video_ids.append(video_id)
1522                                         already_seen.add(video_id)
1523                                         if len(video_ids) == n:
1524                                                 # Specified n videos reached
1525                                                 for id in video_ids:
1526                                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1527                                                 return
1528
1529                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1530                                 for id in video_ids:
1531                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1532                                 return
1533
1534                         pagenum = pagenum + 1
1535
1536
1537 class YoutubePlaylistIE(InfoExtractor):
1538         """Information Extractor for YouTube playlists."""
1539
1540         _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1541         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1542         _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
1543         _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1544         IE_NAME = u'youtube:playlist'
1545
1546         def __init__(self, downloader=None):
1547                 InfoExtractor.__init__(self, downloader)
1548
1549         def report_download_page(self, playlist_id, pagenum):
1550                 """Report attempt to download playlist page with given number."""
1551                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1552
1553         def _real_extract(self, url):
1554                 # Extract playlist id
1555                 mobj = re.match(self._VALID_URL, url)
1556                 if mobj is None:
1557                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1558                         return
1559
1560                 # Single video case
1561                 if mobj.group(3) is not None:
1562                         self._downloader.download([mobj.group(3)])
1563                         return
1564
1565                 # Download playlist pages
1566                 # prefix is 'p' as default for playlists but there are other types that need extra care
1567                 playlist_prefix = mobj.group(1)
1568                 if playlist_prefix == 'a':
1569                         playlist_access = 'artist'
1570                 else:
1571                         playlist_prefix = 'p'
1572                         playlist_access = 'view_play_list'
1573                 playlist_id = mobj.group(2)
1574                 video_ids = []
1575                 pagenum = 1
1576
1577                 while True:
1578                         self.report_download_page(playlist_id, pagenum)
1579                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1580                         request = urllib2.Request(url)
1581                         try:
1582                                 page = urllib2.urlopen(request).read()
1583                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1584                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1585                                 return
1586
1587                         # Extract video identifiers
1588                         ids_in_page = []
1589                         for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1590                                 if mobj.group(1) not in ids_in_page:
1591                                         ids_in_page.append(mobj.group(1))
1592                         video_ids.extend(ids_in_page)
1593
1594                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1595                                 break
1596                         pagenum = pagenum + 1
1597
1598                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1599                 playlistend = self._downloader.params.get('playlistend', -1)
1600                 if playlistend == -1:
1601                         video_ids = video_ids[playliststart:]
1602                 else:
1603                         video_ids = video_ids[playliststart:playlistend]
1604
1605                 for id in video_ids:
1606                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1607                 return
1608
1609
1610 class YoutubeChannelIE(InfoExtractor):
1611         """Information Extractor for YouTube channels."""
1612
1613         _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1614         _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1615         _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
1616         IE_NAME = u'youtube:channel'
1617
1618         def report_download_page(self, channel_id, pagenum):
1619                 """Report attempt to download channel page with given number."""
1620                 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1621
1622         def _real_extract(self, url):
1623                 # Extract channel id
1624                 mobj = re.match(self._VALID_URL, url)
1625                 if mobj is None:
1626                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1627                         return
1628
1629                 # Download channel pages
1630                 channel_id = mobj.group(1)
1631                 video_ids = []
1632                 pagenum = 1
1633
1634                 while True:
1635                         self.report_download_page(channel_id, pagenum)
1636                         url = self._TEMPLATE_URL % (channel_id, pagenum)
1637                         request = urllib2.Request(url)
1638                         try:
1639                                 page = urllib2.urlopen(request).read()
1640                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1641                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1642                                 return
1643
1644                         # Extract video identifiers
1645                         ids_in_page = []
1646                         for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1647                                 if mobj.group(1) not in ids_in_page:
1648                                         ids_in_page.append(mobj.group(1))
1649                         video_ids.extend(ids_in_page)
1650
1651                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1652                                 break
1653                         pagenum = pagenum + 1
1654
1655                 for id in video_ids:
1656                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1657                 return
1658
1659
1660 class YoutubeUserIE(InfoExtractor):
1661         """Information Extractor for YouTube users."""
1662
1663         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1664         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1665         _GDATA_PAGE_SIZE = 50
1666         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1667         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1668         IE_NAME = u'youtube:user'
1669
1670         def __init__(self, downloader=None):
1671                 InfoExtractor.__init__(self, downloader)
1672
1673         def report_download_page(self, username, start_index):
1674                 """Report attempt to download user page."""
1675                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1676                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1677
1678         def _real_extract(self, url):
1679                 # Extract username
1680                 mobj = re.match(self._VALID_URL, url)
1681                 if mobj is None:
1682                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1683                         return
1684
1685                 username = mobj.group(1)
1686
1687                 # Download video ids using YouTube Data API. Result size per
1688                 # query is limited (currently to 50 videos) so we need to query
1689                 # page by page until there are no video ids - it means we got
1690                 # all of them.
1691
1692                 video_ids = []
1693                 pagenum = 0
1694
1695                 while True:
1696                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1697                         self.report_download_page(username, start_index)
1698
1699                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1700
1701                         try:
1702                                 page = urllib2.urlopen(request).read()
1703                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1704                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1705                                 return
1706
1707                         # Extract video identifiers
1708                         ids_in_page = []
1709
1710                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1711                                 if mobj.group(1) not in ids_in_page:
1712                                         ids_in_page.append(mobj.group(1))
1713
1714                         video_ids.extend(ids_in_page)
1715
1716                         # A little optimization - if current page is not
1717                         # "full", ie. does not contain PAGE_SIZE video ids then
1718                         # we can assume that this page is the last one - there
1719                         # are no more ids on further pages - no need to query
1720                         # again.
1721
1722                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1723                                 break
1724
1725                         pagenum += 1
1726
1727                 all_ids_count = len(video_ids)
1728                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1729                 playlistend = self._downloader.params.get('playlistend', -1)
1730
1731                 if playlistend == -1:
1732                         video_ids = video_ids[playliststart:]
1733                 else:
1734                         video_ids = video_ids[playliststart:playlistend]
1735
1736                 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1737                                 (username, all_ids_count, len(video_ids)))
1738
1739                 for video_id in video_ids:
1740                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1741
1742
1743 class BlipTVUserIE(InfoExtractor):
1744         """Information Extractor for blip.tv users."""
1745
1746         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1747         _PAGE_SIZE = 12
1748         IE_NAME = u'blip.tv:user'
1749
1750         def __init__(self, downloader=None):
1751                 InfoExtractor.__init__(self, downloader)
1752
1753         def report_download_page(self, username, pagenum):
1754                 """Report attempt to download user page."""
1755                 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1756                                 (self.IE_NAME, username, pagenum))
1757
1758         def _real_extract(self, url):
1759                 # Extract username
1760                 mobj = re.match(self._VALID_URL, url)
1761                 if mobj is None:
1762                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1763                         return
1764
1765                 username = mobj.group(1)
1766
1767                 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1768
1769                 request = urllib2.Request(url)
1770
1771                 try:
1772                         page = urllib2.urlopen(request).read().decode('utf-8')
1773                         mobj = re.search(r'data-users-id="([^"]+)"', page)
1774                         page_base = page_base % mobj.group(1)
1775                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1776                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1777                         return
1778
1779
1780                 # Download video ids using BlipTV Ajax calls. Result size per
1781                 # query is limited (currently to 12 videos) so we need to query
1782                 # page by page until there are no video ids - it means we got
1783                 # all of them.
1784
1785                 video_ids = []
1786                 pagenum = 1
1787
1788                 while True:
1789                         self.report_download_page(username, pagenum)
1790
1791                         request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1792
1793                         try:
1794                                 page = urllib2.urlopen(request).read().decode('utf-8')
1795                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1796                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1797                                 return
1798
1799                         # Extract video identifiers
1800                         ids_in_page = []
1801
1802                         for mobj in re.finditer(r'href="/([^"]+)"', page):
1803                                 if mobj.group(1) not in ids_in_page:
1804                                         ids_in_page.append(unescapeHTML(mobj.group(1)))
1805
1806                         video_ids.extend(ids_in_page)
1807
1808                         # A little optimization - if current page is not
1809                         # "full", ie. does not contain PAGE_SIZE video ids then
1810                         # we can assume that this page is the last one - there
1811                         # are no more ids on further pages - no need to query
1812                         # again.
1813
1814                         if len(ids_in_page) < self._PAGE_SIZE:
1815                                 break
1816
1817                         pagenum += 1
1818
1819                 all_ids_count = len(video_ids)
1820                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1821                 playlistend = self._downloader.params.get('playlistend', -1)
1822
1823                 if playlistend == -1:
1824                         video_ids = video_ids[playliststart:]
1825                 else:
1826                         video_ids = video_ids[playliststart:playlistend]
1827
1828                 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1829                                 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1830
1831                 for video_id in video_ids:
1832                         self._downloader.download([u'http://blip.tv/'+video_id])
1833
1834
1835 class DepositFilesIE(InfoExtractor):
1836         """Information extractor for depositfiles.com"""
1837
1838         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1839         IE_NAME = u'DepositFiles'
1840
1841         def __init__(self, downloader=None):
1842                 InfoExtractor.__init__(self, downloader)
1843
1844         def report_download_webpage(self, file_id):
1845                 """Report webpage download."""
1846                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1847
1848         def report_extraction(self, file_id):
1849                 """Report information extraction."""
1850                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1851
1852         def _real_extract(self, url):
1853                 file_id = url.split('/')[-1]
1854                 # Rebuild url in english locale
1855                 url = 'http://depositfiles.com/en/files/' + file_id
1856
1857                 # Retrieve file webpage with 'Free download' button pressed
1858                 free_download_indication = { 'gateway_result' : '1' }
1859                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1860                 try:
1861                         self.report_download_webpage(file_id)
1862                         webpage = urllib2.urlopen(request).read()
1863                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1864                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1865                         return
1866
1867                 # Search for the real file URL
1868                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1869                 if (mobj is None) or (mobj.group(1) is None):
1870                         # Try to figure out reason of the error.
1871                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1872                         if (mobj is not None) and (mobj.group(1) is not None):
1873                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1874                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1875                         else:
1876                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1877                         return
1878
1879                 file_url = mobj.group(1)
1880                 file_extension = os.path.splitext(file_url)[1][1:]
1881
1882                 # Search for file title
1883                 mobj = re.search(r'<b title="(.*?)">', webpage)
1884                 if mobj is None:
1885                         self._downloader.trouble(u'ERROR: unable to extract title')
1886                         return
1887                 file_title = mobj.group(1).decode('utf-8')
1888
1889                 return [{
1890                         'id':           file_id.decode('utf-8'),
1891                         'url':          file_url.decode('utf-8'),
1892                         'uploader':     None,
1893                         'upload_date':  None,
1894                         'title':        file_title,
1895                         'ext':          file_extension.decode('utf-8'),
1896                 }]
1897
1898
1899 class FacebookIE(InfoExtractor):
1900         """Information Extractor for Facebook"""
1901
1902         _WORKING = False
1903         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1904         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1905         _NETRC_MACHINE = 'facebook'
1906         _available_formats = ['video', 'highqual', 'lowqual']
1907         _video_extensions = {
1908                 'video': 'mp4',
1909                 'highqual': 'mp4',
1910                 'lowqual': 'mp4',
1911         }
1912         IE_NAME = u'facebook'
1913
1914         def __init__(self, downloader=None):
1915                 InfoExtractor.__init__(self, downloader)
1916
1917         def _reporter(self, message):
1918                 """Add header and report message."""
1919                 self._downloader.to_screen(u'[facebook] %s' % message)
1920
1921         def report_login(self):
1922                 """Report attempt to log in."""
1923                 self._reporter(u'Logging in')
1924
1925         def report_video_webpage_download(self, video_id):
1926                 """Report attempt to download video webpage."""
1927                 self._reporter(u'%s: Downloading video webpage' % video_id)
1928
1929         def report_information_extraction(self, video_id):
1930                 """Report attempt to extract video information."""
1931                 self._reporter(u'%s: Extracting video information' % video_id)
1932
1933         def _parse_page(self, video_webpage):
1934                 """Extract video information from page"""
1935                 # General data
1936                 data = {'title': r'\("video_title", "(.*?)"\)',
1937                         'description': r'<div class="datawrap">(.*?)</div>',
1938                         'owner': r'\("video_owner_name", "(.*?)"\)',
1939                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1940                         }
1941                 video_info = {}
1942                 for piece in data.keys():
1943                         mobj = re.search(data[piece], video_webpage)
1944                         if mobj is not None:
1945                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1946
1947                 # Video urls
1948                 video_urls = {}
1949                 for fmt in self._available_formats:
1950                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1951                         if mobj is not None:
1952                                 # URL is in a Javascript segment inside an escaped Unicode format within
1953                                 # the generally utf-8 page
1954                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1955                 video_info['video_urls'] = video_urls
1956
1957                 return video_info
1958
1959         def _real_initialize(self):
1960                 if self._downloader is None:
1961                         return
1962
1963                 useremail = None
1964                 password = None
1965                 downloader_params = self._downloader.params
1966
1967                 # Attempt to use provided username and password or .netrc data
1968                 if downloader_params.get('username', None) is not None:
1969                         useremail = downloader_params['username']
1970                         password = downloader_params['password']
1971                 elif downloader_params.get('usenetrc', False):
1972                         try:
1973                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1974                                 if info is not None:
1975                                         useremail = info[0]
1976                                         password = info[2]
1977                                 else:
1978                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1979                         except (IOError, netrc.NetrcParseError), err:
1980                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
1981                                 return
1982
1983                 if useremail is None:
1984                         return
1985
1986                 # Log in
1987                 login_form = {
1988                         'email': useremail,
1989                         'pass': password,
1990                         'login': 'Log+In'
1991                         }
1992                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1993                 try:
1994                         self.report_login()
1995                         login_results = urllib2.urlopen(request).read()
1996                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1997                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1998                                 return
1999                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2000                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2001                         return
2002
2003         def _real_extract(self, url):
2004                 mobj = re.match(self._VALID_URL, url)
2005                 if mobj is None:
2006                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2007                         return
2008                 video_id = mobj.group('ID')
2009
2010                 # Get video webpage
2011                 self.report_video_webpage_download(video_id)
2012                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2013                 try:
2014                         page = urllib2.urlopen(request)
2015                         video_webpage = page.read()
2016                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2017                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2018                         return
2019
2020                 # Start extracting information
2021                 self.report_information_extraction(video_id)
2022
2023                 # Extract information
2024                 video_info = self._parse_page(video_webpage)
2025
2026                 # uploader
2027                 if 'owner' not in video_info:
2028                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2029                         return
2030                 video_uploader = video_info['owner']
2031
2032                 # title
2033                 if 'title' not in video_info:
2034                         self._downloader.trouble(u'ERROR: unable to extract video title')
2035                         return
2036                 video_title = video_info['title']
2037                 video_title = video_title.decode('utf-8')
2038
2039                 # thumbnail image
2040                 if 'thumbnail' not in video_info:
2041                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2042                         video_thumbnail = ''
2043                 else:
2044                         video_thumbnail = video_info['thumbnail']
2045
2046                 # upload date
2047                 upload_date = None
2048                 if 'upload_date' in video_info:
2049                         upload_time = video_info['upload_date']
2050                         timetuple = email.utils.parsedate_tz(upload_time)
2051                         if timetuple is not None:
2052                                 try:
2053                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2054                                 except:
2055                                         pass
2056
2057                 # description
2058                 video_description = video_info.get('description', 'No description available.')
2059
2060                 url_map = video_info['video_urls']
2061                 if len(url_map.keys()) > 0:
2062                         # Decide which formats to download
2063                         req_format = self._downloader.params.get('format', None)
2064                         format_limit = self._downloader.params.get('format_limit', None)
2065
2066                         if format_limit is not None and format_limit in self._available_formats:
2067                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2068                         else:
2069                                 format_list = self._available_formats
2070                         existing_formats = [x for x in format_list if x in url_map]
2071                         if len(existing_formats) == 0:
2072                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2073                                 return
2074                         if req_format is None:
2075                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2076                         elif req_format == 'worst':
2077                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2078                         elif req_format == '-1':
2079                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2080                         else:
2081                                 # Specific format
2082                                 if req_format not in url_map:
2083                                         self._downloader.trouble(u'ERROR: requested format not available')
2084                                         return
2085                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2086
2087                 results = []
2088                 for format_param, video_real_url in video_url_list:
2089                         # Extension
2090                         video_extension = self._video_extensions.get(format_param, 'mp4')
2091
2092                         results.append({
2093                                 'id':           video_id.decode('utf-8'),
2094                                 'url':          video_real_url.decode('utf-8'),
2095                                 'uploader':     video_uploader.decode('utf-8'),
2096                                 'upload_date':  upload_date,
2097                                 'title':        video_title,
2098                                 'ext':          video_extension.decode('utf-8'),
2099                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2100                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2101                                 'description':  video_description.decode('utf-8'),
2102                         })
2103                 return results
2104
class BlipTVIE(InfoExtractor):
	"""Information extractor for blip.tv"""

	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
	# Used to pull the filename extension out of the media URL.
	_URL_EXT = r'^.*\.([a-z0-9]+)$'
	IE_NAME = u'blip.tv'

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

	def report_direct_download(self, title):
		"""Report that the URL points directly at a media file."""
		self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

	def _real_extract(self, url):
		"""Return a one-element list with the info dict for *url*.

		Two paths: if the server answers with a video/* Content-Type the
		URL is a direct media file and the open handle is passed along as
		'urlhandle'; otherwise the JSON 'skin' of the page is fetched and
		parsed for metadata.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Append the JSON-skin parameters with whichever separator the URL
		# still needs.
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url.encode('utf-8'))
		self.report_extraction(mobj.group(1))
		# info stays None unless the direct-download branch fills it.
		info = None
		try:
			urlh = urllib2.urlopen(request)
			if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
				basename = url.split('/')[-1]
				title,ext = os.path.splitext(basename)
				title = title.decode('UTF-8')
				ext = ext.replace('.', '')
				self.report_direct_download(title)
				info = {
					'id': title,
					'url': url,
					'uploader': None,
					'upload_date': None,
					'title': title,
					'ext': ext,
					# Hand the already-open response to the downloader so the
					# request is not made twice.
					'urlhandle': urlh
				}
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
			return
		if info is None: # Regular URL
			try:
				# Reuse the handle opened above; it now holds the JSON body.
				json_code = urlh.read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
				return

			try:
				# NOTE(review): relies on a module-level `json` name that is
				# not among the imports visible in this chunk - verify.
				json_data = json.loads(json_code)
				if 'Post' in json_data:
					data = json_data['Post']
				else:
					data = json_data

				# blip.tv dates look like '10-01-12 08:15PM'; normalize to YYYYMMDD.
				upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
				video_url = data['media']['url']
				umobj = re.match(self._URL_EXT, video_url)
				if umobj is None:
					raise ValueError('Can not determine filename extension')
				ext = umobj.group(1)

				info = {
					'id': data['item_id'],
					'url': video_url,
					'uploader': data['display_name'],
					'upload_date': upload_date,
					'title': data['title'],
					'ext': ext,
					'format': data['media']['mimeType'],
					'thumbnail': data['thumbnailUrl'],
					'description': data['description'],
					'player_url': data['embedUrl']
				}
			except (ValueError,KeyError), err:
				# Any missing JSON field or bad date lands here.
				self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
				return

		# blip.tv serves some content only to the iTunes user agent.
		std_headers['User-Agent'] = 'iTunes/10.6.1'
		return [info]
2193
2194
2195 class MyVideoIE(InfoExtractor):
2196         """Information Extractor for myvideo.de."""
2197
2198         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2199         IE_NAME = u'myvideo'
2200
2201         def __init__(self, downloader=None):
2202                 InfoExtractor.__init__(self, downloader)
2203         
2204         def report_download_webpage(self, video_id):
2205                 """Report webpage download."""
2206                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2207
2208         def report_extraction(self, video_id):
2209                 """Report information extraction."""
2210                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2211
2212         def _real_extract(self,url):
2213                 mobj = re.match(self._VALID_URL, url)
2214                 if mobj is None:
2215                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2216                         return
2217
2218                 video_id = mobj.group(1)
2219
2220                 # Get video webpage
2221                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2222                 try:
2223                         self.report_download_webpage(video_id)
2224                         webpage = urllib2.urlopen(request).read()
2225                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2226                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
2227                         return
2228
2229                 self.report_extraction(video_id)
2230                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2231                                  webpage)
2232                 if mobj is None:
2233                         self._downloader.trouble(u'ERROR: unable to extract media URL')
2234                         return
2235                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2236
2237                 mobj = re.search('<title>([^<]+)</title>', webpage)
2238                 if mobj is None:
2239                         self._downloader.trouble(u'ERROR: unable to extract title')
2240                         return
2241
2242                 video_title = mobj.group(1)
2243
2244                 return [{
2245                         'id':           video_id,
2246                         'url':          video_url,
2247                         'uploader':     None,
2248                         'upload_date':  None,
2249                         'title':        video_title,
2250                         'ext':          u'flv',
2251                 }]
2252
2253 class ComedyCentralIE(InfoExtractor):
2254         """Information extractor for The Daily Show and Colbert Report """
2255
2256         _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2257         IE_NAME = u'comedycentral'
2258
2259         _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2260
2261         _video_extensions = {
2262                 '3500': 'mp4',
2263                 '2200': 'mp4',
2264                 '1700': 'mp4',
2265                 '1200': 'mp4',
2266                 '750': 'mp4',
2267                 '400': 'mp4',
2268         }
2269         _video_dimensions = {
2270                 '3500': '1280x720',
2271                 '2200': '960x540',
2272                 '1700': '768x432',
2273                 '1200': '640x360',
2274                 '750': '512x288',
2275                 '400': '384x216',
2276         }
2277
2278         def report_extraction(self, episode_id):
2279                 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2280
2281         def report_config_download(self, episode_id):
2282                 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2283
2284         def report_index_download(self, episode_id):
2285                 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2286
2287         def report_player_url(self, episode_id):
2288                 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2289
2290
2291         def _print_formats(self, formats):
2292                 print('Available formats:')
2293                 for x in formats:
2294                         print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2295
2296
2297         def _real_extract(self, url):
2298                 mobj = re.match(self._VALID_URL, url)
2299                 if mobj is None:
2300                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2301                         return
2302
2303                 if mobj.group('shortname'):
2304                         if mobj.group('shortname') in ('tds', 'thedailyshow'):
2305                                 url = u'http://www.thedailyshow.com/full-episodes/'
2306                         else:
2307                                 url = u'http://www.colbertnation.com/full-episodes/'
2308                         mobj = re.match(self._VALID_URL, url)
2309                         assert mobj is not None
2310
2311                 dlNewest = not mobj.group('episode')
2312                 if dlNewest:
2313                         epTitle = mobj.group('showname')
2314                 else:
2315                         epTitle = mobj.group('episode')
2316
2317                 req = urllib2.Request(url)
2318                 self.report_extraction(epTitle)
2319                 try:
2320                         htmlHandle = urllib2.urlopen(req)
2321                         html = htmlHandle.read()
2322                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2323                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2324                         return
2325                 if dlNewest:
2326                         url = htmlHandle.geturl()
2327                         mobj = re.match(self._VALID_URL, url)
2328                         if mobj is None:
2329                                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2330                                 return
2331                         if mobj.group('episode') == '':
2332                                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2333                                 return
2334                         epTitle = mobj.group('episode')
2335
2336                 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2337
2338                 if len(mMovieParams) == 0:
2339                         # The Colbert Report embeds the information in a without
2340                         # a URL prefix; so extract the alternate reference
2341                         # and then add the URL prefix manually.
2342
2343                         altMovieParams = re.findall('data-mgid="([^"]*episode.*?:.*?)"', html)
2344                         if len(altMovieParams) == 0:
2345                                 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2346                                 return
2347                         else:
2348                                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2349                 
2350                 playerUrl_raw = mMovieParams[0][0]
2351                 self.report_player_url(epTitle)
2352                 try:
2353                         urlHandle = urllib2.urlopen(playerUrl_raw)
2354                         playerUrl = urlHandle.geturl()
2355                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2356                         self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
2357                         return
2358
2359                 uri = mMovieParams[0][1]
2360                 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2361                 self.report_index_download(epTitle)
2362                 try:
2363                         indexXml = urllib2.urlopen(indexUrl).read()
2364                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2365                         self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2366                         return
2367
2368                 results = []
2369
2370                 idoc = xml.etree.ElementTree.fromstring(indexXml)
2371                 itemEls = idoc.findall('.//item')
2372                 for itemEl in itemEls:
2373                         mediaId = itemEl.findall('./guid')[0].text
2374                         shortMediaId = mediaId.split(':')[-1]
2375                         showId = mediaId.split(':')[-2].replace('.com', '')
2376                         officialTitle = itemEl.findall('./title')[0].text
2377                         officialDate = itemEl.findall('./pubDate')[0].text
2378
2379                         configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2380                                                 urllib.urlencode({'uri': mediaId}))
2381                         configReq = urllib2.Request(configUrl)
2382                         self.report_config_download(epTitle)
2383                         try:
2384                                 configXml = urllib2.urlopen(configReq).read()
2385                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2386                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2387                                 return
2388
2389                         cdoc = xml.etree.ElementTree.fromstring(configXml)
2390                         turls = []
2391                         for rendition in cdoc.findall('.//rendition'):
2392                                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2393                                 turls.append(finfo)
2394
2395                         if len(turls) == 0:
2396                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2397                                 continue
2398                         
2399                         if self._downloader.params.get('listformats', None):
2400                                 self._print_formats([i[0] for i in turls])
2401                                 return
2402
2403                         # For now, just pick the highest bitrate
2404                         format,video_url = turls[-1]
2405
2406                         # Get the format arg from the arg stream
2407                         req_format = self._downloader.params.get('format', None)
2408
2409                         # Select format if we can find one
2410                         for f,v in turls:
2411                                 if f == req_format:
2412                                         format, video_url = f, v
2413                                         break
2414
2415                         # Patch to download from alternative CDN, which does not
2416                         # break on current RTMPDump builds
2417                         broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
2418                         better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
2419
2420                         if video_url.startswith(broken_cdn):
2421                                 video_url = video_url.replace(broken_cdn, better_cdn)
2422
2423                         effTitle = showId + u'-' + epTitle
2424                         info = {
2425                                 'id': shortMediaId,
2426                                 'url': video_url,
2427                                 'uploader': showId,
2428                                 'upload_date': officialDate,
2429                                 'title': effTitle,
2430                                 'ext': 'mp4',
2431                                 'format': format,
2432                                 'thumbnail': None,
2433                                 'description': officialTitle,
2434                                 'player_url': None #playerUrl
2435                         }
2436
2437                         results.append(info)
2438                         
2439                 return results
2440
2441
2442 class EscapistIE(InfoExtractor):
2443         """Information extractor for The Escapist """
2444
2445         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2446         IE_NAME = u'escapist'
2447
2448         def report_extraction(self, showName):
2449                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2450
2451         def report_config_download(self, showName):
2452                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2453
2454         def _real_extract(self, url):
2455                 mobj = re.match(self._VALID_URL, url)
2456                 if mobj is None:
2457                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2458                         return
2459                 showName = mobj.group('showname')
2460                 videoId = mobj.group('episode')
2461
2462                 self.report_extraction(showName)
2463                 try:
2464                         webPage = urllib2.urlopen(url)
2465                         webPageBytes = webPage.read()
2466                         m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2467                         webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2468                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2469                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2470                         return
2471
2472                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2473                 description = unescapeHTML(descMatch.group(1))
2474                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2475                 imgUrl = unescapeHTML(imgMatch.group(1))
2476                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2477                 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2478                 configUrlMatch = re.search('config=(.*)$', playerUrl)
2479                 configUrl = urllib2.unquote(configUrlMatch.group(1))
2480
2481                 self.report_config_download(showName)
2482                 try:
2483                         configJSON = urllib2.urlopen(configUrl).read()
2484                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2485                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2486                         return
2487
2488                 # Technically, it's JavaScript, not JSON
2489                 configJSON = configJSON.replace("'", '"')
2490
2491                 try:
2492                         config = json.loads(configJSON)
2493                 except (ValueError,), err:
2494                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2495                         return
2496
2497                 playlist = config['playlist']
2498                 videoUrl = playlist[1]['url']
2499
2500                 info = {
2501                         'id': videoId,
2502                         'url': videoUrl,
2503                         'uploader': showName,
2504                         'upload_date': None,
2505                         'title': showName,
2506                         'ext': 'flv',
2507                         'thumbnail': imgUrl,
2508                         'description': description,
2509                         'player_url': playerUrl,
2510                 }
2511
2512                 return [info]
2513
2514
2515 class CollegeHumorIE(InfoExtractor):
2516         """Information extractor for collegehumor.com"""
2517
2518         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2519         IE_NAME = u'collegehumor'
2520
2521         def report_webpage(self, video_id):
2522                 """Report information extraction."""
2523                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2524
2525         def report_extraction(self, video_id):
2526                 """Report information extraction."""
2527                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2528
2529         def _real_extract(self, url):
2530                 mobj = re.match(self._VALID_URL, url)
2531                 if mobj is None:
2532                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2533                         return
2534                 video_id = mobj.group('videoid')
2535
2536                 self.report_webpage(video_id)
2537                 request = urllib2.Request(url)
2538                 try:
2539                         webpage = urllib2.urlopen(request).read()
2540                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2541                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2542                         return
2543
2544                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2545                 if m is None:
2546                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2547                         return
2548                 internal_video_id = m.group('internalvideoid')
2549
2550                 info = {
2551                         'id': video_id,
2552                         'internal_id': internal_video_id,
2553                         'uploader': None,
2554                         'upload_date': None,
2555                 }
2556
2557                 self.report_extraction(video_id)
2558                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2559                 try:
2560                         metaXml = urllib2.urlopen(xmlUrl).read()
2561                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2562                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2563                         return
2564
2565                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2566                 try:
2567                         videoNode = mdoc.findall('./video')[0]
2568                         info['description'] = videoNode.findall('./description')[0].text
2569                         info['title'] = videoNode.findall('./caption')[0].text
2570                         info['url'] = videoNode.findall('./file')[0].text
2571                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2572                         info['ext'] = info['url'].rpartition('.')[2]
2573                 except IndexError:
2574                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2575                         return
2576
2577                 return [info]
2578
2579
2580 class XVideosIE(InfoExtractor):
2581         """Information extractor for xvideos.com"""
2582
2583         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2584         IE_NAME = u'xvideos'
2585
2586         def report_webpage(self, video_id):
2587                 """Report information extraction."""
2588                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2589
2590         def report_extraction(self, video_id):
2591                 """Report information extraction."""
2592                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2593
2594         def _real_extract(self, url):
2595                 mobj = re.match(self._VALID_URL, url)
2596                 if mobj is None:
2597                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2598                         return
2599                 video_id = mobj.group(1).decode('utf-8')
2600
2601                 self.report_webpage(video_id)
2602
2603                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2604                 try:
2605                         webpage = urllib2.urlopen(request).read()
2606                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2607                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2608                         return
2609
2610                 self.report_extraction(video_id)
2611
2612
2613                 # Extract video URL
2614                 mobj = re.search(r'flv_url=(.+?)&', webpage)
2615                 if mobj is None:
2616                         self._downloader.trouble(u'ERROR: unable to extract video url')
2617                         return
2618                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2619
2620
2621                 # Extract title
2622                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2623                 if mobj is None:
2624                         self._downloader.trouble(u'ERROR: unable to extract video title')
2625                         return
2626                 video_title = mobj.group(1).decode('utf-8')
2627
2628
2629                 # Extract video thumbnail
2630                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2631                 if mobj is None:
2632                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2633                         return
2634                 video_thumbnail = mobj.group(0).decode('utf-8')
2635
2636                 info = {
2637                         'id': video_id,
2638                         'url': video_url,
2639                         'uploader': None,
2640                         'upload_date': None,
2641                         'title': video_title,
2642                         'ext': 'flv',
2643                         'thumbnail': video_thumbnail,
2644                         'description': None,
2645                 }
2646
2647                 return [info]
2648
2649
2650 class SoundcloudIE(InfoExtractor):
2651         """Information extractor for soundcloud.com
2652            To access the media, the uid of the song and a stream token
2653            must be extracted from the page source and the script must make
2654            a request to media.soundcloud.com/crossdomain.xml. Then
2655            the media can be grabbed by requesting from an url composed
2656            of the stream token and uid
2657          """
2658
2659         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2660         IE_NAME = u'soundcloud'
2661
2662         def __init__(self, downloader=None):
2663                 InfoExtractor.__init__(self, downloader)
2664
2665         def report_webpage(self, video_id):
2666                 """Report information extraction."""
2667                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2668
2669         def report_extraction(self, video_id):
2670                 """Report information extraction."""
2671                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2672
2673         def _real_extract(self, url):
2674                 mobj = re.match(self._VALID_URL, url)
2675                 if mobj is None:
2676                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2677                         return
2678
2679                 # extract uploader (which is in the url)
2680                 uploader = mobj.group(1).decode('utf-8')
2681                 # extract simple title (uploader + slug of song title)
2682                 slug_title =  mobj.group(2).decode('utf-8')
2683                 simple_title = uploader + u'-' + slug_title
2684
2685                 self.report_webpage('%s/%s' % (uploader, slug_title))
2686
2687                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2688                 try:
2689                         webpage = urllib2.urlopen(request).read()
2690                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2691                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2692                         return
2693
2694                 self.report_extraction('%s/%s' % (uploader, slug_title))
2695
2696                 # extract uid and stream token that soundcloud hands out for access
2697                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2698                 if mobj:
2699                         video_id = mobj.group(1)
2700                         stream_token = mobj.group(2)
2701
2702                 # extract unsimplified title
2703                 mobj = re.search('"title":"(.*?)",', webpage)
2704                 if mobj:
2705                         title = mobj.group(1).decode('utf-8')
2706                 else:
2707                         title = simple_title
2708
2709                 # construct media url (with uid/token)
2710                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2711                 mediaURL = mediaURL % (video_id, stream_token)
2712
2713                 # description
2714                 description = u'No description available'
2715                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2716                 if mobj:
2717                         description = mobj.group(1)
2718
2719                 # upload date
2720                 upload_date = None
2721                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2722                 if mobj:
2723                         try:
2724                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2725                         except Exception, e:
2726                                 self._downloader.to_stderr(compat_str(e))
2727
2728                 # for soundcloud, a request to a cross domain is required for cookies
2729                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2730
2731                 return [{
2732                         'id':           video_id.decode('utf-8'),
2733                         'url':          mediaURL,
2734                         'uploader':     uploader.decode('utf-8'),
2735                         'upload_date':  upload_date,
2736                         'title':        title,
2737                         'ext':          u'mp3',
2738                         'description': description.decode('utf-8')
2739                 }]
2740
2741
2742 class InfoQIE(InfoExtractor):
2743         """Information extractor for infoq.com"""
2744
2745         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2746         IE_NAME = u'infoq'
2747
2748         def report_webpage(self, video_id):
2749                 """Report information extraction."""
2750                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2751
2752         def report_extraction(self, video_id):
2753                 """Report information extraction."""
2754                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2755
2756         def _real_extract(self, url):
2757                 mobj = re.match(self._VALID_URL, url)
2758                 if mobj is None:
2759                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2760                         return
2761
2762                 self.report_webpage(url)
2763
2764                 request = urllib2.Request(url)
2765                 try:
2766                         webpage = urllib2.urlopen(request).read()
2767                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2768                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2769                         return
2770
2771                 self.report_extraction(url)
2772
2773
2774                 # Extract video URL
2775                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2776                 if mobj is None:
2777                         self._downloader.trouble(u'ERROR: unable to extract video url')
2778                         return
2779                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2780
2781
2782                 # Extract title
2783                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2784                 if mobj is None:
2785                         self._downloader.trouble(u'ERROR: unable to extract video title')
2786                         return
2787                 video_title = mobj.group(1).decode('utf-8')
2788
2789                 # Extract description
2790                 video_description = u'No description available.'
2791                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2792                 if mobj is not None:
2793                         video_description = mobj.group(1).decode('utf-8')
2794
2795                 video_filename = video_url.split('/')[-1]
2796                 video_id, extension = video_filename.split('.')
2797
2798                 info = {
2799                         'id': video_id,
2800                         'url': video_url,
2801                         'uploader': None,
2802                         'upload_date': None,
2803                         'title': video_title,
2804                         'ext': extension, # Extension is always(?) mp4, but seems to be flv
2805                         'thumbnail': None,
2806                         'description': video_description,
2807                 }
2808
2809                 return [info]
2810
2811 class MixcloudIE(InfoExtractor):
2812         """Information extractor for www.mixcloud.com"""
2813         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2814         IE_NAME = u'mixcloud'
2815
2816         def __init__(self, downloader=None):
2817                 InfoExtractor.__init__(self, downloader)
2818
2819         def report_download_json(self, file_id):
2820                 """Report JSON download."""
2821                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2822
2823         def report_extraction(self, file_id):
2824                 """Report information extraction."""
2825                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2826
2827         def get_urls(self, jsonData, fmt, bitrate='best'):
2828                 """Get urls from 'audio_formats' section in json"""
2829                 file_url = None
2830                 try:
2831                         bitrate_list = jsonData[fmt]
2832                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2833                                 bitrate = max(bitrate_list) # select highest
2834
2835                         url_list = jsonData[fmt][bitrate]
2836                 except TypeError: # we have no bitrate info.
2837                         url_list = jsonData[fmt]
2838                 return url_list
2839
2840         def check_urls(self, url_list):
2841                 """Returns 1st active url from list"""
2842                 for url in url_list:
2843                         try:
2844                                 urllib2.urlopen(url)
2845                                 return url
2846                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2847                                 url = None
2848
2849                 return None
2850
2851         def _print_formats(self, formats):
2852                 print('Available formats:')
2853                 for fmt in formats.keys():
2854                         for b in formats[fmt]:
2855                                 try:
2856                                         ext = formats[fmt][b][0]
2857                                         print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2858                                 except TypeError: # we have no bitrate info
2859                                         ext = formats[fmt][0]
2860                                         print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2861                                         break
2862
2863         def _real_extract(self, url):
2864                 mobj = re.match(self._VALID_URL, url)
2865                 if mobj is None:
2866                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2867                         return
2868                 # extract uploader & filename from url
2869                 uploader = mobj.group(1).decode('utf-8')
2870                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2871
2872                 # construct API request
2873                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2874                 # retrieve .json file with links to files
2875                 request = urllib2.Request(file_url)
2876                 try:
2877                         self.report_download_json(file_url)
2878                         jsonData = urllib2.urlopen(request).read()
2879                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2880                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2881                         return
2882
2883                 # parse JSON
2884                 json_data = json.loads(jsonData)
2885                 player_url = json_data['player_swf_url']
2886                 formats = dict(json_data['audio_formats'])
2887
2888                 req_format = self._downloader.params.get('format', None)
2889                 bitrate = None
2890
2891                 if self._downloader.params.get('listformats', None):
2892                         self._print_formats(formats)
2893                         return
2894
2895                 if req_format is None or req_format == 'best':
2896                         for format_param in formats.keys():
2897                                 url_list = self.get_urls(formats, format_param)
2898                                 # check urls
2899                                 file_url = self.check_urls(url_list)
2900                                 if file_url is not None:
2901                                         break # got it!
2902                 else:
2903                         if req_format not in formats.keys():
2904                                 self._downloader.trouble(u'ERROR: format is not available')
2905                                 return
2906
2907                         url_list = self.get_urls(formats, req_format)
2908                         file_url = self.check_urls(url_list)
2909                         format_param = req_format
2910
2911                 return [{
2912                         'id': file_id.decode('utf-8'),
2913                         'url': file_url.decode('utf-8'),
2914                         'uploader':     uploader.decode('utf-8'),
2915                         'upload_date': None,
2916                         'title': json_data['name'],
2917                         'ext': file_url.split('.')[-1].decode('utf-8'),
2918                         'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2919                         'thumbnail': json_data['thumbnail_url'],
2920                         'description': json_data['description'],
2921                         'player_url': player_url.decode('utf-8'),
2922                 }]
2923
2924 class StanfordOpenClassroomIE(InfoExtractor):
2925         """Information extractor for Stanford's Open ClassRoom"""
2926
2927         _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2928         IE_NAME = u'stanfordoc'
2929
	def report_download_webpage(self, objid):
		"""Report that the webpage for *objid* is being downloaded."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2933
	def report_extraction(self, video_id):
		"""Report that information extraction for *video_id* has started."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2937
	def _real_extract(self, url):
		"""Dispatch on the URL shape.

		Three cases: a specific video (both 'course' and 'video' groups
		match), a course page ('course' only), or the OpenClassroom root
		page.  The latter two build reference entries for every linked
		page and recurse through self.extract(), concatenating results.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('course') and mobj.group('video'): # A specific video
			course = mobj.group('course')
			video = mobj.group('video')
			# Required info_dict fields with no source here are set to None.
			info = {
				'id': course + '_' + video,
				'uploader': None,
				'upload_date': None,
			}

			self.report_extraction(info['id'])
			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
			xmlUrl = baseUrl + video + '.xml'
			try:
				metaXml = urllib2.urlopen(xmlUrl).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
				return
			mdoc = xml.etree.ElementTree.fromstring(metaXml)
			try:
				# Per-video metadata XML carries the title and the file name.
				info['title'] = mdoc.findall('./title')[0].text
				info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
			except IndexError:
				self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
				return
			# Extension is whatever follows the last '.' of the URL.
			info['ext'] = info['url'].rpartition('.')[2]
			return [info]
		elif mobj.group('course'): # A course page
			course = mobj.group('course')
			info = {
				'id': course,
				'type': 'playlist',
				'uploader': None,
				'upload_date': None,
			}

			self.report_download_webpage(info['id'])
			try:
				coursepage = urllib2.urlopen(url).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			# Course title from the page heading; fall back to the id.
			m = re.search('<h1>([^<]+)</h1>', coursepage)
			if m:
				info['title'] = unescapeHTML(m.group(1))
			else:
				info['title'] = info['id']

			m = re.search('<description>([^<]+)</description>', coursepage)
			if m:
				info['description'] = unescapeHTML(m.group(1))

			# Each VideoPage link becomes a reference entry, de-duplicated
			# in original order by orderedSet().
			links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
				}
					for vpage in links]
			results = []
			for entry in info['list']:
				assert entry['type'] == 'reference'
				results += self.extract(entry['url'])
			return results

		else: # Root page
			info = {
				'id': 'Stanford OpenClassroom',
				'type': 'playlist',
				'uploader': None,
				'upload_date': None,
			}

			self.report_download_webpage(info['id'])
			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
			try:
				rootpage = urllib2.urlopen(rootURL).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			info['title'] = info['id']

			# Each CoursePage link becomes a reference entry to extract.
			links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
				}
					for cpage in links]

			results = []
			for entry in info['list']:
				assert entry['type'] == 'reference'
				results += self.extract(entry['url'])
			return results
3040
3041 class MTVIE(InfoExtractor):
3042         """Information extractor for MTV.com"""
3043
3044         _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3045         IE_NAME = u'mtv'
3046
3047         def report_webpage(self, video_id):
3048                 """Report information extraction."""
3049                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3050
3051         def report_extraction(self, video_id):
3052                 """Report information extraction."""
3053                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3054
3055         def _real_extract(self, url):
3056                 mobj = re.match(self._VALID_URL, url)
3057                 if mobj is None:
3058                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3059                         return
3060                 if not mobj.group('proto'):
3061                         url = 'http://' + url
3062                 video_id = mobj.group('videoid')
3063                 self.report_webpage(video_id)
3064
3065                 request = urllib2.Request(url)
3066                 try:
3067                         webpage = urllib2.urlopen(request).read()
3068                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3069                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
3070                         return
3071
3072                 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3073                 if mobj is None:
3074                         self._downloader.trouble(u'ERROR: unable to extract song name')
3075                         return
3076                 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3077                 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3078                 if mobj is None:
3079                         self._downloader.trouble(u'ERROR: unable to extract performer')
3080                         return
3081                 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3082                 video_title = performer + ' - ' + song_name 
3083
3084                 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3085                 if mobj is None:
3086                         self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3087                         return
3088                 mtvn_uri = mobj.group(1)
3089
3090                 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3091                 if mobj is None:
3092                         self._downloader.trouble(u'ERROR: unable to extract content id')
3093                         return
3094                 content_id = mobj.group(1)
3095
3096                 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3097                 self.report_extraction(video_id)
3098                 request = urllib2.Request(videogen_url)
3099                 try:
3100                         metadataXml = urllib2.urlopen(request).read()
3101                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3102                         self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3103                         return
3104
3105                 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3106                 renditions = mdoc.findall('.//rendition')
3107
3108                 # For now, always pick the highest quality.
3109                 rendition = renditions[-1]
3110
3111                 try:
3112                         _,_,ext = rendition.attrib['type'].partition('/')
3113                         format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3114                         video_url = rendition.find('./src').text
3115                 except KeyError:
3116                         self._downloader.trouble('Invalid rendition field.')
3117                         return
3118
3119                 info = {
3120                         'id': video_id,
3121                         'url': video_url,
3122                         'uploader': performer,
3123                         'upload_date': None,
3124                         'title': video_title,
3125                         'ext': ext,
3126                         'format': format,
3127                 }
3128
3129                 return [info]
3130
3131
class YoukuIE(InfoExtractor):
	"""Information extractor for v.youku.com (segmented FLV/MP4 downloads)."""

	_VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
	IE_NAME = u'Youku'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, file_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

	def _gen_sid(self):
		"""Build a pseudo-random session id: millisecond timestamp + two random ints."""
		nowTime = int(time.time() * 1000)
		random1 = random.randint(1000,1998)
		random2 = random.randint(1000,9999)

		return "%d%d%d" %(nowTime,random1,random2)

	def _get_file_ID_mix_string(self, seed):
		"""Derive the character shuffle table from *seed*.

		Implements Youku's LCG-based shuffle: each step picks (and removes)
		one character from the alphabet, so the result is a permutation.
		"""
		mixed = []
		source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
		seed = float(seed)
		for i in range(len(source)):
			# Linear congruential step; float math mirrors the site's JS player.
			seed  =  (seed * 211 + 30031 ) % 65536
			index  =  math.floor(seed / 65536 * len(source) )
			mixed.append(source[int(index)])
			source.remove(source[int(index)])
		return mixed

	def _get_file_id(self, fileId, seed):
		"""Decode a '*'-separated index string into the real file id via the shuffle table."""
		mixed = self._get_file_ID_mix_string(seed)
		ids = fileId.split('*')
		realId = []
		for ch in ids:
			if ch:
				realId.append(mixed[int(ch)])
		return ''.join(realId)

	def _real_extract(self, url):
		"""Return one info dict per video segment of the given Youku page."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group('ID')

		info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

		request = urllib2.Request(info_url, None, std_headers)
		try:
			self.report_download_webpage(video_id)
			jsondata = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
			return

		self.report_extraction(video_id)
		try:
			config = json.loads(jsondata)

			video_title =  config['data'][0]['title']
			seed = config['data'][0]['seed']

			format = self._downloader.params.get('format', None)
			supported_format = config['data'][0]['streamfileids'].keys()

			# Map the user's format request onto what the site offers.
			if format is None or format == 'best':
				if 'hd2' in supported_format:
					format = 'hd2'
				else:
					format = 'flv'
				ext = u'flv'
			elif format == 'worst':
				format = 'mp4'
				ext = u'mp4'
			else:
				format = 'flv'
				ext = u'flv'


			fileid = config['data'][0]['streamfileids'][format]
			seg_number = len(config['data'][0]['segs'][format])

			keys=[]
			for i in xrange(seg_number):
				keys.append(config['data'][0]['segs'][format][i]['k'])

			#TODO check error
			#youku only could be viewed from mainland china
		except (ValueError, KeyError, IndexError, TypeError):
			# Narrowed from a bare except: only catch failures of json
			# parsing and of the expected response structure, so that
			# KeyboardInterrupt/SystemExit are no longer swallowed.
			self._downloader.trouble(u'ERROR: unable to extract info section')
			return

		files_info=[]
		sid = self._gen_sid()
		fileid = self._get_file_id(fileid, seed)

		#column 8,9 of fileid represent the segment number
		#fileid[7:9] should be changed
		for index, key in enumerate(keys):

			temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
			download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

			info = {
				'id': '%s_part%02d' % (video_id, index),
				'url': download_url,
				'uploader': None,
				'upload_date': None,
				'title': video_title,
				'ext': ext,
			}
			files_info.append(info)

		return files_info
3252
3253
3254 class XNXXIE(InfoExtractor):
3255         """Information extractor for xnxx.com"""
3256
3257         _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
3258         IE_NAME = u'xnxx'
3259         VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3260         VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3261         VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3262
3263         def report_webpage(self, video_id):
3264                 """Report information extraction"""
3265                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3266
3267         def report_extraction(self, video_id):
3268                 """Report information extraction"""
3269                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3270
3271         def _real_extract(self, url):
3272                 mobj = re.match(self._VALID_URL, url)
3273                 if mobj is None:
3274                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3275                         return
3276                 video_id = mobj.group(1).decode('utf-8')
3277
3278                 self.report_webpage(video_id)
3279
3280                 # Get webpage content
3281                 try:
3282                         webpage = urllib2.urlopen(url).read()
3283                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3284                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3285                         return
3286
3287                 result = re.search(self.VIDEO_URL_RE, webpage)
3288                 if result is None:
3289                         self._downloader.trouble(u'ERROR: unable to extract video url')
3290                         return
3291                 video_url = urllib.unquote(result.group(1).decode('utf-8'))
3292
3293                 result = re.search(self.VIDEO_TITLE_RE, webpage)
3294                 if result is None:
3295                         self._downloader.trouble(u'ERROR: unable to extract video title')
3296                         return
3297                 video_title = result.group(1).decode('utf-8')
3298
3299                 result = re.search(self.VIDEO_THUMB_RE, webpage)
3300                 if result is None:
3301                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3302                         return
3303                 video_thumbnail = result.group(1).decode('utf-8')
3304
3305                 return [{
3306                         'id': video_id,
3307                         'url': video_url,
3308                         'uploader': None,
3309                         'upload_date': None,
3310                         'title': video_title,
3311                         'ext': 'flv',
3312                         'thumbnail': video_thumbnail,
3313                         'description': None,
3314                 }]
3315
3316
3317 class GooglePlusIE(InfoExtractor):
3318         """Information extractor for plus.google.com."""
3319
3320         _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
3321         IE_NAME = u'plus.google'
3322
3323         def __init__(self, downloader=None):
3324                 InfoExtractor.__init__(self, downloader)
3325
3326         def report_extract_entry(self, url):
3327                 """Report downloading extry"""
3328                 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))
3329
3330         def report_date(self, upload_date):
3331                 """Report downloading extry"""
3332                 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3333
3334         def report_uploader(self, uploader):
3335                 """Report downloading extry"""
3336                 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))
3337
3338         def report_title(self, video_title):
3339                 """Report downloading extry"""
3340                 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))
3341
3342         def report_extract_vid_page(self, video_page):
3343                 """Report information extraction."""
3344                 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
3345
3346         def _real_extract(self, url):
3347                 # Extract id from URL
3348                 mobj = re.match(self._VALID_URL, url)
3349                 if mobj is None:
3350                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3351                         return
3352
3353                 post_url = mobj.group(0)
3354                 video_id = mobj.group(2)
3355
3356                 video_extension = 'flv'
3357
3358                 # Step 1, Retrieve post webpage to extract further information
3359                 self.report_extract_entry(post_url)
3360                 request = urllib2.Request(post_url)
3361                 try:
3362                         webpage = urllib2.urlopen(request).read()
3363                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3364                         self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3365                         return
3366
3367                 # Extract update date
3368                 upload_date = None
3369                 pattern = 'title="Timestamp">(.*?)</a>'
3370                 mobj = re.search(pattern, webpage)
3371                 if mobj:
3372                         upload_date = mobj.group(1)
3373                         # Convert timestring to a format suitable for filename
3374                         upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3375                         upload_date = upload_date.strftime('%Y%m%d')
3376                 self.report_date(upload_date)
3377
3378                 # Extract uploader
3379                 uploader = None
3380                 pattern = r'rel\="author".*?>(.*?)</a>'
3381                 mobj = re.search(pattern, webpage)
3382                 if mobj:
3383                         uploader = mobj.group(1)
3384                 self.report_uploader(uploader)
3385
3386                 # Extract title
3387                 # Get the first line for title
3388                 video_title = u'NA'
3389                 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3390                 mobj = re.search(pattern, webpage)
3391                 if mobj:
3392                         video_title = mobj.group(1)
3393                 self.report_title(video_title)
3394
3395                 # Step 2, Stimulate clicking the image box to launch video
3396                 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3397                 mobj = re.search(pattern, webpage)
3398                 if mobj is None:
3399                         self._downloader.trouble(u'ERROR: unable to extract video page URL')
3400
3401                 video_page = mobj.group(1)
3402                 request = urllib2.Request(video_page)
3403                 try:
3404                         webpage = urllib2.urlopen(request).read()
3405                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3406                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3407                         return
3408                 self.report_extract_vid_page(video_page)
3409
3410
3411                 # Extract video links on video page
3412                 """Extract video links of all sizes"""
3413                 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3414                 mobj = re.findall(pattern, webpage)
3415                 if len(mobj) == 0:
3416                         self._downloader.trouble(u'ERROR: unable to extract video links')
3417
3418                 # Sort in resolution
3419                 links = sorted(mobj)
3420
3421                 # Choose the lowest of the sort, i.e. highest resolution
3422                 video_url = links[-1]
3423                 # Only get the url. The resolution part in the tuple has no use anymore
3424                 video_url = video_url[-1]
3425                 # Treat escaped \u0026 style hex
3426                 video_url = unicode(video_url, "unicode_escape")
3427
3428
3429                 return [{
3430                         'id':           video_id.decode('utf-8'),
3431                         'url':          video_url,
3432                         'uploader':     uploader.decode('utf-8'),
3433                         'upload_date':  upload_date.decode('utf-8'),
3434                         'title':        video_title.decode('utf-8'),
3435                         'ext':          video_extension.decode('utf-8'),
3436                 }]