default info_dict['format'] to info_dict['ext'] and make the YT one more verbose
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import datetime
5 import HTMLParser
6 import httplib
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import urllib
13 import urllib2
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 from urlparse import parse_qs
19
20 try:
21         import cStringIO as StringIO
22 except ImportError:
23         import StringIO
24
25 from utils import *
26
27
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor ("IE") receives a URL and produces the
	metadata describing the video (or videos) the URL refers to: the
	real video URL, the title, the uploader and so on.  That metadata is
	collected in a dictionary which is handed to the FileDownloader,
	which in turn may download the video to disk, print information
	about it, etc.

	Every dictionary must carry these keys:

	id:         Video identifier.
	url:        Final video URL.
	uploader:   Nickname of the video uploader.
	title:      Video title, unescaped.
	ext:        Video filename extension.
	player_url: SWF Player URL (may be None).

	These keys are optional:

	format:         The video format, defaults to ext. Used by --get-format
	thumbnail:      Full URL to a video thumbnail image.
	description     One-line video description.

	Concrete extractors override _real_initialize() and _real_extract()
	and define a _VALID_URL regular expression; they should normally
	also be registered in the list of extractors.

	_real_extract() must return a *list* of information dictionaries of
	the shape described above.
	"""

	# Class-level defaults; instances overwrite both in __init__().
	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Create the extractor, optionally attaching *downloader*."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Return True if this IE can handle the given URL."""
		return bool(re.match(self._VALID_URL, url))

	def initialize(self):
		"""Run one-time setup (authentication, etc.) unless already done."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Initialize if needed, then return a list of info dictionaries."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the FileDownloader this IE reports to."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Actual initialization; subclasses override as needed."""
		pass

	def _real_extract(self, url):
		"""Actual extraction; subclasses must override this."""
		pass
97
98 class YoutubeIE(InfoExtractor):
99         """Information extractor for youtube.com."""
100
101         _VALID_URL = r"""^
102                          (
103                              (?:https?://)?                                       # http(s):// (optional)
104                              (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
105                                 tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
106                              (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
107                              (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
108                              (?:                                                  # the various things that can precede the ID:
109                                  (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
110                                  |(?:                                             # or the v= param in all its forms
111                                      (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
112                                      (?:\?|\#!?)                                  # the params delimiter ? or # or #!
113                                      (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
114                                      v=
115                                  )
116                              )?                                                   # optional -> youtube.com/xxxx is OK
117                          )?                                                       # all until now is optional -> you can pass the naked ID
118                          ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
119                          (?(1).+)?                                                # if we found the ID, everything can follow
120                          $"""
121         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
122         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
123         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
124         _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
125         _NETRC_MACHINE = 'youtube'
126         # Listed in order of quality
127         _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
128         _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
129         _video_extensions = {
130                 '13': '3gp',
131                 '17': 'mp4',
132                 '18': 'mp4',
133                 '22': 'mp4',
134                 '37': 'mp4',
135                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
136                 '43': 'webm',
137                 '44': 'webm',
138                 '45': 'webm',
139                 '46': 'webm',
140         }
141         _video_dimensions = {
142                 '5': '240x400',
143                 '6': '???',
144                 '13': '???',
145                 '17': '144x176',
146                 '18': '360x640',
147                 '22': '720x1280',
148                 '34': '360x640',
149                 '35': '480x854',
150                 '37': '1080x1920',
151                 '38': '3072x4096',
152                 '43': '360x640',
153                 '44': '480x854',
154                 '45': '720x1280',
155                 '46': '1080x1920',
156         }       
157         IE_NAME = u'youtube'
158
159         def suitable(self, url):
160                 """Receives a URL and returns True if suitable for this IE."""
161                 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
162
163         def report_lang(self):
164                 """Report attempt to set language."""
165                 self._downloader.to_screen(u'[youtube] Setting language')
166
167         def report_login(self):
168                 """Report attempt to log in."""
169                 self._downloader.to_screen(u'[youtube] Logging in')
170
171         def report_age_confirmation(self):
172                 """Report attempt to confirm age."""
173                 self._downloader.to_screen(u'[youtube] Confirming age')
174
175         def report_video_webpage_download(self, video_id):
176                 """Report attempt to download video webpage."""
177                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
178
179         def report_video_info_webpage_download(self, video_id):
180                 """Report attempt to download video info webpage."""
181                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
182
183         def report_video_subtitles_download(self, video_id):
184                 """Report attempt to download video info webpage."""
185                 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
186
187         def report_information_extraction(self, video_id):
188                 """Report attempt to extract video information."""
189                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
190
191         def report_unavailable_format(self, video_id, format):
192                 """Report extracted video URL."""
193                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
194
195         def report_rtmp_download(self):
196                 """Indicate the download will use the RTMP protocol."""
197                 self._downloader.to_screen(u'[youtube] RTMP download detected')
198
199         def _closed_captions_xml_to_srt(self, xml_string):
200                 srt = ''
201                 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
202                 # TODO parse xml instead of regex
203                 for n, (start, dur_tag, dur, caption) in enumerate(texts):
204                         if not dur: dur = '4'
205                         start = float(start)
206                         end = start + float(dur)
207                         start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
208                         end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
209                         caption = unescapeHTML(caption)
210                         caption = unescapeHTML(caption) # double cycle, intentional
211                         srt += str(n+1) + '\n'
212                         srt += start + ' --> ' + end + '\n'
213                         srt += caption + '\n\n'
214                 return srt
215
216         def _print_formats(self, formats):
217                 print('Available formats:')
218                 for x in formats:
219                         print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
220
221         def _real_initialize(self):
222                 if self._downloader is None:
223                         return
224
225                 username = None
226                 password = None
227                 downloader_params = self._downloader.params
228
229                 # Attempt to use provided username and password or .netrc data
230                 if downloader_params.get('username', None) is not None:
231                         username = downloader_params['username']
232                         password = downloader_params['password']
233                 elif downloader_params.get('usenetrc', False):
234                         try:
235                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
236                                 if info is not None:
237                                         username = info[0]
238                                         password = info[2]
239                                 else:
240                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
241                         except (IOError, netrc.NetrcParseError), err:
242                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
243                                 return
244
245                 # Set language
246                 request = urllib2.Request(self._LANG_URL)
247                 try:
248                         self.report_lang()
249                         urllib2.urlopen(request).read()
250                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
251                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
252                         return
253
254                 # No authentication to be performed
255                 if username is None:
256                         return
257
258                 # Log in
259                 login_form = {
260                                 'current_form': 'loginForm',
261                                 'next':         '/',
262                                 'action_login': 'Log In',
263                                 'username':     username,
264                                 'password':     password,
265                                 }
266                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
267                 try:
268                         self.report_login()
269                         login_results = urllib2.urlopen(request).read()
270                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
271                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
272                                 return
273                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
274                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
275                         return
276
277                 # Confirm age
278                 age_form = {
279                                 'next_url':             '/',
280                                 'action_confirm':       'Confirm',
281                                 }
282                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
283                 try:
284                         self.report_age_confirmation()
285                         age_results = urllib2.urlopen(request).read()
286                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
287                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
288                         return
289
290         def _real_extract(self, url):
291                 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
292                 mobj = re.search(self._NEXT_URL_RE, url)
293                 if mobj:
294                         url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')
295
296                 # Extract video id from URL
297                 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
298                 if mobj is None:
299                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
300                         return
301                 video_id = mobj.group(2)
302
303                 # Get video webpage
304                 self.report_video_webpage_download(video_id)
305                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
306                 try:
307                         video_webpage = urllib2.urlopen(request).read()
308                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
309                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
310                         return
311
312                 # Attempt to extract SWF player URL
313                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
314                 if mobj is not None:
315                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
316                 else:
317                         player_url = None
318
319                 # Get video info
320                 self.report_video_info_webpage_download(video_id)
321                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
322                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
323                                         % (video_id, el_type))
324                         request = urllib2.Request(video_info_url)
325                         try:
326                                 video_info_webpage = urllib2.urlopen(request).read()
327                                 video_info = parse_qs(video_info_webpage)
328                                 if 'token' in video_info:
329                                         break
330                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
331                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
332                                 return
333                 if 'token' not in video_info:
334                         if 'reason' in video_info:
335                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
336                         else:
337                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
338                         return
339
340                 # Check for "rental" videos
341                 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
342                         self._downloader.trouble(u'ERROR: "rental" videos not supported')
343                         return
344
345                 # Start extracting information
346                 self.report_information_extraction(video_id)
347
348                 # uploader
349                 if 'author' not in video_info:
350                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
351                         return
352                 video_uploader = urllib.unquote_plus(video_info['author'][0])
353
354                 # title
355                 if 'title' not in video_info:
356                         self._downloader.trouble(u'ERROR: unable to extract video title')
357                         return
358                 video_title = urllib.unquote_plus(video_info['title'][0])
359                 video_title = video_title.decode('utf-8')
360
361                 # thumbnail image
362                 if 'thumbnail_url' not in video_info:
363                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
364                         video_thumbnail = ''
365                 else:   # don't panic if we can't find it
366                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
367
368                 # upload date
369                 upload_date = u'NA'
370                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
371                 if mobj is not None:
372                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
373                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
374                         for expression in format_expressions:
375                                 try:
376                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
377                                 except:
378                                         pass
379
380                 # description
381                 video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
382                 if video_description: video_description = clean_html(video_description)
383                 else: video_description = ''
384                         
385                 # closed captions
386                 video_subtitles = None
387                 if self._downloader.params.get('writesubtitles', False):
388                         try:
389                                 self.report_video_subtitles_download(video_id)
390                                 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
391                                 try:
392                                         srt_list = urllib2.urlopen(request).read()
393                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
394                                         raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
395                                 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
396                                 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
397                                 if not srt_lang_list:
398                                         raise Trouble(u'WARNING: video has no closed captions')
399                                 if self._downloader.params.get('subtitleslang', False):
400                                         srt_lang = self._downloader.params.get('subtitleslang')
401                                 elif 'en' in srt_lang_list:
402                                         srt_lang = 'en'
403                                 else:
404                                         srt_lang = srt_lang_list.keys()[0]
405                                 if not srt_lang in srt_lang_list:
406                                         raise Trouble(u'WARNING: no closed captions found in the specified language')
407                                 request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
408                                 try:
409                                         srt_xml = urllib2.urlopen(request).read()
410                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
411                                         raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
412                                 if not srt_xml:
413                                         raise Trouble(u'WARNING: unable to download video subtitles')
414                                 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
415                         except Trouble as trouble:
416                                 self._downloader.trouble(trouble[0])
417
418                 if 'length_seconds' not in video_info:
419                         self._downloader.trouble(u'WARNING: unable to extract video duration')
420                         video_duration = ''
421                 else:
422                         video_duration = urllib.unquote_plus(video_info['length_seconds'][0])
423
424                 # token
425                 video_token = urllib.unquote_plus(video_info['token'][0])
426
427                 # Decide which formats to download
428                 req_format = self._downloader.params.get('format', None)
429
430                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
431                         self.report_rtmp_download()
432                         video_url_list = [(None, video_info['conn'][0])]
433                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
434                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
435                         url_data = [parse_qs(uds) for uds in url_data_strs]
436                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
437                         url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
438
439                         format_limit = self._downloader.params.get('format_limit', None)
440                         available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
441                         if format_limit is not None and format_limit in available_formats:
442                                 format_list = available_formats[available_formats.index(format_limit):]
443                         else:
444                                 format_list = available_formats
445                         existing_formats = [x for x in format_list if x in url_map]
446                         if len(existing_formats) == 0:
447                                 self._downloader.trouble(u'ERROR: no known formats available for video')
448                                 return
449                         if self._downloader.params.get('listformats', None):
450                                 self._print_formats(existing_formats)
451                                 return
452                         if req_format is None or req_format == 'best':
453                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
454                         elif req_format == 'worst':
455                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
456                         elif req_format in ('-1', 'all'):
457                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
458                         else:
459                                 # Specific formats. We pick the first in a slash-delimeted sequence.
460                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
461                                 req_formats = req_format.split('/')
462                                 video_url_list = None
463                                 for rf in req_formats:
464                                         if rf in url_map:
465                                                 video_url_list = [(rf, url_map[rf])]
466                                                 break
467                                 if video_url_list is None:
468                                         self._downloader.trouble(u'ERROR: requested format not available')
469                                         return
470                 else:
471                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
472                         return
473
474                 results = []
475                 for format_param, video_real_url in video_url_list:
476                         # Extension
477                         video_extension = self._video_extensions.get(format_param, 'flv')
478
479                         video_format = '{} - {}'.format(format_param.decode('utf-8') if format_param else video_extension.decode('utf-8'),
480                                                             self._video_dimensions.get(format_param, '???'))
481
482                         results.append({
483                                 'id':           video_id.decode('utf-8'),
484                                 'url':          video_real_url.decode('utf-8'),
485                                 'uploader':     video_uploader.decode('utf-8'),
486                                 'upload_date':  upload_date,
487                                 'title':        video_title,
488                                 'ext':          video_extension.decode('utf-8'),
489                                 'format':       video_format,
490                                 'thumbnail':    video_thumbnail.decode('utf-8'),
491                                 'description':  video_description,
492                                 'player_url':   player_url,
493                                 'subtitles':    video_subtitles,
494                                 'duration':             video_duration
495                         })
496                 return results
497
498
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	# Group 1 is the video id, group 2 the display-title slug.
	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	# Family-filter disclaimer page fetched during initialization.
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	# Endpoint that receives the age-confirmation form POST.
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	IE_NAME = u'metacafe'
506
507         def __init__(self, downloader=None):
508                 InfoExtractor.__init__(self, downloader)
509
	def report_disclaimer(self):
		"""Report that the family-filter disclaimer page is being retrieved."""
		self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
513
	def report_age_confirmation(self):
		"""Report that the age-confirmation form is being submitted."""
		self._downloader.to_screen(u'[metacafe] Confirming age')
517
	def report_download_webpage(self, video_id):
		"""Report that the watch page for *video_id* is being downloaded."""
		self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
521
	def report_extraction(self, video_id):
		"""Report that information is being extracted for *video_id*."""
		self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
525
	def _real_initialize(self):
		"""Fetch the disclaimer page and POST the age confirmation so that
		family-filtered videos become accessible for this session.

		Failures are reported via the downloader's trouble() and abort
		initialization.
		"""
		# Retrieve disclaimer
		# NOTE(review): the response body is read but unused — presumably the
		# request matters only for session cookies set by a globally installed
		# opener; confirm against the FileDownloader setup.
		request = urllib2.Request(self._DISCLAIMER)
		try:
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
			return

		# Confirm age
		disclaimer_form = {
			'filters': '0',
			'submit': "Continue - I'm over 18",
			}
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
		try:
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
			return
548
549         def _real_extract(self, url):
550                 # Extract id and simplified title from URL
551                 mobj = re.match(self._VALID_URL, url)
552                 if mobj is None:
553                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
554                         return
555
556                 video_id = mobj.group(1)
557
558                 # Check if video comes from YouTube
559                 mobj2 = re.match(r'^yt-(.*)$', video_id)
560                 if mobj2 is not None:
561                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
562                         return
563
564                 # Retrieve video webpage to extract further information
565                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
566                 try:
567                         self.report_download_webpage(video_id)
568                         webpage = urllib2.urlopen(request).read()
569                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
570                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
571                         return
572
573                 # Extract URL, uploader and title from webpage
574                 self.report_extraction(video_id)
575                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
576                 if mobj is not None:
577                         mediaURL = urllib.unquote(mobj.group(1))
578                         video_extension = mediaURL[-3:]
579
580                         # Extract gdaKey if available
581                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
582                         if mobj is None:
583                                 video_url = mediaURL
584                         else:
585                                 gdaKey = mobj.group(1)
586                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
587                 else:
588                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
589                         if mobj is None:
590                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
591                                 return
592                         vardict = parse_qs(mobj.group(1))
593                         if 'mediaData' not in vardict:
594                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
595                                 return
596                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
597                         if mobj is None:
598                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
599                                 return
600                         mediaURL = mobj.group(1).replace('\\/', '/')
601                         video_extension = mediaURL[-3:]
602                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
603
604                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
605                 if mobj is None:
606                         self._downloader.trouble(u'ERROR: unable to extract title')
607                         return
608                 video_title = mobj.group(1).decode('utf-8')
609
610                 mobj = re.search(r'submitter=(.*?);', webpage)
611                 if mobj is None:
612                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
613                         return
614                 video_uploader = mobj.group(1)
615
616                 return [{
617                         'id':           video_id.decode('utf-8'),
618                         'url':          video_url.decode('utf-8'),
619                         'uploader':     video_uploader.decode('utf-8'),
620                         'upload_date':  u'NA',
621                         'title':        video_title,
622                         'ext':          video_extension.decode('utf-8'),
623                         'player_url':   None,
624                 }]
625
626
627 class DailymotionIE(InfoExtractor):
628         """Information Extractor for Dailymotion"""
629
630         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
631         IE_NAME = u'dailymotion'
632
633         def __init__(self, downloader=None):
634                 InfoExtractor.__init__(self, downloader)
635
636         def report_download_webpage(self, video_id):
637                 """Report webpage download."""
638                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
639
640         def report_extraction(self, video_id):
641                 """Report information extraction."""
642                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
643
644         def _real_extract(self, url):
645                 # Extract id and simplified title from URL
646                 mobj = re.match(self._VALID_URL, url)
647                 if mobj is None:
648                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
649                         return
650
651                 video_id = mobj.group(1).split('_')[0].split('?')[0]
652
653                 video_extension = 'mp4'
654
655                 # Retrieve video webpage to extract further information
656                 request = urllib2.Request(url)
657                 request.add_header('Cookie', 'family_filter=off')
658                 try:
659                         self.report_download_webpage(video_id)
660                         webpage = urllib2.urlopen(request).read()
661                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
662                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
663                         return
664
665                 # Extract URL, uploader and title from webpage
666                 self.report_extraction(video_id)
667                 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
668                 if mobj is None:
669                         self._downloader.trouble(u'ERROR: unable to extract media URL')
670                         return
671                 flashvars = urllib.unquote(mobj.group(1))
672
673                 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
674                         if key in flashvars:
675                                 max_quality = key
676                                 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
677                                 break
678                 else:
679                         self._downloader.trouble(u'ERROR: unable to extract video URL')
680                         return
681
682                 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
683                 if mobj is None:
684                         self._downloader.trouble(u'ERROR: unable to extract video URL')
685                         return
686
687                 video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')
688
689                 # TODO: support choosing qualities
690
691                 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
692                 if mobj is None:
693                         self._downloader.trouble(u'ERROR: unable to extract title')
694                         return
695                 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
696
697                 video_uploader = u'NA'
698                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
699                 if mobj is None:
700                         # lookin for official user
701                         mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
702                         if mobj_official is None:
703                                 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
704                         else:
705                                 video_uploader = mobj_official.group(1)
706                 else:
707                         video_uploader = mobj.group(1)
708
709                 video_upload_date = u'NA'
710                 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
711                 if mobj is not None:
712                         video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
713
714                 return [{
715                         'id':           video_id.decode('utf-8'),
716                         'url':          video_url.decode('utf-8'),
717                         'uploader':     video_uploader.decode('utf-8'),
718                         'upload_date':  video_upload_date,
719                         'title':        video_title,
720                         'ext':          video_extension.decode('utf-8'),
721                         'player_url':   None,
722                 }]
723
724
725 class GoogleIE(InfoExtractor):
726         """Information extractor for video.google.com."""
727
728         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
729         IE_NAME = u'video.google'
730
731         def __init__(self, downloader=None):
732                 InfoExtractor.__init__(self, downloader)
733
734         def report_download_webpage(self, video_id):
735                 """Report webpage download."""
736                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
737
738         def report_extraction(self, video_id):
739                 """Report information extraction."""
740                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
741
742         def _real_extract(self, url):
743                 # Extract id from URL
744                 mobj = re.match(self._VALID_URL, url)
745                 if mobj is None:
746                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
747                         return
748
749                 video_id = mobj.group(1)
750
751                 video_extension = 'mp4'
752
753                 # Retrieve video webpage to extract further information
754                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
755                 try:
756                         self.report_download_webpage(video_id)
757                         webpage = urllib2.urlopen(request).read()
758                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
759                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
760                         return
761
762                 # Extract URL, uploader, and title from webpage
763                 self.report_extraction(video_id)
764                 mobj = re.search(r"download_url:'([^']+)'", webpage)
765                 if mobj is None:
766                         video_extension = 'flv'
767                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
768                 if mobj is None:
769                         self._downloader.trouble(u'ERROR: unable to extract media URL')
770                         return
771                 mediaURL = urllib.unquote(mobj.group(1))
772                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
773                 mediaURL = mediaURL.replace('\\x26', '\x26')
774
775                 video_url = mediaURL
776
777                 mobj = re.search(r'<title>(.*)</title>', webpage)
778                 if mobj is None:
779                         self._downloader.trouble(u'ERROR: unable to extract title')
780                         return
781                 video_title = mobj.group(1).decode('utf-8')
782
783                 # Extract video description
784                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
785                 if mobj is None:
786                         self._downloader.trouble(u'ERROR: unable to extract video description')
787                         return
788                 video_description = mobj.group(1).decode('utf-8')
789                 if not video_description:
790                         video_description = 'No description available.'
791
792                 # Extract video thumbnail
793                 if self._downloader.params.get('forcethumbnail', False):
794                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
795                         try:
796                                 webpage = urllib2.urlopen(request).read()
797                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
798                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
799                                 return
800                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
801                         if mobj is None:
802                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
803                                 return
804                         video_thumbnail = mobj.group(1)
805                 else:   # we need something to pass to process_info
806                         video_thumbnail = ''
807
808                 return [{
809                         'id':           video_id.decode('utf-8'),
810                         'url':          video_url.decode('utf-8'),
811                         'uploader':     u'NA',
812                         'upload_date':  u'NA',
813                         'title':        video_title,
814                         'ext':          video_extension.decode('utf-8'),
815                         'player_url':   None,
816                 }]
817
818
819 class PhotobucketIE(InfoExtractor):
820         """Information extractor for photobucket.com."""
821
822         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
823         IE_NAME = u'photobucket'
824
825         def __init__(self, downloader=None):
826                 InfoExtractor.__init__(self, downloader)
827
828         def report_download_webpage(self, video_id):
829                 """Report webpage download."""
830                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
831
832         def report_extraction(self, video_id):
833                 """Report information extraction."""
834                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
835
836         def _real_extract(self, url):
837                 # Extract id from URL
838                 mobj = re.match(self._VALID_URL, url)
839                 if mobj is None:
840                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
841                         return
842
843                 video_id = mobj.group(1)
844
845                 video_extension = 'flv'
846
847                 # Retrieve video webpage to extract further information
848                 request = urllib2.Request(url)
849                 try:
850                         self.report_download_webpage(video_id)
851                         webpage = urllib2.urlopen(request).read()
852                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
853                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
854                         return
855
856                 # Extract URL, uploader, and title from webpage
857                 self.report_extraction(video_id)
858                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
859                 if mobj is None:
860                         self._downloader.trouble(u'ERROR: unable to extract media URL')
861                         return
862                 mediaURL = urllib.unquote(mobj.group(1))
863
864                 video_url = mediaURL
865
866                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
867                 if mobj is None:
868                         self._downloader.trouble(u'ERROR: unable to extract title')
869                         return
870                 video_title = mobj.group(1).decode('utf-8')
871
872                 video_uploader = mobj.group(2).decode('utf-8')
873
874                 return [{
875                         'id':           video_id.decode('utf-8'),
876                         'url':          video_url.decode('utf-8'),
877                         'uploader':     video_uploader,
878                         'upload_date':  u'NA',
879                         'title':        video_title,
880                         'ext':          video_extension.decode('utf-8'),
881                         'player_url':   None,
882                 }]
883
884
885 class YahooIE(InfoExtractor):
886         """Information extractor for video.yahoo.com."""
887
888         # _VALID_URL matches all Yahoo! Video URLs
889         # _VPAGE_URL matches only the extractable '/watch/' URLs
890         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
891         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
892         IE_NAME = u'video.yahoo'
893
894         def __init__(self, downloader=None):
895                 InfoExtractor.__init__(self, downloader)
896
897         def report_download_webpage(self, video_id):
898                 """Report webpage download."""
899                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
900
901         def report_extraction(self, video_id):
902                 """Report information extraction."""
903                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
904
905         def _real_extract(self, url, new_video=True):
906                 # Extract ID from URL
907                 mobj = re.match(self._VALID_URL, url)
908                 if mobj is None:
909                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
910                         return
911
912                 video_id = mobj.group(2)
913                 video_extension = 'flv'
914
915                 # Rewrite valid but non-extractable URLs as
916                 # extractable English language /watch/ URLs
917                 if re.match(self._VPAGE_URL, url) is None:
918                         request = urllib2.Request(url)
919                         try:
920                                 webpage = urllib2.urlopen(request).read()
921                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
922                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
923                                 return
924
925                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
926                         if mobj is None:
927                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
928                                 return
929                         yahoo_id = mobj.group(1)
930
931                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
932                         if mobj is None:
933                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
934                                 return
935                         yahoo_vid = mobj.group(1)
936
937                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
938                         return self._real_extract(url, new_video=False)
939
940                 # Retrieve video webpage to extract further information
941                 request = urllib2.Request(url)
942                 try:
943                         self.report_download_webpage(video_id)
944                         webpage = urllib2.urlopen(request).read()
945                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
946                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
947                         return
948
949                 # Extract uploader and title from webpage
950                 self.report_extraction(video_id)
951                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
952                 if mobj is None:
953                         self._downloader.trouble(u'ERROR: unable to extract video title')
954                         return
955                 video_title = mobj.group(1).decode('utf-8')
956
957                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
958                 if mobj is None:
959                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
960                         return
961                 video_uploader = mobj.group(1).decode('utf-8')
962
963                 # Extract video thumbnail
964                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
965                 if mobj is None:
966                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
967                         return
968                 video_thumbnail = mobj.group(1).decode('utf-8')
969
970                 # Extract video description
971                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
972                 if mobj is None:
973                         self._downloader.trouble(u'ERROR: unable to extract video description')
974                         return
975                 video_description = mobj.group(1).decode('utf-8')
976                 if not video_description:
977                         video_description = 'No description available.'
978
979                 # Extract video height and width
980                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
981                 if mobj is None:
982                         self._downloader.trouble(u'ERROR: unable to extract video height')
983                         return
984                 yv_video_height = mobj.group(1)
985
986                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
987                 if mobj is None:
988                         self._downloader.trouble(u'ERROR: unable to extract video width')
989                         return
990                 yv_video_width = mobj.group(1)
991
992                 # Retrieve video playlist to extract media URL
993                 # I'm not completely sure what all these options are, but we
994                 # seem to need most of them, otherwise the server sends a 401.
995                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
996                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
997                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
998                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
999                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1000                 try:
1001                         self.report_download_webpage(video_id)
1002                         webpage = urllib2.urlopen(request).read()
1003                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1004                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1005                         return
1006
1007                 # Extract media URL from playlist XML
1008                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1009                 if mobj is None:
1010                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1011                         return
1012                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1013                 video_url = unescapeHTML(video_url)
1014
1015                 return [{
1016                         'id':           video_id.decode('utf-8'),
1017                         'url':          video_url,
1018                         'uploader':     video_uploader,
1019                         'upload_date':  u'NA',
1020                         'title':        video_title,
1021                         'ext':          video_extension.decode('utf-8'),
1022                         'thumbnail':    video_thumbnail.decode('utf-8'),
1023                         'description':  video_description,
1024                         'thumbnail':    video_thumbnail,
1025                         'player_url':   None,
1026                 }]
1027
1028
1029 class VimeoIE(InfoExtractor):
1030         """Information extractor for vimeo.com."""
1031
1032         # _VALID_URL matches Vimeo URLs
1033         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
1034         IE_NAME = u'vimeo'
1035
1036         def __init__(self, downloader=None):
1037                 InfoExtractor.__init__(self, downloader)
1038
1039         def report_download_webpage(self, video_id):
1040                 """Report webpage download."""
1041                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1042
1043         def report_extraction(self, video_id):
1044                 """Report information extraction."""
1045                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1046
1047         def _real_extract(self, url, new_video=True):
1048                 # Extract ID from URL
1049                 mobj = re.match(self._VALID_URL, url)
1050                 if mobj is None:
1051                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1052                         return
1053
1054                 video_id = mobj.group(1)
1055
1056                 # Retrieve video webpage to extract further information
1057                 request = urllib2.Request(url, None, std_headers)
1058                 try:
1059                         self.report_download_webpage(video_id)
1060                         webpage = urllib2.urlopen(request).read()
1061                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1062                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1063                         return
1064
1065                 # Now we begin extracting as much information as we can from what we
1066                 # retrieved. First we extract the information common to all extractors,
1067                 # and latter we extract those that are Vimeo specific.
1068                 self.report_extraction(video_id)
1069
1070                 # Extract the config JSON
1071                 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1072                 try:
1073                         config = json.loads(config)
1074                 except:
1075                         self._downloader.trouble(u'ERROR: unable to extract info section')
1076                         return
1077                 
1078                 # Extract title
1079                 video_title = config["video"]["title"]
1080
1081                 # Extract uploader
1082                 video_uploader = config["video"]["owner"]["name"]
1083
1084                 # Extract video thumbnail
1085                 video_thumbnail = config["video"]["thumbnail"]
1086
1087                 # Extract video description
1088                 video_description = get_element_by_id("description", webpage.decode('utf8'))
1089                 if video_description: video_description = clean_html(video_description)
1090                 else: video_description = ''
1091
1092                 # Extract upload date
1093                 video_upload_date = u'NA'
1094                 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1095                 if mobj is not None:
1096                         video_upload_date = mobj.group(1)
1097
1098                 # Vimeo specific: extract request signature and timestamp
1099                 sig = config['request']['signature']
1100                 timestamp = config['request']['timestamp']
1101
1102                 # Vimeo specific: extract video codec and quality information
1103                 # First consider quality, then codecs, then take everything
1104                 # TODO bind to format param
1105                 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1106                 files = { 'hd': [], 'sd': [], 'other': []}
1107                 for codec_name, codec_extension in codecs:
1108                         if codec_name in config["video"]["files"]:
1109                                 if 'hd' in config["video"]["files"][codec_name]:
1110                                         files['hd'].append((codec_name, codec_extension, 'hd'))
1111                                 elif 'sd' in config["video"]["files"][codec_name]:
1112                                         files['sd'].append((codec_name, codec_extension, 'sd'))
1113                                 else:
1114                                         files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1115
1116                 for quality in ('hd', 'sd', 'other'):
1117                         if len(files[quality]) > 0:
1118                                 video_quality = files[quality][0][2]
1119                                 video_codec = files[quality][0][0]
1120                                 video_extension = files[quality][0][1]
1121                                 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1122                                 break
1123                 else:
1124                         self._downloader.trouble(u'ERROR: no known codec found')
1125                         return
1126
1127                 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1128                                         %(video_id, sig, timestamp, video_quality, video_codec.upper())
1129
1130                 return [{
1131                         'id':           video_id,
1132                         'url':          video_url,
1133                         'uploader':     video_uploader,
1134                         'upload_date':  video_upload_date,
1135                         'title':        video_title,
1136                         'ext':          video_extension,
1137                         'thumbnail':    video_thumbnail,
1138                         'description':  video_description,
1139                         'player_url':   None,
1140                 }]
1141
1142
1143 class GenericIE(InfoExtractor):
1144         """Generic last-resort information extractor."""
1145
1146         _VALID_URL = r'.*'
1147         IE_NAME = u'generic'
1148
	def __init__(self, downloader=None):
		# Plain pass-through to the base class; no generic-specific state.
		InfoExtractor.__init__(self, downloader)
1151
	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		# Warn first: the generic extractor is a heuristic last resort.
		self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
		self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1156
	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1160
	def report_following_redirect(self, new_url):
		"""Report that a redirect (e.g. a URL shortener) is being followed."""
		self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1164                 
1165         def _test_redirect(self, url):
1166                 """Check if it is a redirect, like url shorteners, in case restart chain."""
1167                 class HeadRequest(urllib2.Request):
1168                         def get_method(self):
1169                                 return "HEAD"
1170
1171                 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1172                         """
1173                         Subclass the HTTPRedirectHandler to make it use our 
1174                         HeadRequest also on the redirected URL
1175                         """
1176                         def redirect_request(self, req, fp, code, msg, headers, newurl): 
1177                                 if code in (301, 302, 303, 307):
1178                                         newurl = newurl.replace(' ', '%20') 
1179                                         newheaders = dict((k,v) for k,v in req.headers.items()
1180                                                                           if k.lower() not in ("content-length", "content-type"))
1181                                         return HeadRequest(newurl, 
1182                                                                            headers=newheaders,
1183                                                                            origin_req_host=req.get_origin_req_host(), 
1184                                                                            unverifiable=True) 
1185                                 else: 
1186                                         raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp) 
1187
1188                 class HTTPMethodFallback(urllib2.BaseHandler):
1189                         """
1190                         Fallback to GET if HEAD is not allowed (405 HTTP error)
1191                         """
1192                         def http_error_405(self, req, fp, code, msg, headers): 
1193                                 fp.read()
1194                                 fp.close()
1195
1196                                 newheaders = dict((k,v) for k,v in req.headers.items()
1197                                                                   if k.lower() not in ("content-length", "content-type"))
1198                                 return self.parent.open(urllib2.Request(req.get_full_url(), 
1199                                                                                                  headers=newheaders, 
1200                                                                                                  origin_req_host=req.get_origin_req_host(), 
1201                                                                                                  unverifiable=True))
1202
1203                 # Build our opener
1204                 opener = urllib2.OpenerDirector() 
1205                 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1206                                                 HTTPMethodFallback, HEADRedirectHandler,
1207                                                 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1208                         opener.add_handler(handler())
1209
1210                 response = opener.open(HeadRequest(url))
1211                 new_url = response.geturl()
1212                 
1213                 if url == new_url: return False
1214                 
1215                 self.report_following_redirect(new_url)
1216                 self._downloader.download([new_url])
1217                 return True
1218
1219         def _real_extract(self, url):
1220                 if self._test_redirect(url): return
1221
1222                 video_id = url.split('/')[-1]
1223                 request = urllib2.Request(url)
1224                 try:
1225                         self.report_download_webpage(video_id)
1226                         webpage = urllib2.urlopen(request).read()
1227                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1228                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1229                         return
1230                 except ValueError, err:
1231                         # since this is the last-resort InfoExtractor, if
1232                         # this error is thrown, it'll be thrown here
1233                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1234                         return
1235
1236                 self.report_extraction(video_id)
1237                 # Start with something easy: JW Player in SWFObject
1238                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1239                 if mobj is None:
1240                         # Broaden the search a little bit
1241                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1242                 if mobj is None:
1243                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1244                         return
1245
1246                 # It's possible that one of the regexes
1247                 # matched, but returned an empty group:
1248                 if mobj.group(1) is None:
1249                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1250                         return
1251
1252                 video_url = urllib.unquote(mobj.group(1))
1253                 video_id = os.path.basename(video_url)
1254
1255                 # here's a fun little line of code for you:
1256                 video_extension = os.path.splitext(video_id)[1][1:]
1257                 video_id = os.path.splitext(video_id)[0]
1258
1259                 # it's tempting to parse this further, but you would
1260                 # have to take into account all the variations like
1261                 #   Video Title - Site Name
1262                 #   Site Name | Video Title
1263                 #   Video Title - Tagline | Site Name
1264                 # and so on and so forth; it's just not practical
1265                 mobj = re.search(r'<title>(.*)</title>', webpage)
1266                 if mobj is None:
1267                         self._downloader.trouble(u'ERROR: unable to extract title')
1268                         return
1269                 video_title = mobj.group(1).decode('utf-8')
1270
1271                 # video uploader is domain name
1272                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1273                 if mobj is None:
1274                         self._downloader.trouble(u'ERROR: unable to extract title')
1275                         return
1276                 video_uploader = mobj.group(1).decode('utf-8')
1277
1278                 return [{
1279                         'id':           video_id.decode('utf-8'),
1280                         'url':          video_url.decode('utf-8'),
1281                         'uploader':     video_uploader,
1282                         'upload_date':  u'NA',
1283                         'title':        video_title,
1284                         'ext':          video_extension.decode('utf-8'),
1285                         'player_url':   None,
1286                 }]
1287
1288
1289 class YoutubeSearchIE(InfoExtractor):
1290         """Information Extractor for YouTube search queries."""
1291         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1292         _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1293         _max_youtube_results = 1000
1294         IE_NAME = u'youtube:search'
1295
1296         def __init__(self, downloader=None):
1297                 InfoExtractor.__init__(self, downloader)
1298
1299         def report_download_page(self, query, pagenum):
1300                 """Report attempt to download search page with given number."""
1301                 query = query.decode(preferredencoding())
1302                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1303
1304         def _real_extract(self, query):
1305                 mobj = re.match(self._VALID_URL, query)
1306                 if mobj is None:
1307                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1308                         return
1309
1310                 prefix, query = query.split(':')
1311                 prefix = prefix[8:]
1312                 query = query.encode('utf-8')
1313                 if prefix == '':
1314                         self._download_n_results(query, 1)
1315                         return
1316                 elif prefix == 'all':
1317                         self._download_n_results(query, self._max_youtube_results)
1318                         return
1319                 else:
1320                         try:
1321                                 n = long(prefix)
1322                                 if n <= 0:
1323                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1324                                         return
1325                                 elif n > self._max_youtube_results:
1326                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1327                                         n = self._max_youtube_results
1328                                 self._download_n_results(query, n)
1329                                 return
1330                         except ValueError: # parsing prefix as integer fails
1331                                 self._download_n_results(query, 1)
1332                                 return
1333
1334         def _download_n_results(self, query, n):
1335                 """Downloads a specified number of results for a query"""
1336
1337                 video_ids = []
1338                 pagenum = 0
1339                 limit = n
1340
1341                 while (50 * pagenum) < limit:
1342                         self.report_download_page(query, pagenum+1)
1343                         result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1344                         request = urllib2.Request(result_url)
1345                         try:
1346                                 data = urllib2.urlopen(request).read()
1347                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1348                                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1349                                 return
1350                         api_response = json.loads(data)['data']
1351
1352                         new_ids = list(video['id'] for video in api_response['items'])
1353                         video_ids += new_ids
1354
1355                         limit = min(n, api_response['totalItems'])
1356                         pagenum += 1
1357
1358                 if len(video_ids) > n:
1359                         video_ids = video_ids[:n]
1360                 for id in video_ids:
1361                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1362                 return
1363
1364
1365 class GoogleSearchIE(InfoExtractor):
1366         """Information Extractor for Google Video search queries."""
1367         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1368         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1369         _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1370         _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1371         _max_google_results = 1000
1372         IE_NAME = u'video.google:search'
1373
1374         def __init__(self, downloader=None):
1375                 InfoExtractor.__init__(self, downloader)
1376
1377         def report_download_page(self, query, pagenum):
1378                 """Report attempt to download playlist page with given number."""
1379                 query = query.decode(preferredencoding())
1380                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1381
1382         def _real_extract(self, query):
1383                 mobj = re.match(self._VALID_URL, query)
1384                 if mobj is None:
1385                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1386                         return
1387
1388                 prefix, query = query.split(':')
1389                 prefix = prefix[8:]
1390                 query = query.encode('utf-8')
1391                 if prefix == '':
1392                         self._download_n_results(query, 1)
1393                         return
1394                 elif prefix == 'all':
1395                         self._download_n_results(query, self._max_google_results)
1396                         return
1397                 else:
1398                         try:
1399                                 n = long(prefix)
1400                                 if n <= 0:
1401                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1402                                         return
1403                                 elif n > self._max_google_results:
1404                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1405                                         n = self._max_google_results
1406                                 self._download_n_results(query, n)
1407                                 return
1408                         except ValueError: # parsing prefix as integer fails
1409                                 self._download_n_results(query, 1)
1410                                 return
1411
1412         def _download_n_results(self, query, n):
1413                 """Downloads a specified number of results for a query"""
1414
1415                 video_ids = []
1416                 pagenum = 0
1417
1418                 while True:
1419                         self.report_download_page(query, pagenum)
1420                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1421                         request = urllib2.Request(result_url)
1422                         try:
1423                                 page = urllib2.urlopen(request).read()
1424                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1425                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1426                                 return
1427
1428                         # Extract video identifiers
1429                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1430                                 video_id = mobj.group(1)
1431                                 if video_id not in video_ids:
1432                                         video_ids.append(video_id)
1433                                         if len(video_ids) == n:
1434                                                 # Specified n videos reached
1435                                                 for id in video_ids:
1436                                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1437                                                 return
1438
1439                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1440                                 for id in video_ids:
1441                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1442                                 return
1443
1444                         pagenum = pagenum + 1
1445
1446
1447 class YahooSearchIE(InfoExtractor):
1448         """Information Extractor for Yahoo! Video search queries."""
1449         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1450         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1451         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1452         _MORE_PAGES_INDICATOR = r'\s*Next'
1453         _max_yahoo_results = 1000
1454         IE_NAME = u'video.yahoo:search'
1455
1456         def __init__(self, downloader=None):
1457                 InfoExtractor.__init__(self, downloader)
1458
1459         def report_download_page(self, query, pagenum):
1460                 """Report attempt to download playlist page with given number."""
1461                 query = query.decode(preferredencoding())
1462                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1463
1464         def _real_extract(self, query):
1465                 mobj = re.match(self._VALID_URL, query)
1466                 if mobj is None:
1467                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1468                         return
1469
1470                 prefix, query = query.split(':')
1471                 prefix = prefix[8:]
1472                 query = query.encode('utf-8')
1473                 if prefix == '':
1474                         self._download_n_results(query, 1)
1475                         return
1476                 elif prefix == 'all':
1477                         self._download_n_results(query, self._max_yahoo_results)
1478                         return
1479                 else:
1480                         try:
1481                                 n = long(prefix)
1482                                 if n <= 0:
1483                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1484                                         return
1485                                 elif n > self._max_yahoo_results:
1486                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1487                                         n = self._max_yahoo_results
1488                                 self._download_n_results(query, n)
1489                                 return
1490                         except ValueError: # parsing prefix as integer fails
1491                                 self._download_n_results(query, 1)
1492                                 return
1493
1494         def _download_n_results(self, query, n):
1495                 """Downloads a specified number of results for a query"""
1496
1497                 video_ids = []
1498                 already_seen = set()
1499                 pagenum = 1
1500
1501                 while True:
1502                         self.report_download_page(query, pagenum)
1503                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1504                         request = urllib2.Request(result_url)
1505                         try:
1506                                 page = urllib2.urlopen(request).read()
1507                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1508                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1509                                 return
1510
1511                         # Extract video identifiers
1512                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1513                                 video_id = mobj.group(1)
1514                                 if video_id not in already_seen:
1515                                         video_ids.append(video_id)
1516                                         already_seen.add(video_id)
1517                                         if len(video_ids) == n:
1518                                                 # Specified n videos reached
1519                                                 for id in video_ids:
1520                                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1521                                                 return
1522
1523                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1524                                 for id in video_ids:
1525                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1526                                 return
1527
1528                         pagenum = pagenum + 1
1529
1530
1531 class YoutubePlaylistIE(InfoExtractor):
1532         """Information Extractor for YouTube playlists."""
1533
1534         _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1535         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1536         _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
1537         _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1538         IE_NAME = u'youtube:playlist'
1539
1540         def __init__(self, downloader=None):
1541                 InfoExtractor.__init__(self, downloader)
1542
1543         def report_download_page(self, playlist_id, pagenum):
1544                 """Report attempt to download playlist page with given number."""
1545                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1546
1547         def _real_extract(self, url):
1548                 # Extract playlist id
1549                 mobj = re.match(self._VALID_URL, url)
1550                 if mobj is None:
1551                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1552                         return
1553
1554                 # Single video case
1555                 if mobj.group(3) is not None:
1556                         self._downloader.download([mobj.group(3)])
1557                         return
1558
1559                 # Download playlist pages
1560                 # prefix is 'p' as default for playlists but there are other types that need extra care
1561                 playlist_prefix = mobj.group(1)
1562                 if playlist_prefix == 'a':
1563                         playlist_access = 'artist'
1564                 else:
1565                         playlist_prefix = 'p'
1566                         playlist_access = 'view_play_list'
1567                 playlist_id = mobj.group(2)
1568                 video_ids = []
1569                 pagenum = 1
1570
1571                 while True:
1572                         self.report_download_page(playlist_id, pagenum)
1573                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1574                         request = urllib2.Request(url)
1575                         try:
1576                                 page = urllib2.urlopen(request).read()
1577                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1578                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1579                                 return
1580
1581                         # Extract video identifiers
1582                         ids_in_page = []
1583                         for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1584                                 if mobj.group(1) not in ids_in_page:
1585                                         ids_in_page.append(mobj.group(1))
1586                         video_ids.extend(ids_in_page)
1587
1588                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1589                                 break
1590                         pagenum = pagenum + 1
1591
1592                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1593                 playlistend = self._downloader.params.get('playlistend', -1)
1594                 if playlistend == -1:
1595                         video_ids = video_ids[playliststart:]
1596                 else:
1597                         video_ids = video_ids[playliststart:playlistend]
1598
1599                 for id in video_ids:
1600                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1601                 return
1602
1603
1604 class YoutubeChannelIE(InfoExtractor):
1605         """Information Extractor for YouTube channels."""
1606
1607         _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1608         _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1609         _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
1610         IE_NAME = u'youtube:channel'
1611
1612         def report_download_page(self, channel_id, pagenum):
1613                 """Report attempt to download channel page with given number."""
1614                 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1615
1616         def _real_extract(self, url):
1617                 # Extract channel id
1618                 mobj = re.match(self._VALID_URL, url)
1619                 if mobj is None:
1620                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1621                         return
1622
1623                 # Download channel pages
1624                 channel_id = mobj.group(1)
1625                 video_ids = []
1626                 pagenum = 1
1627
1628                 while True:
1629                         self.report_download_page(channel_id, pagenum)
1630                         url = self._TEMPLATE_URL % (channel_id, pagenum)
1631                         request = urllib2.Request(url)
1632                         try:
1633                                 page = urllib2.urlopen(request).read()
1634                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1635                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1636                                 return
1637
1638                         # Extract video identifiers
1639                         ids_in_page = []
1640                         for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1641                                 if mobj.group(1) not in ids_in_page:
1642                                         ids_in_page.append(mobj.group(1))
1643                         video_ids.extend(ids_in_page)
1644
1645                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1646                                 break
1647                         pagenum = pagenum + 1
1648
1649                 for id in video_ids:
1650                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1651                 return
1652
1653
1654 class YoutubeUserIE(InfoExtractor):
1655         """Information Extractor for YouTube users."""
1656
1657         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1658         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1659         _GDATA_PAGE_SIZE = 50
1660         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1661         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1662         IE_NAME = u'youtube:user'
1663
1664         def __init__(self, downloader=None):
1665                 InfoExtractor.__init__(self, downloader)
1666
1667         def report_download_page(self, username, start_index):
1668                 """Report attempt to download user page."""
1669                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1670                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1671
1672         def _real_extract(self, url):
1673                 # Extract username
1674                 mobj = re.match(self._VALID_URL, url)
1675                 if mobj is None:
1676                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1677                         return
1678
1679                 username = mobj.group(1)
1680
1681                 # Download video ids using YouTube Data API. Result size per
1682                 # query is limited (currently to 50 videos) so we need to query
1683                 # page by page until there are no video ids - it means we got
1684                 # all of them.
1685
1686                 video_ids = []
1687                 pagenum = 0
1688
1689                 while True:
1690                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1691                         self.report_download_page(username, start_index)
1692
1693                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1694
1695                         try:
1696                                 page = urllib2.urlopen(request).read()
1697                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1698                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1699                                 return
1700
1701                         # Extract video identifiers
1702                         ids_in_page = []
1703
1704                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1705                                 if mobj.group(1) not in ids_in_page:
1706                                         ids_in_page.append(mobj.group(1))
1707
1708                         video_ids.extend(ids_in_page)
1709
1710                         # A little optimization - if current page is not
1711                         # "full", ie. does not contain PAGE_SIZE video ids then
1712                         # we can assume that this page is the last one - there
1713                         # are no more ids on further pages - no need to query
1714                         # again.
1715
1716                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1717                                 break
1718
1719                         pagenum += 1
1720
1721                 all_ids_count = len(video_ids)
1722                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1723                 playlistend = self._downloader.params.get('playlistend', -1)
1724
1725                 if playlistend == -1:
1726                         video_ids = video_ids[playliststart:]
1727                 else:
1728                         video_ids = video_ids[playliststart:playlistend]
1729
1730                 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1731                                 (username, all_ids_count, len(video_ids)))
1732
1733                 for video_id in video_ids:
1734                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1735
1736
1737 class BlipTVUserIE(InfoExtractor):
1738         """Information Extractor for blip.tv users."""
1739
1740         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1741         _PAGE_SIZE = 12
1742         IE_NAME = u'blip.tv:user'
1743
1744         def __init__(self, downloader=None):
1745                 InfoExtractor.__init__(self, downloader)
1746
1747         def report_download_page(self, username, pagenum):
1748                 """Report attempt to download user page."""
1749                 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1750                                 (self.IE_NAME, username, pagenum))
1751
1752         def _real_extract(self, url):
1753                 # Extract username
1754                 mobj = re.match(self._VALID_URL, url)
1755                 if mobj is None:
1756                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1757                         return
1758
1759                 username = mobj.group(1)
1760
1761                 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1762
1763                 request = urllib2.Request(url)
1764
1765                 try:
1766                         page = urllib2.urlopen(request).read().decode('utf-8')
1767                         mobj = re.search(r'data-users-id="([^"]+)"', page)
1768                         page_base = page_base % mobj.group(1)
1769                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1770                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1771                         return
1772
1773
1774                 # Download video ids using BlipTV Ajax calls. Result size per
1775                 # query is limited (currently to 12 videos) so we need to query
1776                 # page by page until there are no video ids - it means we got
1777                 # all of them.
1778
1779                 video_ids = []
1780                 pagenum = 1
1781
1782                 while True:
1783                         self.report_download_page(username, pagenum)
1784
1785                         request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1786
1787                         try:
1788                                 page = urllib2.urlopen(request).read().decode('utf-8')
1789                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1790                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1791                                 return
1792
1793                         # Extract video identifiers
1794                         ids_in_page = []
1795
1796                         for mobj in re.finditer(r'href="/([^"]+)"', page):
1797                                 if mobj.group(1) not in ids_in_page:
1798                                         ids_in_page.append(unescapeHTML(mobj.group(1)))
1799
1800                         video_ids.extend(ids_in_page)
1801
1802                         # A little optimization - if current page is not
1803                         # "full", ie. does not contain PAGE_SIZE video ids then
1804                         # we can assume that this page is the last one - there
1805                         # are no more ids on further pages - no need to query
1806                         # again.
1807
1808                         if len(ids_in_page) < self._PAGE_SIZE:
1809                                 break
1810
1811                         pagenum += 1
1812
1813                 all_ids_count = len(video_ids)
1814                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1815                 playlistend = self._downloader.params.get('playlistend', -1)
1816
1817                 if playlistend == -1:
1818                         video_ids = video_ids[playliststart:]
1819                 else:
1820                         video_ids = video_ids[playliststart:playlistend]
1821
1822                 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1823                                 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1824
1825                 for video_id in video_ids:
1826                         self._downloader.download([u'http://blip.tv/'+video_id])
1827
1828
1829 class DepositFilesIE(InfoExtractor):
1830         """Information extractor for depositfiles.com"""
1831
1832         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1833         IE_NAME = u'DepositFiles'
1834
1835         def __init__(self, downloader=None):
1836                 InfoExtractor.__init__(self, downloader)
1837
1838         def report_download_webpage(self, file_id):
1839                 """Report webpage download."""
1840                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1841
1842         def report_extraction(self, file_id):
1843                 """Report information extraction."""
1844                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1845
1846         def _real_extract(self, url):
1847                 file_id = url.split('/')[-1]
1848                 # Rebuild url in english locale
1849                 url = 'http://depositfiles.com/en/files/' + file_id
1850
1851                 # Retrieve file webpage with 'Free download' button pressed
1852                 free_download_indication = { 'gateway_result' : '1' }
1853                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1854                 try:
1855                         self.report_download_webpage(file_id)
1856                         webpage = urllib2.urlopen(request).read()
1857                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1858                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1859                         return
1860
1861                 # Search for the real file URL
1862                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1863                 if (mobj is None) or (mobj.group(1) is None):
1864                         # Try to figure out reason of the error.
1865                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1866                         if (mobj is not None) and (mobj.group(1) is not None):
1867                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1868                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1869                         else:
1870                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1871                         return
1872
1873                 file_url = mobj.group(1)
1874                 file_extension = os.path.splitext(file_url)[1][1:]
1875
1876                 # Search for file title
1877                 mobj = re.search(r'<b title="(.*?)">', webpage)
1878                 if mobj is None:
1879                         self._downloader.trouble(u'ERROR: unable to extract title')
1880                         return
1881                 file_title = mobj.group(1).decode('utf-8')
1882
1883                 return [{
1884                         'id':           file_id.decode('utf-8'),
1885                         'url':          file_url.decode('utf-8'),
1886                         'uploader':     u'NA',
1887                         'upload_date':  u'NA',
1888                         'title':        file_title,
1889                         'ext':          file_extension.decode('utf-8'),
1890                         'player_url':   None,
1891                 }]
1892
1893
1894 class FacebookIE(InfoExtractor):
1895         """Information Extractor for Facebook"""
1896
1897         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1898         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1899         _NETRC_MACHINE = 'facebook'
1900         _available_formats = ['video', 'highqual', 'lowqual']
1901         _video_extensions = {
1902                 'video': 'mp4',
1903                 'highqual': 'mp4',
1904                 'lowqual': 'mp4',
1905         }
1906         IE_NAME = u'facebook'
1907
1908         def __init__(self, downloader=None):
1909                 InfoExtractor.__init__(self, downloader)
1910
1911         def _reporter(self, message):
1912                 """Add header and report message."""
1913                 self._downloader.to_screen(u'[facebook] %s' % message)
1914
1915         def report_login(self):
1916                 """Report attempt to log in."""
1917                 self._reporter(u'Logging in')
1918
1919         def report_video_webpage_download(self, video_id):
1920                 """Report attempt to download video webpage."""
1921                 self._reporter(u'%s: Downloading video webpage' % video_id)
1922
1923         def report_information_extraction(self, video_id):
1924                 """Report attempt to extract video information."""
1925                 self._reporter(u'%s: Extracting video information' % video_id)
1926
1927         def _parse_page(self, video_webpage):
1928                 """Extract video information from page"""
1929                 # General data
1930                 data = {'title': r'\("video_title", "(.*?)"\)',
1931                         'description': r'<div class="datawrap">(.*?)</div>',
1932                         'owner': r'\("video_owner_name", "(.*?)"\)',
1933                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1934                         }
1935                 video_info = {}
1936                 for piece in data.keys():
1937                         mobj = re.search(data[piece], video_webpage)
1938                         if mobj is not None:
1939                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1940
1941                 # Video urls
1942                 video_urls = {}
1943                 for fmt in self._available_formats:
1944                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1945                         if mobj is not None:
1946                                 # URL is in a Javascript segment inside an escaped Unicode format within
1947                                 # the generally utf-8 page
1948                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1949                 video_info['video_urls'] = video_urls
1950
1951                 return video_info
1952
1953         def _real_initialize(self):
1954                 if self._downloader is None:
1955                         return
1956
1957                 useremail = None
1958                 password = None
1959                 downloader_params = self._downloader.params
1960
1961                 # Attempt to use provided username and password or .netrc data
1962                 if downloader_params.get('username', None) is not None:
1963                         useremail = downloader_params['username']
1964                         password = downloader_params['password']
1965                 elif downloader_params.get('usenetrc', False):
1966                         try:
1967                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1968                                 if info is not None:
1969                                         useremail = info[0]
1970                                         password = info[2]
1971                                 else:
1972                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1973                         except (IOError, netrc.NetrcParseError), err:
1974                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
1975                                 return
1976
1977                 if useremail is None:
1978                         return
1979
1980                 # Log in
1981                 login_form = {
1982                         'email': useremail,
1983                         'pass': password,
1984                         'login': 'Log+In'
1985                         }
1986                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1987                 try:
1988                         self.report_login()
1989                         login_results = urllib2.urlopen(request).read()
1990                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1991                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1992                                 return
1993                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1994                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
1995                         return
1996
1997         def _real_extract(self, url):
1998                 mobj = re.match(self._VALID_URL, url)
1999                 if mobj is None:
2000                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2001                         return
2002                 video_id = mobj.group('ID')
2003
2004                 # Get video webpage
2005                 self.report_video_webpage_download(video_id)
2006                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2007                 try:
2008                         page = urllib2.urlopen(request)
2009                         video_webpage = page.read()
2010                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2011                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2012                         return
2013
2014                 # Start extracting information
2015                 self.report_information_extraction(video_id)
2016
2017                 # Extract information
2018                 video_info = self._parse_page(video_webpage)
2019
2020                 # uploader
2021                 if 'owner' not in video_info:
2022                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2023                         return
2024                 video_uploader = video_info['owner']
2025
2026                 # title
2027                 if 'title' not in video_info:
2028                         self._downloader.trouble(u'ERROR: unable to extract video title')
2029                         return
2030                 video_title = video_info['title']
2031                 video_title = video_title.decode('utf-8')
2032
2033                 # thumbnail image
2034                 if 'thumbnail' not in video_info:
2035                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2036                         video_thumbnail = ''
2037                 else:
2038                         video_thumbnail = video_info['thumbnail']
2039
2040                 # upload date
2041                 upload_date = u'NA'
2042                 if 'upload_date' in video_info:
2043                         upload_time = video_info['upload_date']
2044                         timetuple = email.utils.parsedate_tz(upload_time)
2045                         if timetuple is not None:
2046                                 try:
2047                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2048                                 except:
2049                                         pass
2050
2051                 # description
2052                 video_description = video_info.get('description', 'No description available.')
2053
2054                 url_map = video_info['video_urls']
2055                 if len(url_map.keys()) > 0:
2056                         # Decide which formats to download
2057                         req_format = self._downloader.params.get('format', None)
2058                         format_limit = self._downloader.params.get('format_limit', None)
2059
2060                         if format_limit is not None and format_limit in self._available_formats:
2061                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2062                         else:
2063                                 format_list = self._available_formats
2064                         existing_formats = [x for x in format_list if x in url_map]
2065                         if len(existing_formats) == 0:
2066                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2067                                 return
2068                         if req_format is None:
2069                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2070                         elif req_format == 'worst':
2071                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2072                         elif req_format == '-1':
2073                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2074                         else:
2075                                 # Specific format
2076                                 if req_format not in url_map:
2077                                         self._downloader.trouble(u'ERROR: requested format not available')
2078                                         return
2079                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2080
2081                 results = []
2082                 for format_param, video_real_url in video_url_list:
2083                         # Extension
2084                         video_extension = self._video_extensions.get(format_param, 'mp4')
2085
2086                         results.append({
2087                                 'id':           video_id.decode('utf-8'),
2088                                 'url':          video_real_url.decode('utf-8'),
2089                                 'uploader':     video_uploader.decode('utf-8'),
2090                                 'upload_date':  upload_date,
2091                                 'title':        video_title,
2092                                 'ext':          video_extension.decode('utf-8'),
2093                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2094                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2095                                 'description':  video_description.decode('utf-8'),
2096                                 'player_url':   None,
2097                         })
2098                 return results
2099
class BlipTVIE(InfoExtractor):
        """Information extractor for blip.tv.

        Fetches the JSON representation of a blip.tv page and builds an info
        dict from it; if the URL turns out to serve a raw video file directly,
        a minimal info dict is built from the URL instead.
        """

        _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
        # Used to pull the filename extension off the media URL.
        _URL_EXT = r'^.*\.([a-z0-9]+)$'
        IE_NAME = u'blip.tv'

        def report_extraction(self, file_id):
                """Report information extraction."""
                self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

        def report_direct_download(self, title):
                """Report information extraction."""
                self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

        def _real_extract(self, url):
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
                        return

                # Append the JSON query parameters, respecting any existing query
                # string in the URL.
                if '?' in url:
                        cchar = '&'
                else:
                        cchar = '?'
                json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
                request = urllib2.Request(json_url.encode('utf-8'))
                self.report_extraction(mobj.group(1))
                info = None
                try:
                        urlh = urllib2.urlopen(request)
                        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                                # The response is the video itself, not JSON: derive
                                # id/title/ext from the URL's basename and hand the
                                # open handle to the downloader via 'urlhandle'.
                                basename = url.split('/')[-1]
                                title,ext = os.path.splitext(basename)
                                title = title.decode('UTF-8')
                                ext = ext.replace('.', '')
                                self.report_direct_download(title)
                                info = {
                                        'id': title,
                                        'url': url,
                                        'title': title,
                                        'ext': ext,
                                        'urlhandle': urlh
                                }
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                        return
                if info is None: # Regular URL
                        try:
                                # NOTE: reuses the handle opened in the try above;
                                # it holds the JSON body at this point.
                                json_code = urlh.read()
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                                return

                        try:
                                json_data = json.loads(json_code)
                                # Some responses wrap the payload in a 'Post' object.
                                if 'Post' in json_data:
                                        data = json_data['Post']
                                else:
                                        data = json_data

                                # NOTE(review): '%H' (24-hour) combined with '%p'
                                # looks odd -- confirm against actual datestamp values.
                                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                                video_url = data['media']['url']
                                umobj = re.match(self._URL_EXT, video_url)
                                if umobj is None:
                                        raise ValueError('Can not determine filename extension')
                                ext = umobj.group(1)

                                info = {
                                        'id': data['item_id'],
                                        'url': video_url,
                                        'uploader': data['display_name'],
                                        'upload_date': upload_date,
                                        'title': data['title'],
                                        'ext': ext,
                                        'format': data['media']['mimeType'],
                                        'thumbnail': data['thumbnailUrl'],
                                        'description': data['description'],
                                        'player_url': data['embedUrl']
                                }
                        except (ValueError,KeyError), err:
                                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                                return

                # NOTE(review): globally overrides the User-Agent for the actual
                # download -- presumably blip.tv serves the plain file to the
                # iTunes client; confirm before changing.
                std_headers['User-Agent'] = 'iTunes/10.6.1'
                return [info]
2186
2187
2188 class MyVideoIE(InfoExtractor):
2189         """Information Extractor for myvideo.de."""
2190
2191         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2192         IE_NAME = u'myvideo'
2193
2194         def __init__(self, downloader=None):
2195                 InfoExtractor.__init__(self, downloader)
2196         
2197         def report_download_webpage(self, video_id):
2198                 """Report webpage download."""
2199                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2200
2201         def report_extraction(self, video_id):
2202                 """Report information extraction."""
2203                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2204
2205         def _real_extract(self,url):
2206                 mobj = re.match(self._VALID_URL, url)
2207                 if mobj is None:
2208                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2209                         return
2210
2211                 video_id = mobj.group(1)
2212
2213                 # Get video webpage
2214                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2215                 try:
2216                         self.report_download_webpage(video_id)
2217                         webpage = urllib2.urlopen(request).read()
2218                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2219                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
2220                         return
2221
2222                 self.report_extraction(video_id)
2223                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2224                                  webpage)
2225                 if mobj is None:
2226                         self._downloader.trouble(u'ERROR: unable to extract media URL')
2227                         return
2228                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2229
2230                 mobj = re.search('<title>([^<]+)</title>', webpage)
2231                 if mobj is None:
2232                         self._downloader.trouble(u'ERROR: unable to extract title')
2233                         return
2234
2235                 video_title = mobj.group(1)
2236
2237                 return [{
2238                         'id':           video_id,
2239                         'url':          video_url,
2240                         'uploader':     u'NA',
2241                         'upload_date':  u'NA',
2242                         'title':        video_title,
2243                         'ext':          u'flv',
2244                         'player_url':   None,
2245                 }]
2246
2247 class ComedyCentralIE(InfoExtractor):
2248         """Information extractor for The Daily Show and Colbert Report """
2249
2250         _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2251         IE_NAME = u'comedycentral'
2252
2253         _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2254
2255         _video_extensions = {
2256                 '3500': 'mp4',
2257                 '2200': 'mp4',
2258                 '1700': 'mp4',
2259                 '1200': 'mp4',
2260                 '750': 'mp4',
2261                 '400': 'mp4',
2262         }
2263         _video_dimensions = {
2264                 '3500': '1280x720',
2265                 '2200': '960x540',
2266                 '1700': '768x432',
2267                 '1200': '640x360',
2268                 '750': '512x288',
2269                 '400': '384x216',
2270         }
2271
2272         def report_extraction(self, episode_id):
2273                 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2274
2275         def report_config_download(self, episode_id):
2276                 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2277
2278         def report_index_download(self, episode_id):
2279                 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2280
2281         def report_player_url(self, episode_id):
2282                 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2283
2284
2285         def _print_formats(self, formats):
2286                 print('Available formats:')
2287                 for x in formats:
2288                         print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2289
2290
2291         def _real_extract(self, url):
2292                 mobj = re.match(self._VALID_URL, url)
2293                 if mobj is None:
2294                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2295                         return
2296
2297                 if mobj.group('shortname'):
2298                         if mobj.group('shortname') in ('tds', 'thedailyshow'):
2299                                 url = u'http://www.thedailyshow.com/full-episodes/'
2300                         else:
2301                                 url = u'http://www.colbertnation.com/full-episodes/'
2302                         mobj = re.match(self._VALID_URL, url)
2303                         assert mobj is not None
2304
2305                 dlNewest = not mobj.group('episode')
2306                 if dlNewest:
2307                         epTitle = mobj.group('showname')
2308                 else:
2309                         epTitle = mobj.group('episode')
2310
2311                 req = urllib2.Request(url)
2312                 self.report_extraction(epTitle)
2313                 try:
2314                         htmlHandle = urllib2.urlopen(req)
2315                         html = htmlHandle.read()
2316                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2317                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2318                         return
2319                 if dlNewest:
2320                         url = htmlHandle.geturl()
2321                         mobj = re.match(self._VALID_URL, url)
2322                         if mobj is None:
2323                                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2324                                 return
2325                         if mobj.group('episode') == '':
2326                                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2327                                 return
2328                         epTitle = mobj.group('episode')
2329
2330                 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2331
2332                 if len(mMovieParams) == 0:
2333                         # The Colbert Report embeds the information in a without
2334                         # a URL prefix; so extract the alternate reference
2335                         # and then add the URL prefix manually.
2336
2337                         altMovieParams = re.findall('data-mgid="([^"]*episode.*?:.*?)"', html)
2338                         if len(altMovieParams) == 0:
2339                                 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2340                                 return
2341                         else:
2342                                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2343                 
2344                 playerUrl_raw = mMovieParams[0][0]
2345                 self.report_player_url(epTitle)
2346                 try:
2347                         urlHandle = urllib2.urlopen(playerUrl_raw)
2348                         playerUrl = urlHandle.geturl()
2349                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2350                         self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
2351                         return
2352
2353                 uri = mMovieParams[0][1]
2354                 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2355                 self.report_index_download(epTitle)
2356                 try:
2357                         indexXml = urllib2.urlopen(indexUrl).read()
2358                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2359                         self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2360                         return
2361
2362                 results = []
2363
2364                 idoc = xml.etree.ElementTree.fromstring(indexXml)
2365                 itemEls = idoc.findall('.//item')
2366                 for itemEl in itemEls:
2367                         mediaId = itemEl.findall('./guid')[0].text
2368                         shortMediaId = mediaId.split(':')[-1]
2369                         showId = mediaId.split(':')[-2].replace('.com', '')
2370                         officialTitle = itemEl.findall('./title')[0].text
2371                         officialDate = itemEl.findall('./pubDate')[0].text
2372
2373                         configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2374                                                 urllib.urlencode({'uri': mediaId}))
2375                         configReq = urllib2.Request(configUrl)
2376                         self.report_config_download(epTitle)
2377                         try:
2378                                 configXml = urllib2.urlopen(configReq).read()
2379                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2380                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2381                                 return
2382
2383                         cdoc = xml.etree.ElementTree.fromstring(configXml)
2384                         turls = []
2385                         for rendition in cdoc.findall('.//rendition'):
2386                                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2387                                 turls.append(finfo)
2388
2389                         if len(turls) == 0:
2390                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2391                                 continue
2392                         
2393                         if self._downloader.params.get('listformats', None):
2394                                 self._print_formats([i[0] for i in turls])
2395                                 return
2396
2397                         # For now, just pick the highest bitrate
2398                         format,video_url = turls[-1]
2399
2400                         # Get the format arg from the arg stream
2401                         req_format = self._downloader.params.get('format', None)
2402
2403                         # Select format if we can find one
2404                         for f,v in turls:
2405                                 if f == req_format:
2406                                         format, video_url = f, v
2407                                         break
2408
2409                         # Patch to download from alternative CDN, which does not
2410                         # break on current RTMPDump builds
2411                         broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
2412                         better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
2413
2414                         if video_url.startswith(broken_cdn):
2415                                 video_url = video_url.replace(broken_cdn, better_cdn)
2416
2417                         effTitle = showId + u'-' + epTitle
2418                         info = {
2419                                 'id': shortMediaId,
2420                                 'url': video_url,
2421                                 'uploader': showId,
2422                                 'upload_date': officialDate,
2423                                 'title': effTitle,
2424                                 'ext': 'mp4',
2425                                 'format': format,
2426                                 'thumbnail': None,
2427                                 'description': officialTitle,
2428                                 'player_url': None #playerUrl
2429                         }
2430
2431                         results.append(info)
2432                         
2433                 return results
2434
2435
2436 class EscapistIE(InfoExtractor):
2437         """Information extractor for The Escapist """
2438
2439         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2440         IE_NAME = u'escapist'
2441
2442         def report_extraction(self, showName):
2443                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2444
2445         def report_config_download(self, showName):
2446                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2447
2448         def _real_extract(self, url):
2449                 mobj = re.match(self._VALID_URL, url)
2450                 if mobj is None:
2451                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2452                         return
2453                 showName = mobj.group('showname')
2454                 videoId = mobj.group('episode')
2455
2456                 self.report_extraction(showName)
2457                 try:
2458                         webPage = urllib2.urlopen(url)
2459                         webPageBytes = webPage.read()
2460                         m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2461                         webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2462                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2463                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2464                         return
2465
2466                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2467                 description = unescapeHTML(descMatch.group(1))
2468                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2469                 imgUrl = unescapeHTML(imgMatch.group(1))
2470                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2471                 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2472                 configUrlMatch = re.search('config=(.*)$', playerUrl)
2473                 configUrl = urllib2.unquote(configUrlMatch.group(1))
2474
2475                 self.report_config_download(showName)
2476                 try:
2477                         configJSON = urllib2.urlopen(configUrl).read()
2478                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2479                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2480                         return
2481
2482                 # Technically, it's JavaScript, not JSON
2483                 configJSON = configJSON.replace("'", '"')
2484
2485                 try:
2486                         config = json.loads(configJSON)
2487                 except (ValueError,), err:
2488                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2489                         return
2490
2491                 playlist = config['playlist']
2492                 videoUrl = playlist[1]['url']
2493
2494                 info = {
2495                         'id': videoId,
2496                         'url': videoUrl,
2497                         'uploader': showName,
2498                         'upload_date': None,
2499                         'title': showName,
2500                         'ext': 'flv',
2501                         'thumbnail': imgUrl,
2502                         'description': description,
2503                         'player_url': playerUrl,
2504                 }
2505
2506                 return [info]
2507
2508
2509 class CollegeHumorIE(InfoExtractor):
2510         """Information extractor for collegehumor.com"""
2511
2512         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2513         IE_NAME = u'collegehumor'
2514
2515         def report_webpage(self, video_id):
2516                 """Report information extraction."""
2517                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2518
2519         def report_extraction(self, video_id):
2520                 """Report information extraction."""
2521                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2522
2523         def _real_extract(self, url):
2524                 mobj = re.match(self._VALID_URL, url)
2525                 if mobj is None:
2526                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2527                         return
2528                 video_id = mobj.group('videoid')
2529
2530                 self.report_webpage(video_id)
2531                 request = urllib2.Request(url)
2532                 try:
2533                         webpage = urllib2.urlopen(request).read()
2534                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2535                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2536                         return
2537
2538                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2539                 if m is None:
2540                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2541                         return
2542                 internal_video_id = m.group('internalvideoid')
2543
2544                 info = {
2545                         'id': video_id,
2546                         'internal_id': internal_video_id,
2547                 }
2548
2549                 self.report_extraction(video_id)
2550                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2551                 try:
2552                         metaXml = urllib2.urlopen(xmlUrl).read()
2553                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2554                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2555                         return
2556
2557                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2558                 try:
2559                         videoNode = mdoc.findall('./video')[0]
2560                         info['description'] = videoNode.findall('./description')[0].text
2561                         info['title'] = videoNode.findall('./caption')[0].text
2562                         info['url'] = videoNode.findall('./file')[0].text
2563                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2564                         info['ext'] = info['url'].rpartition('.')[2]
2565                 except IndexError:
2566                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2567                         return
2568
2569                 return [info]
2570
2571
2572 class XVideosIE(InfoExtractor):
2573         """Information extractor for xvideos.com"""
2574
2575         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2576         IE_NAME = u'xvideos'
2577
2578         def report_webpage(self, video_id):
2579                 """Report information extraction."""
2580                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2581
2582         def report_extraction(self, video_id):
2583                 """Report information extraction."""
2584                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2585
2586         def _real_extract(self, url):
2587                 mobj = re.match(self._VALID_URL, url)
2588                 if mobj is None:
2589                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2590                         return
2591                 video_id = mobj.group(1).decode('utf-8')
2592
2593                 self.report_webpage(video_id)
2594
2595                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2596                 try:
2597                         webpage = urllib2.urlopen(request).read()
2598                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2599                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2600                         return
2601
2602                 self.report_extraction(video_id)
2603
2604
2605                 # Extract video URL
2606                 mobj = re.search(r'flv_url=(.+?)&', webpage)
2607                 if mobj is None:
2608                         self._downloader.trouble(u'ERROR: unable to extract video url')
2609                         return
2610                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2611
2612
2613                 # Extract title
2614                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2615                 if mobj is None:
2616                         self._downloader.trouble(u'ERROR: unable to extract video title')
2617                         return
2618                 video_title = mobj.group(1).decode('utf-8')
2619
2620
2621                 # Extract video thumbnail
2622                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2623                 if mobj is None:
2624                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2625                         return
2626                 video_thumbnail = mobj.group(0).decode('utf-8')
2627
2628                 info = {
2629                         'id': video_id,
2630                         'url': video_url,
2631                         'uploader': None,
2632                         'upload_date': None,
2633                         'title': video_title,
2634                         'ext': 'flv',
2635                         'thumbnail': video_thumbnail,
2636                         'description': None,
2637                         'player_url': None,
2638                 }
2639
2640                 return [info]
2641
2642
2643 class SoundcloudIE(InfoExtractor):
2644         """Information extractor for soundcloud.com
2645            To access the media, the uid of the song and a stream token
2646            must be extracted from the page source and the script must make
2647            a request to media.soundcloud.com/crossdomain.xml. Then
2648            the media can be grabbed by requesting from an url composed
2649            of the stream token and uid
2650          """
2651
2652         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2653         IE_NAME = u'soundcloud'
2654
2655         def __init__(self, downloader=None):
2656                 InfoExtractor.__init__(self, downloader)
2657
2658         def report_webpage(self, video_id):
2659                 """Report information extraction."""
2660                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2661
2662         def report_extraction(self, video_id):
2663                 """Report information extraction."""
2664                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2665
2666         def _real_extract(self, url):
2667                 mobj = re.match(self._VALID_URL, url)
2668                 if mobj is None:
2669                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2670                         return
2671
2672                 # extract uploader (which is in the url)
2673                 uploader = mobj.group(1).decode('utf-8')
2674                 # extract simple title (uploader + slug of song title)
2675                 slug_title =  mobj.group(2).decode('utf-8')
2676                 simple_title = uploader + u'-' + slug_title
2677
2678                 self.report_webpage('%s/%s' % (uploader, slug_title))
2679
2680                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2681                 try:
2682                         webpage = urllib2.urlopen(request).read()
2683                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2684                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2685                         return
2686
2687                 self.report_extraction('%s/%s' % (uploader, slug_title))
2688
2689                 # extract uid and stream token that soundcloud hands out for access
2690                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2691                 if mobj:
2692                         video_id = mobj.group(1)
2693                         stream_token = mobj.group(2)
2694
2695                 # extract unsimplified title
2696                 mobj = re.search('"title":"(.*?)",', webpage)
2697                 if mobj:
2698                         title = mobj.group(1).decode('utf-8')
2699                 else:
2700                         title = simple_title
2701
2702                 # construct media url (with uid/token)
2703                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2704                 mediaURL = mediaURL % (video_id, stream_token)
2705
2706                 # description
2707                 description = u'No description available'
2708                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2709                 if mobj:
2710                         description = mobj.group(1)
2711
2712                 # upload date
2713                 upload_date = None
2714                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2715                 if mobj:
2716                         try:
2717                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2718                         except Exception, e:
2719                                 self._downloader.to_stderr(compat_str(e))
2720
2721                 # for soundcloud, a request to a cross domain is required for cookies
2722                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2723
2724                 return [{
2725                         'id':           video_id.decode('utf-8'),
2726                         'url':          mediaURL,
2727                         'uploader':     uploader.decode('utf-8'),
2728                         'upload_date':  upload_date,
2729                         'title':        title,
2730                         'ext':          u'mp3',
2731                         'player_url':   None,
2732                         'description': description.decode('utf-8')
2733                 }]
2734
2735
2736 class InfoQIE(InfoExtractor):
2737         """Information extractor for infoq.com"""
2738
2739         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2740         IE_NAME = u'infoq'
2741
2742         def report_webpage(self, video_id):
2743                 """Report information extraction."""
2744                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2745
2746         def report_extraction(self, video_id):
2747                 """Report information extraction."""
2748                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2749
2750         def _real_extract(self, url):
2751                 mobj = re.match(self._VALID_URL, url)
2752                 if mobj is None:
2753                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2754                         return
2755
2756                 self.report_webpage(url)
2757
2758                 request = urllib2.Request(url)
2759                 try:
2760                         webpage = urllib2.urlopen(request).read()
2761                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2762                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2763                         return
2764
2765                 self.report_extraction(url)
2766
2767
2768                 # Extract video URL
2769                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2770                 if mobj is None:
2771                         self._downloader.trouble(u'ERROR: unable to extract video url')
2772                         return
2773                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2774
2775
2776                 # Extract title
2777                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2778                 if mobj is None:
2779                         self._downloader.trouble(u'ERROR: unable to extract video title')
2780                         return
2781                 video_title = mobj.group(1).decode('utf-8')
2782
2783                 # Extract description
2784                 video_description = u'No description available.'
2785                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2786                 if mobj is not None:
2787                         video_description = mobj.group(1).decode('utf-8')
2788
2789                 video_filename = video_url.split('/')[-1]
2790                 video_id, extension = video_filename.split('.')
2791
2792                 info = {
2793                         'id': video_id,
2794                         'url': video_url,
2795                         'uploader': None,
2796                         'upload_date': None,
2797                         'title': video_title,
2798                         'ext': extension, # Extension is always(?) mp4, but seems to be flv
2799                         'thumbnail': None,
2800                         'description': video_description,
2801                         'player_url': None,
2802                 }
2803
2804                 return [info]
2805
2806 class MixcloudIE(InfoExtractor):
2807         """Information extractor for www.mixcloud.com"""
2808         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2809         IE_NAME = u'mixcloud'
2810
2811         def __init__(self, downloader=None):
2812                 InfoExtractor.__init__(self, downloader)
2813
2814         def report_download_json(self, file_id):
2815                 """Report JSON download."""
2816                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2817
2818         def report_extraction(self, file_id):
2819                 """Report information extraction."""
2820                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2821
2822         def get_urls(self, jsonData, fmt, bitrate='best'):
2823                 """Get urls from 'audio_formats' section in json"""
2824                 file_url = None
2825                 try:
2826                         bitrate_list = jsonData[fmt]
2827                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2828                                 bitrate = max(bitrate_list) # select highest
2829
2830                         url_list = jsonData[fmt][bitrate]
2831                 except TypeError: # we have no bitrate info.
2832                         url_list = jsonData[fmt]
2833                 return url_list
2834
2835         def check_urls(self, url_list):
2836                 """Returns 1st active url from list"""
2837                 for url in url_list:
2838                         try:
2839                                 urllib2.urlopen(url)
2840                                 return url
2841                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2842                                 url = None
2843
2844                 return None
2845
2846         def _print_formats(self, formats):
2847                 print('Available formats:')
2848                 for fmt in formats.keys():
2849                         for b in formats[fmt]:
2850                                 try:
2851                                         ext = formats[fmt][b][0]
2852                                         print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2853                                 except TypeError: # we have no bitrate info
2854                                         ext = formats[fmt][0]
2855                                         print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2856                                         break
2857
2858         def _real_extract(self, url):
2859                 mobj = re.match(self._VALID_URL, url)
2860                 if mobj is None:
2861                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2862                         return
2863                 # extract uploader & filename from url
2864                 uploader = mobj.group(1).decode('utf-8')
2865                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2866
2867                 # construct API request
2868                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2869                 # retrieve .json file with links to files
2870                 request = urllib2.Request(file_url)
2871                 try:
2872                         self.report_download_json(file_url)
2873                         jsonData = urllib2.urlopen(request).read()
2874                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2875                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2876                         return
2877
2878                 # parse JSON
2879                 json_data = json.loads(jsonData)
2880                 player_url = json_data['player_swf_url']
2881                 formats = dict(json_data['audio_formats'])
2882
2883                 req_format = self._downloader.params.get('format', None)
2884                 bitrate = None
2885
2886                 if self._downloader.params.get('listformats', None):
2887                         self._print_formats(formats)
2888                         return
2889
2890                 if req_format is None or req_format == 'best':
2891                         for format_param in formats.keys():
2892                                 url_list = self.get_urls(formats, format_param)
2893                                 # check urls
2894                                 file_url = self.check_urls(url_list)
2895                                 if file_url is not None:
2896                                         break # got it!
2897                 else:
2898                         if req_format not in formats.keys():
2899                                 self._downloader.trouble(u'ERROR: format is not available')
2900                                 return
2901
2902                         url_list = self.get_urls(formats, req_format)
2903                         file_url = self.check_urls(url_list)
2904                         format_param = req_format
2905
2906                 return [{
2907                         'id': file_id.decode('utf-8'),
2908                         'url': file_url.decode('utf-8'),
2909                         'uploader':     uploader.decode('utf-8'),
2910                         'upload_date': u'NA',
2911                         'title': json_data['name'],
2912                         'ext': file_url.split('.')[-1].decode('utf-8'),
2913                         'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2914                         'thumbnail': json_data['thumbnail_url'],
2915                         'description': json_data['description'],
2916                         'player_url': player_url.decode('utf-8'),
2917                 }]
2918
2919 class StanfordOpenClassroomIE(InfoExtractor):
2920         """Information extractor for Stanford's Open ClassRoom"""
2921
2922         _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2923         IE_NAME = u'stanfordoc'
2924
2925         def report_download_webpage(self, objid):
2926                 """Report information extraction."""
2927                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2928
2929         def report_extraction(self, video_id):
2930                 """Report information extraction."""
2931                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2932
	def _real_extract(self, url):
		"""Extract a Stanford OpenClassroom video, course, or the root catalog.

		Three URL shapes are handled:
		- course + video: returns a one-element list with the video info dict;
		- course only:    scrapes the course page and recursively extracts
		                  every linked video page;
		- neither:        scrapes the root HomePage and recursively extracts
		                  every linked course page.
		On error, calls self._downloader.trouble and returns None.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('course') and mobj.group('video'): # A specific video
			course = mobj.group('course')
			video = mobj.group('video')
			info = {
				'id': course + '_' + video,
			}

			self.report_extraction(info['id'])
			# Per-video metadata lives in an XML file next to the video files.
			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
			xmlUrl = baseUrl + video + '.xml'
			try:
				metaXml = urllib2.urlopen(xmlUrl).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
				return
			mdoc = xml.etree.ElementTree.fromstring(metaXml)
			try:
				# findall(...)[0] raises IndexError when the element is absent.
				info['title'] = mdoc.findall('./title')[0].text
				info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
			except IndexError:
				self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
				return
			# Derive the extension from the final URL suffix.
			info['ext'] = info['url'].rpartition('.')[2]
			return [info]
		elif mobj.group('course'): # A course page
			course = mobj.group('course')
			info = {
				'id': course,
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			try:
				coursepage = urllib2.urlopen(url).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			m = re.search('<h1>([^<]+)</h1>', coursepage)
			if m:
				info['title'] = unescapeHTML(m.group(1))
			else:
				# Fall back to the course id when no <h1> title is present.
				info['title'] = info['id']

			m = re.search('<description>([^<]+)</description>', coursepage)
			if m:
				info['description'] = unescapeHTML(m.group(1))

			# Collect the unique video-page links, preserving page order.
			links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
				}
					for vpage in links]
			results = []
			for entry in info['list']:
				assert entry['type'] == 'reference'
				# Recurse into each video page via the standard extract entry point.
				results += self.extract(entry['url'])
			return results
			
		else: # Root page
			info = {
				'id': 'Stanford OpenClassroom',
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
			try:
				rootpage = urllib2.urlopen(rootURL).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			info['title'] = info['id']

			# Collect the unique course-page links, preserving page order.
			links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
				}
					for cpage in links]

			results = []
			for entry in info['list']:
				assert entry['type'] == 'reference'
				# Recurse into each course page; each recursion yields its videos.
				results += self.extract(entry['url'])
			return results
3029
3030 class MTVIE(InfoExtractor):
3031         """Information extractor for MTV.com"""
3032
3033         _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3034         IE_NAME = u'mtv'
3035
3036         def report_webpage(self, video_id):
3037                 """Report information extraction."""
3038                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3039
3040         def report_extraction(self, video_id):
3041                 """Report information extraction."""
3042                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3043
3044         def _real_extract(self, url):
3045                 mobj = re.match(self._VALID_URL, url)
3046                 if mobj is None:
3047                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3048                         return
3049                 if not mobj.group('proto'):
3050                         url = 'http://' + url
3051                 video_id = mobj.group('videoid')
3052                 self.report_webpage(video_id)
3053
3054                 request = urllib2.Request(url)
3055                 try:
3056                         webpage = urllib2.urlopen(request).read()
3057                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3058                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
3059                         return
3060
3061                 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3062                 if mobj is None:
3063                         self._downloader.trouble(u'ERROR: unable to extract song name')
3064                         return
3065                 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3066                 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3067                 if mobj is None:
3068                         self._downloader.trouble(u'ERROR: unable to extract performer')
3069                         return
3070                 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3071                 video_title = performer + ' - ' + song_name 
3072
3073                 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3074                 if mobj is None:
3075                         self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3076                         return
3077                 mtvn_uri = mobj.group(1)
3078
3079                 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3080                 if mobj is None:
3081                         self._downloader.trouble(u'ERROR: unable to extract content id')
3082                         return
3083                 content_id = mobj.group(1)
3084
3085                 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3086                 self.report_extraction(video_id)
3087                 request = urllib2.Request(videogen_url)
3088                 try:
3089                         metadataXml = urllib2.urlopen(request).read()
3090                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3091                         self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3092                         return
3093
3094                 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3095                 renditions = mdoc.findall('.//rendition')
3096
3097                 # For now, always pick the highest quality.
3098                 rendition = renditions[-1]
3099
3100                 try:
3101                         _,_,ext = rendition.attrib['type'].partition('/')
3102                         format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3103                         video_url = rendition.find('./src').text
3104                 except KeyError:
3105                         self._downloader.trouble('Invalid rendition field.')
3106                         return
3107
3108                 info = {
3109                         'id': video_id,
3110                         'url': video_url,
3111                         'uploader': performer,
3112                         'title': video_title,
3113                         'ext': ext,
3114                         'format': format,
3115                 }
3116
3117                 return [info]
3118
3119
class YoukuIE(InfoExtractor):
	"""Information extractor for v.youku.com."""

	_VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
	IE_NAME = u'Youku'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, file_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

	def _gen_sid(self):
		# Session id mimicking the official player: millisecond timestamp
		# followed by two random number blocks.
		nowTime = int(time.time() * 1000)
		random1 = random.randint(1000,1998)
		random2 = random.randint(1000,9999)

		return "%d%d%d" %(nowTime,random1,random2)

	def _get_file_ID_mix_string(self, seed):
		# Derive the character permutation used to decode file ids from the
		# server-provided numeric seed (linear-congruential shuffle).
		mixed = []
		source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
		seed = float(seed)
		for i in range(len(source)):
			seed  =  (seed * 211 + 30031 ) % 65536
			index  =  math.floor(seed / 65536 * len(source) )
			mixed.append(source[int(index)])
			source.remove(source[int(index)])
		#return ''.join(mixed)
		return mixed

	def _get_file_id(self, fileId, seed):
		# Map every '*'-separated index in fileId through the seeded
		# permutation to recover the real file id.
		mixed = self._get_file_ID_mix_string(seed)
		ids = fileId.split('*')
		realId = []
		for ch in ids:
			if ch:
				realId.append(mixed[int(ch)])
		return ''.join(realId)

	def _real_extract(self, url):
		"""Extract one info dictionary per video segment.

		Youku serves videos in numbered segments; each segment becomes
		its own entry so the downloader fetches them in order. Returns
		None (after calling self._downloader.trouble) on any error.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group('ID')

		info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

		request = urllib2.Request(info_url, None, std_headers)
		try:
			self.report_download_webpage(video_id)
			jsondata = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
			return

		self.report_extraction(video_id)
		try:
			config = json.loads(jsondata)

			video_title =  config['data'][0]['title']
			seed = config['data'][0]['seed']

			format = self._downloader.params.get('format', None)
			supported_format = config['data'][0]['streamfileids'].keys()

			# Map the user's format request onto Youku's stream names.
			if format is None or format == 'best':
				if 'hd2' in supported_format:
					format = 'hd2'
				else:
					format = 'flv'
				ext = u'flv'
			elif format == 'worst':
				format = 'mp4'
				ext = u'mp4'
			else:
				format = 'flv'
				ext = u'flv'

			fileid = config['data'][0]['streamfileids'][format]
			seg_number = len(config['data'][0]['segs'][format])

			keys = []
			for i in xrange(seg_number):
				keys.append(config['data'][0]['segs'][format][i]['k'])

			#TODO check error
			#youku only could be viewed from mainland china
		except Exception:
			# Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
			# are no longer swallowed; any malformed response still becomes
			# a clean extraction error.
			self._downloader.trouble(u'ERROR: unable to extract info section')
			return

		files_info = []
		sid = self._gen_sid()
		fileid = self._get_file_id(fileid, seed)

		# Columns 8,9 of the file id encode the segment number, so
		# fileid[7:9] is replaced per segment below.
		for index, key in enumerate(keys):

			temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
			download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

			info = {
				'id': '%s_part%02d' % (video_id, index),
				'url': download_url,
				'uploader': None,
				'title': video_title,
				'ext': ext,
				# Report the actual stream name for --get-format instead of
				# falling back to the extension.
				'format': format,
			}
			files_info.append(info)

		return files_info
3239
3240
3241 class XNXXIE(InfoExtractor):
3242         """Information extractor for xnxx.com"""
3243
3244         _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
3245         IE_NAME = u'xnxx'
3246         VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3247         VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3248         VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3249
3250         def report_webpage(self, video_id):
3251                 """Report information extraction"""
3252                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3253
3254         def report_extraction(self, video_id):
3255                 """Report information extraction"""
3256                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3257
3258         def _real_extract(self, url):
3259                 mobj = re.match(self._VALID_URL, url)
3260                 if mobj is None:
3261                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3262                         return
3263                 video_id = mobj.group(1).decode('utf-8')
3264
3265                 self.report_webpage(video_id)
3266
3267                 # Get webpage content
3268                 try:
3269                         webpage = urllib2.urlopen(url).read()
3270                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3271                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3272                         return
3273
3274                 result = re.search(self.VIDEO_URL_RE, webpage)
3275                 if result is None:
3276                         self._downloader.trouble(u'ERROR: unable to extract video url')
3277                         return
3278                 video_url = urllib.unquote(result.group(1).decode('utf-8'))
3279
3280                 result = re.search(self.VIDEO_TITLE_RE, webpage)
3281                 if result is None:
3282                         self._downloader.trouble(u'ERROR: unable to extract video title')
3283                         return
3284                 video_title = result.group(1).decode('utf-8')
3285
3286                 result = re.search(self.VIDEO_THUMB_RE, webpage)
3287                 if result is None:
3288                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3289                         return
3290                 video_thumbnail = result.group(1).decode('utf-8')
3291
3292                 info = {'id': video_id,
3293                                 'url': video_url,
3294                                 'uploader': None,
3295                                 'upload_date': None,
3296                                 'title': video_title,
3297                                 'ext': 'flv',
3298                                 'thumbnail': video_thumbnail,
3299                                 'description': None,
3300                                 'player_url': None}
3301
3302                 return [info]
3303
3304
3305 class GooglePlusIE(InfoExtractor):
3306         """Information extractor for plus.google.com."""
3307
3308         _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
3309         IE_NAME = u'plus.google'
3310
3311         def __init__(self, downloader=None):
3312                 InfoExtractor.__init__(self, downloader)
3313
3314         def report_extract_entry(self, url):
3315                 """Report downloading extry"""
3316                 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))
3317
3318         def report_date(self, upload_date):
3319                 """Report downloading extry"""
3320                 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3321
3322         def report_uploader(self, uploader):
3323                 """Report downloading extry"""
3324                 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))
3325
3326         def report_title(self, video_title):
3327                 """Report downloading extry"""
3328                 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))
3329
3330         def report_extract_vid_page(self, video_page):
3331                 """Report information extraction."""
3332                 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
3333
3334         def _real_extract(self, url):
3335                 # Extract id from URL
3336                 mobj = re.match(self._VALID_URL, url)
3337                 if mobj is None:
3338                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3339                         return
3340
3341                 post_url = mobj.group(0)
3342                 video_id = mobj.group(2)
3343
3344                 video_extension = 'flv'
3345
3346                 # Step 1, Retrieve post webpage to extract further information
3347                 self.report_extract_entry(post_url)
3348                 request = urllib2.Request(post_url)
3349                 try:
3350                         webpage = urllib2.urlopen(request).read()
3351                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3352                         self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3353                         return
3354
3355                 # Extract update date
3356                 upload_date = u'NA'
3357                 pattern = 'title="Timestamp">(.*?)</a>'
3358                 mobj = re.search(pattern, webpage)
3359                 if mobj:
3360                         upload_date = mobj.group(1)
3361                         # Convert timestring to a format suitable for filename
3362                         upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3363                         upload_date = upload_date.strftime('%Y%m%d')
3364                 self.report_date(upload_date)
3365
3366                 # Extract uploader
3367                 uploader = u'NA'
3368                 pattern = r'rel\="author".*?>(.*?)</a>'
3369                 mobj = re.search(pattern, webpage)
3370                 if mobj:
3371                         uploader = mobj.group(1)
3372                 self.report_uploader(uploader)
3373
3374                 # Extract title
3375                 # Get the first line for title
3376                 video_title = u'NA'
3377                 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3378                 mobj = re.search(pattern, webpage)
3379                 if mobj:
3380                         video_title = mobj.group(1)
3381                 self.report_title(video_title)
3382
3383                 # Step 2, Stimulate clicking the image box to launch video
3384                 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3385                 mobj = re.search(pattern, webpage)
3386                 if mobj is None:
3387                         self._downloader.trouble(u'ERROR: unable to extract video page URL')
3388
3389                 video_page = mobj.group(1)
3390                 request = urllib2.Request(video_page)
3391                 try:
3392                         webpage = urllib2.urlopen(request).read()
3393                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3394                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3395                         return
3396                 self.report_extract_vid_page(video_page)
3397
3398
3399                 # Extract video links on video page
3400                 """Extract video links of all sizes"""
3401                 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3402                 mobj = re.findall(pattern, webpage)
3403                 if len(mobj) == 0:
3404                         self._downloader.trouble(u'ERROR: unable to extract video links')
3405
3406                 # Sort in resolution
3407                 links = sorted(mobj)
3408
3409                 # Choose the lowest of the sort, i.e. highest resolution
3410                 video_url = links[-1]
3411                 # Only get the url. The resolution part in the tuple has no use anymore
3412                 video_url = video_url[-1]
3413                 # Treat escaped \u0026 style hex
3414                 video_url = unicode(video_url, "unicode_escape")
3415
3416
3417                 return [{
3418                         'id':           video_id.decode('utf-8'),
3419                         'url':          video_url,
3420                         'uploader':     uploader.decode('utf-8'),
3421                         'upload_date':  upload_date.decode('utf-8'),
3422                         'title':        video_title.decode('utf-8'),
3423                         'ext':          video_extension.decode('utf-8'),
3424                         'player_url':   None,
3425                 }]