a39e865dc13966e51f7bad1314a63db7d2b8e13a
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import datetime
5 import HTMLParser
6 import httplib
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import urllib
13 import urllib2
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 from urlparse import parse_qs
19
20 try:
21         import cStringIO as StringIO
22 except ImportError:
23         import StringIO
24
25 from utils import *
26
27
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor takes a URL and produces information about
	the video(s) it points to: the real media URL, the title, the
	uploader and so on.  That information is returned as a list of
	dictionaries which the FileDownloader then acts upon (typically by
	downloading the media to disk).

	Mandatory keys in each returned dictionary:

	id:         Video identifier.
	url:        Final video URL.
	uploader:   Nickname of the video uploader.
	title:      Video title, unescaped.
	ext:        Video filename extension.

	Optional keys:

	format:         The video format, defaults to ext (used for --get-format)
	thumbnail:      Full URL to a video thumbnail image.
	description     One-line video description.
	player_url:     SWF Player URL (used for rtmpdump).

	Subclasses should override _real_initialize() and _real_extract()
	and define a _VALID_URL regular expression; they should normally
	also be registered in the list of extractors.  _real_extract() must
	return a *list* of dictionaries shaped as described above.
	"""

	# Lazily-set state: whether _real_initialize() has run yet, and the
	# FileDownloader this extractor reports to.
	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Return True if this extractor can handle the given URL."""
		return re.match(self._VALID_URL, url) is not None

	def initialize(self):
		"""Run one-time setup (authentication, etc.) at most once."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Initialize if needed, then return the extracted info dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the FileDownloader this extractor reports to."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
97
98 class YoutubeIE(InfoExtractor):
99         """Information extractor for youtube.com."""
100
101         _VALID_URL = r"""^
102                          (
103                              (?:https?://)?                                       # http(s):// (optional)
104                              (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
105                                 tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
106                              (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
107                              (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
108                              (?:                                                  # the various things that can precede the ID:
109                                  (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
110                                  |(?:                                             # or the v= param in all its forms
111                                      (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
112                                      (?:\?|\#!?)                                  # the params delimiter ? or # or #!
113                                      (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
114                                      v=
115                                  )
116                              )?                                                   # optional -> youtube.com/xxxx is OK
117                          )?                                                       # all until now is optional -> you can pass the naked ID
118                          ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
119                          (?(1).+)?                                                # if we found the ID, everything can follow
120                          $"""
121         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
122         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
123         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
124         _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
125         _NETRC_MACHINE = 'youtube'
126         # Listed in order of quality
127         _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
128         _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
129         _video_extensions = {
130                 '13': '3gp',
131                 '17': 'mp4',
132                 '18': 'mp4',
133                 '22': 'mp4',
134                 '37': 'mp4',
135                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
136                 '43': 'webm',
137                 '44': 'webm',
138                 '45': 'webm',
139                 '46': 'webm',
140         }
141         _video_dimensions = {
142                 '5': '240x400',
143                 '6': '???',
144                 '13': '???',
145                 '17': '144x176',
146                 '18': '360x640',
147                 '22': '720x1280',
148                 '34': '360x640',
149                 '35': '480x854',
150                 '37': '1080x1920',
151                 '38': '3072x4096',
152                 '43': '360x640',
153                 '44': '480x854',
154                 '45': '720x1280',
155                 '46': '1080x1920',
156         }       
157         IE_NAME = u'youtube'
158
159         def suitable(self, url):
160                 """Receives a URL and returns True if suitable for this IE."""
161                 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
162
163         def report_lang(self):
164                 """Report attempt to set language."""
165                 self._downloader.to_screen(u'[youtube] Setting language')
166
167         def report_login(self):
168                 """Report attempt to log in."""
169                 self._downloader.to_screen(u'[youtube] Logging in')
170
171         def report_age_confirmation(self):
172                 """Report attempt to confirm age."""
173                 self._downloader.to_screen(u'[youtube] Confirming age')
174
175         def report_video_webpage_download(self, video_id):
176                 """Report attempt to download video webpage."""
177                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
178
179         def report_video_info_webpage_download(self, video_id):
180                 """Report attempt to download video info webpage."""
181                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
182
183         def report_video_subtitles_download(self, video_id):
184                 """Report attempt to download video info webpage."""
185                 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
186
187         def report_information_extraction(self, video_id):
188                 """Report attempt to extract video information."""
189                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
190
191         def report_unavailable_format(self, video_id, format):
192                 """Report extracted video URL."""
193                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
194
195         def report_rtmp_download(self):
196                 """Indicate the download will use the RTMP protocol."""
197                 self._downloader.to_screen(u'[youtube] RTMP download detected')
198
199         def _closed_captions_xml_to_srt(self, xml_string):
200                 srt = ''
201                 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
202                 # TODO parse xml instead of regex
203                 for n, (start, dur_tag, dur, caption) in enumerate(texts):
204                         if not dur: dur = '4'
205                         start = float(start)
206                         end = start + float(dur)
207                         start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
208                         end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
209                         caption = unescapeHTML(caption)
210                         caption = unescapeHTML(caption) # double cycle, intentional
211                         srt += str(n+1) + '\n'
212                         srt += start + ' --> ' + end + '\n'
213                         srt += caption + '\n\n'
214                 return srt
215
216         def _print_formats(self, formats):
217                 print('Available formats:')
218                 for x in formats:
219                         print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
220
221         def _real_initialize(self):
222                 if self._downloader is None:
223                         return
224
225                 username = None
226                 password = None
227                 downloader_params = self._downloader.params
228
229                 # Attempt to use provided username and password or .netrc data
230                 if downloader_params.get('username', None) is not None:
231                         username = downloader_params['username']
232                         password = downloader_params['password']
233                 elif downloader_params.get('usenetrc', False):
234                         try:
235                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
236                                 if info is not None:
237                                         username = info[0]
238                                         password = info[2]
239                                 else:
240                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
241                         except (IOError, netrc.NetrcParseError), err:
242                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
243                                 return
244
245                 # Set language
246                 request = urllib2.Request(self._LANG_URL)
247                 try:
248                         self.report_lang()
249                         urllib2.urlopen(request).read()
250                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
251                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
252                         return
253
254                 # No authentication to be performed
255                 if username is None:
256                         return
257
258                 # Log in
259                 login_form = {
260                                 'current_form': 'loginForm',
261                                 'next':         '/',
262                                 'action_login': 'Log In',
263                                 'username':     username,
264                                 'password':     password,
265                                 }
266                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
267                 try:
268                         self.report_login()
269                         login_results = urllib2.urlopen(request).read()
270                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
271                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
272                                 return
273                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
274                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
275                         return
276
277                 # Confirm age
278                 age_form = {
279                                 'next_url':             '/',
280                                 'action_confirm':       'Confirm',
281                                 }
282                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
283                 try:
284                         self.report_age_confirmation()
285                         age_results = urllib2.urlopen(request).read()
286                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
287                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
288                         return
289
290         def _real_extract(self, url):
291                 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
292                 mobj = re.search(self._NEXT_URL_RE, url)
293                 if mobj:
294                         url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')
295
296                 # Extract video id from URL
297                 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
298                 if mobj is None:
299                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
300                         return
301                 video_id = mobj.group(2)
302
303                 # Get video webpage
304                 self.report_video_webpage_download(video_id)
305                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
306                 try:
307                         video_webpage = urllib2.urlopen(request).read()
308                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
309                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
310                         return
311
312                 # Attempt to extract SWF player URL
313                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
314                 if mobj is not None:
315                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
316                 else:
317                         player_url = None
318
319                 # Get video info
320                 self.report_video_info_webpage_download(video_id)
321                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
322                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
323                                         % (video_id, el_type))
324                         request = urllib2.Request(video_info_url)
325                         try:
326                                 video_info_webpage = urllib2.urlopen(request).read()
327                                 video_info = parse_qs(video_info_webpage)
328                                 if 'token' in video_info:
329                                         break
330                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
331                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
332                                 return
333                 if 'token' not in video_info:
334                         if 'reason' in video_info:
335                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
336                         else:
337                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
338                         return
339
340                 # Check for "rental" videos
341                 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
342                         self._downloader.trouble(u'ERROR: "rental" videos not supported')
343                         return
344
345                 # Start extracting information
346                 self.report_information_extraction(video_id)
347
348                 # uploader
349                 if 'author' not in video_info:
350                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
351                         return
352                 video_uploader = urllib.unquote_plus(video_info['author'][0])
353
354                 # title
355                 if 'title' not in video_info:
356                         self._downloader.trouble(u'ERROR: unable to extract video title')
357                         return
358                 video_title = urllib.unquote_plus(video_info['title'][0])
359                 video_title = video_title.decode('utf-8')
360
361                 # thumbnail image
362                 if 'thumbnail_url' not in video_info:
363                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
364                         video_thumbnail = ''
365                 else:   # don't panic if we can't find it
366                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
367
368                 # upload date
369                 upload_date = u'NA'
370                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
371                 if mobj is not None:
372                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
373                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
374                         for expression in format_expressions:
375                                 try:
376                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
377                                 except:
378                                         pass
379
380                 # description
381                 video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
382                 if video_description: video_description = clean_html(video_description)
383                 else: video_description = ''
384                         
385                 # closed captions
386                 video_subtitles = None
387                 if self._downloader.params.get('writesubtitles', False):
388                         try:
389                                 self.report_video_subtitles_download(video_id)
390                                 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
391                                 try:
392                                         srt_list = urllib2.urlopen(request).read()
393                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
394                                         raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
395                                 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
396                                 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
397                                 if not srt_lang_list:
398                                         raise Trouble(u'WARNING: video has no closed captions')
399                                 if self._downloader.params.get('subtitleslang', False):
400                                         srt_lang = self._downloader.params.get('subtitleslang')
401                                 elif 'en' in srt_lang_list:
402                                         srt_lang = 'en'
403                                 else:
404                                         srt_lang = srt_lang_list.keys()[0]
405                                 if not srt_lang in srt_lang_list:
406                                         raise Trouble(u'WARNING: no closed captions found in the specified language')
407                                 request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
408                                 try:
409                                         srt_xml = urllib2.urlopen(request).read()
410                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
411                                         raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
412                                 if not srt_xml:
413                                         raise Trouble(u'WARNING: unable to download video subtitles')
414                                 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
415                         except Trouble as trouble:
416                                 self._downloader.trouble(trouble[0])
417
418                 if 'length_seconds' not in video_info:
419                         self._downloader.trouble(u'WARNING: unable to extract video duration')
420                         video_duration = ''
421                 else:
422                         video_duration = urllib.unquote_plus(video_info['length_seconds'][0])
423
424                 # token
425                 video_token = urllib.unquote_plus(video_info['token'][0])
426
427                 # Decide which formats to download
428                 req_format = self._downloader.params.get('format', None)
429
430                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
431                         self.report_rtmp_download()
432                         video_url_list = [(None, video_info['conn'][0])]
433                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
434                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
435                         url_data = [parse_qs(uds) for uds in url_data_strs]
436                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
437                         url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
438
439                         format_limit = self._downloader.params.get('format_limit', None)
440                         available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
441                         if format_limit is not None and format_limit in available_formats:
442                                 format_list = available_formats[available_formats.index(format_limit):]
443                         else:
444                                 format_list = available_formats
445                         existing_formats = [x for x in format_list if x in url_map]
446                         if len(existing_formats) == 0:
447                                 self._downloader.trouble(u'ERROR: no known formats available for video')
448                                 return
449                         if self._downloader.params.get('listformats', None):
450                                 self._print_formats(existing_formats)
451                                 return
452                         if req_format is None or req_format == 'best':
453                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
454                         elif req_format == 'worst':
455                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
456                         elif req_format in ('-1', 'all'):
457                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
458                         else:
459                                 # Specific formats. We pick the first in a slash-delimeted sequence.
460                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
461                                 req_formats = req_format.split('/')
462                                 video_url_list = None
463                                 for rf in req_formats:
464                                         if rf in url_map:
465                                                 video_url_list = [(rf, url_map[rf])]
466                                                 break
467                                 if video_url_list is None:
468                                         self._downloader.trouble(u'ERROR: requested format not available')
469                                         return
470                 else:
471                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
472                         return
473
474                 results = []
475                 for format_param, video_real_url in video_url_list:
476                         # Extension
477                         video_extension = self._video_extensions.get(format_param, 'flv')
478
479                         video_format = '{} - {}'.format(format_param.decode('utf-8') if format_param else video_extension.decode('utf-8'),
480                                                             self._video_dimensions.get(format_param, '???'))
481
482                         results.append({
483                                 'id':           video_id.decode('utf-8'),
484                                 'url':          video_real_url.decode('utf-8'),
485                                 'uploader':     video_uploader.decode('utf-8'),
486                                 'upload_date':  upload_date,
487                                 'title':        video_title,
488                                 'ext':          video_extension.decode('utf-8'),
489                                 'format':       video_format,
490                                 'thumbnail':    video_thumbnail.decode('utf-8'),
491                                 'description':  video_description,
492                                 'player_url':   player_url,
493                                 'subtitles':    video_subtitles,
494                                 'duration':             video_duration
495                         })
496                 return results
497
498
499 class MetacafeIE(InfoExtractor):
500         """Information Extractor for metacafe.com."""
501
502         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
503         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
504         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
505         IE_NAME = u'metacafe'
506
507         def __init__(self, downloader=None):
508                 InfoExtractor.__init__(self, downloader)
509
510         def report_disclaimer(self):
511                 """Report disclaimer retrieval."""
512                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
513
514         def report_age_confirmation(self):
515                 """Report attempt to confirm age."""
516                 self._downloader.to_screen(u'[metacafe] Confirming age')
517
518         def report_download_webpage(self, video_id):
519                 """Report webpage download."""
520                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
521
522         def report_extraction(self, video_id):
523                 """Report information extraction."""
524                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
525
526         def _real_initialize(self):
527                 # Retrieve disclaimer
528                 request = urllib2.Request(self._DISCLAIMER)
529                 try:
530                         self.report_disclaimer()
531                         disclaimer = urllib2.urlopen(request).read()
532                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
533                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
534                         return
535
536                 # Confirm age
537                 disclaimer_form = {
538                         'filters': '0',
539                         'submit': "Continue - I'm over 18",
540                         }
541                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
542                 try:
543                         self.report_age_confirmation()
544                         disclaimer = urllib2.urlopen(request).read()
545                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
546                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
547                         return
548
549         def _real_extract(self, url):
550                 # Extract id and simplified title from URL
551                 mobj = re.match(self._VALID_URL, url)
552                 if mobj is None:
553                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
554                         return
555
556                 video_id = mobj.group(1)
557
558                 # Check if video comes from YouTube
559                 mobj2 = re.match(r'^yt-(.*)$', video_id)
560                 if mobj2 is not None:
561                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
562                         return
563
564                 # Retrieve video webpage to extract further information
565                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
566                 try:
567                         self.report_download_webpage(video_id)
568                         webpage = urllib2.urlopen(request).read()
569                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
570                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
571                         return
572
573                 # Extract URL, uploader and title from webpage
574                 self.report_extraction(video_id)
575                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
576                 if mobj is not None:
577                         mediaURL = urllib.unquote(mobj.group(1))
578                         video_extension = mediaURL[-3:]
579
580                         # Extract gdaKey if available
581                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
582                         if mobj is None:
583                                 video_url = mediaURL
584                         else:
585                                 gdaKey = mobj.group(1)
586                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
587                 else:
588                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
589                         if mobj is None:
590                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
591                                 return
592                         vardict = parse_qs(mobj.group(1))
593                         if 'mediaData' not in vardict:
594                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
595                                 return
596                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
597                         if mobj is None:
598                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
599                                 return
600                         mediaURL = mobj.group(1).replace('\\/', '/')
601                         video_extension = mediaURL[-3:]
602                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
603
604                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
605                 if mobj is None:
606                         self._downloader.trouble(u'ERROR: unable to extract title')
607                         return
608                 video_title = mobj.group(1).decode('utf-8')
609
610                 mobj = re.search(r'submitter=(.*?);', webpage)
611                 if mobj is None:
612                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
613                         return
614                 video_uploader = mobj.group(1)
615
616                 return [{
617                         'id':           video_id.decode('utf-8'),
618                         'url':          video_url.decode('utf-8'),
619                         'uploader':     video_uploader.decode('utf-8'),
620                         'upload_date':  u'NA',
621                         'title':        video_title,
622                         'ext':          video_extension.decode('utf-8'),
623                 }]
624
625
626 class DailymotionIE(InfoExtractor):
627         """Information Extractor for Dailymotion"""
628
629         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
630         IE_NAME = u'dailymotion'
631
632         def __init__(self, downloader=None):
633                 InfoExtractor.__init__(self, downloader)
634
635         def report_download_webpage(self, video_id):
636                 """Report webpage download."""
637                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
638
639         def report_extraction(self, video_id):
640                 """Report information extraction."""
641                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
642
643         def _real_extract(self, url):
644                 # Extract id and simplified title from URL
645                 mobj = re.match(self._VALID_URL, url)
646                 if mobj is None:
647                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
648                         return
649
650                 video_id = mobj.group(1).split('_')[0].split('?')[0]
651
652                 video_extension = 'mp4'
653
654                 # Retrieve video webpage to extract further information
655                 request = urllib2.Request(url)
656                 request.add_header('Cookie', 'family_filter=off')
657                 try:
658                         self.report_download_webpage(video_id)
659                         webpage = urllib2.urlopen(request).read()
660                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
661                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
662                         return
663
664                 # Extract URL, uploader and title from webpage
665                 self.report_extraction(video_id)
666                 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
667                 if mobj is None:
668                         self._downloader.trouble(u'ERROR: unable to extract media URL')
669                         return
670                 flashvars = urllib.unquote(mobj.group(1))
671
672                 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
673                         if key in flashvars:
674                                 max_quality = key
675                                 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
676                                 break
677                 else:
678                         self._downloader.trouble(u'ERROR: unable to extract video URL')
679                         return
680
681                 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
682                 if mobj is None:
683                         self._downloader.trouble(u'ERROR: unable to extract video URL')
684                         return
685
686                 video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')
687
688                 # TODO: support choosing qualities
689
690                 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
691                 if mobj is None:
692                         self._downloader.trouble(u'ERROR: unable to extract title')
693                         return
694                 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
695
696                 video_uploader = u'NA'
697                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
698                 if mobj is None:
699                         # lookin for official user
700                         mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
701                         if mobj_official is None:
702                                 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
703                         else:
704                                 video_uploader = mobj_official.group(1)
705                 else:
706                         video_uploader = mobj.group(1)
707
708                 video_upload_date = u'NA'
709                 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
710                 if mobj is not None:
711                         video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
712
713                 return [{
714                         'id':           video_id.decode('utf-8'),
715                         'url':          video_url.decode('utf-8'),
716                         'uploader':     video_uploader.decode('utf-8'),
717                         'upload_date':  video_upload_date,
718                         'title':        video_title,
719                         'ext':          video_extension.decode('utf-8'),
720                 }]
721
722
723 class GoogleIE(InfoExtractor):
724         """Information extractor for video.google.com."""
725
726         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
727         IE_NAME = u'video.google'
728
729         def __init__(self, downloader=None):
730                 InfoExtractor.__init__(self, downloader)
731
732         def report_download_webpage(self, video_id):
733                 """Report webpage download."""
734                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
735
736         def report_extraction(self, video_id):
737                 """Report information extraction."""
738                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
739
740         def _real_extract(self, url):
741                 # Extract id from URL
742                 mobj = re.match(self._VALID_URL, url)
743                 if mobj is None:
744                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
745                         return
746
747                 video_id = mobj.group(1)
748
749                 video_extension = 'mp4'
750
751                 # Retrieve video webpage to extract further information
752                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
753                 try:
754                         self.report_download_webpage(video_id)
755                         webpage = urllib2.urlopen(request).read()
756                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
757                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
758                         return
759
760                 # Extract URL, uploader, and title from webpage
761                 self.report_extraction(video_id)
762                 mobj = re.search(r"download_url:'([^']+)'", webpage)
763                 if mobj is None:
764                         video_extension = 'flv'
765                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
766                 if mobj is None:
767                         self._downloader.trouble(u'ERROR: unable to extract media URL')
768                         return
769                 mediaURL = urllib.unquote(mobj.group(1))
770                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
771                 mediaURL = mediaURL.replace('\\x26', '\x26')
772
773                 video_url = mediaURL
774
775                 mobj = re.search(r'<title>(.*)</title>', webpage)
776                 if mobj is None:
777                         self._downloader.trouble(u'ERROR: unable to extract title')
778                         return
779                 video_title = mobj.group(1).decode('utf-8')
780
781                 # Extract video description
782                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
783                 if mobj is None:
784                         self._downloader.trouble(u'ERROR: unable to extract video description')
785                         return
786                 video_description = mobj.group(1).decode('utf-8')
787                 if not video_description:
788                         video_description = 'No description available.'
789
790                 # Extract video thumbnail
791                 if self._downloader.params.get('forcethumbnail', False):
792                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
793                         try:
794                                 webpage = urllib2.urlopen(request).read()
795                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
796                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
797                                 return
798                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
799                         if mobj is None:
800                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
801                                 return
802                         video_thumbnail = mobj.group(1)
803                 else:   # we need something to pass to process_info
804                         video_thumbnail = ''
805
806                 return [{
807                         'id':           video_id.decode('utf-8'),
808                         'url':          video_url.decode('utf-8'),
809                         'uploader':     u'NA',
810                         'upload_date':  u'NA',
811                         'title':        video_title,
812                         'ext':          video_extension.decode('utf-8'),
813                 }]
814
815
816 class PhotobucketIE(InfoExtractor):
817         """Information extractor for photobucket.com."""
818
819         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
820         IE_NAME = u'photobucket'
821
822         def __init__(self, downloader=None):
823                 InfoExtractor.__init__(self, downloader)
824
825         def report_download_webpage(self, video_id):
826                 """Report webpage download."""
827                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
828
829         def report_extraction(self, video_id):
830                 """Report information extraction."""
831                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
832
833         def _real_extract(self, url):
834                 # Extract id from URL
835                 mobj = re.match(self._VALID_URL, url)
836                 if mobj is None:
837                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
838                         return
839
840                 video_id = mobj.group(1)
841
842                 video_extension = 'flv'
843
844                 # Retrieve video webpage to extract further information
845                 request = urllib2.Request(url)
846                 try:
847                         self.report_download_webpage(video_id)
848                         webpage = urllib2.urlopen(request).read()
849                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
850                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
851                         return
852
853                 # Extract URL, uploader, and title from webpage
854                 self.report_extraction(video_id)
855                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
856                 if mobj is None:
857                         self._downloader.trouble(u'ERROR: unable to extract media URL')
858                         return
859                 mediaURL = urllib.unquote(mobj.group(1))
860
861                 video_url = mediaURL
862
863                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
864                 if mobj is None:
865                         self._downloader.trouble(u'ERROR: unable to extract title')
866                         return
867                 video_title = mobj.group(1).decode('utf-8')
868
869                 video_uploader = mobj.group(2).decode('utf-8')
870
871                 return [{
872                         'id':           video_id.decode('utf-8'),
873                         'url':          video_url.decode('utf-8'),
874                         'uploader':     video_uploader,
875                         'upload_date':  u'NA',
876                         'title':        video_title,
877                         'ext':          video_extension.decode('utf-8'),
878                 }]
879
880
881 class YahooIE(InfoExtractor):
882         """Information extractor for video.yahoo.com."""
883
884         # _VALID_URL matches all Yahoo! Video URLs
885         # _VPAGE_URL matches only the extractable '/watch/' URLs
886         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
887         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
888         IE_NAME = u'video.yahoo'
889
890         def __init__(self, downloader=None):
891                 InfoExtractor.__init__(self, downloader)
892
893         def report_download_webpage(self, video_id):
894                 """Report webpage download."""
895                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
896
897         def report_extraction(self, video_id):
898                 """Report information extraction."""
899                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
900
901         def _real_extract(self, url, new_video=True):
902                 # Extract ID from URL
903                 mobj = re.match(self._VALID_URL, url)
904                 if mobj is None:
905                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
906                         return
907
908                 video_id = mobj.group(2)
909                 video_extension = 'flv'
910
911                 # Rewrite valid but non-extractable URLs as
912                 # extractable English language /watch/ URLs
913                 if re.match(self._VPAGE_URL, url) is None:
914                         request = urllib2.Request(url)
915                         try:
916                                 webpage = urllib2.urlopen(request).read()
917                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
918                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
919                                 return
920
921                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
922                         if mobj is None:
923                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
924                                 return
925                         yahoo_id = mobj.group(1)
926
927                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
928                         if mobj is None:
929                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
930                                 return
931                         yahoo_vid = mobj.group(1)
932
933                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
934                         return self._real_extract(url, new_video=False)
935
936                 # Retrieve video webpage to extract further information
937                 request = urllib2.Request(url)
938                 try:
939                         self.report_download_webpage(video_id)
940                         webpage = urllib2.urlopen(request).read()
941                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
942                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
943                         return
944
945                 # Extract uploader and title from webpage
946                 self.report_extraction(video_id)
947                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
948                 if mobj is None:
949                         self._downloader.trouble(u'ERROR: unable to extract video title')
950                         return
951                 video_title = mobj.group(1).decode('utf-8')
952
953                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
954                 if mobj is None:
955                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
956                         return
957                 video_uploader = mobj.group(1).decode('utf-8')
958
959                 # Extract video thumbnail
960                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
961                 if mobj is None:
962                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
963                         return
964                 video_thumbnail = mobj.group(1).decode('utf-8')
965
966                 # Extract video description
967                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
968                 if mobj is None:
969                         self._downloader.trouble(u'ERROR: unable to extract video description')
970                         return
971                 video_description = mobj.group(1).decode('utf-8')
972                 if not video_description:
973                         video_description = 'No description available.'
974
975                 # Extract video height and width
976                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
977                 if mobj is None:
978                         self._downloader.trouble(u'ERROR: unable to extract video height')
979                         return
980                 yv_video_height = mobj.group(1)
981
982                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
983                 if mobj is None:
984                         self._downloader.trouble(u'ERROR: unable to extract video width')
985                         return
986                 yv_video_width = mobj.group(1)
987
988                 # Retrieve video playlist to extract media URL
989                 # I'm not completely sure what all these options are, but we
990                 # seem to need most of them, otherwise the server sends a 401.
991                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
992                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
993                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
994                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
995                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
996                 try:
997                         self.report_download_webpage(video_id)
998                         webpage = urllib2.urlopen(request).read()
999                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1000                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1001                         return
1002
1003                 # Extract media URL from playlist XML
1004                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1005                 if mobj is None:
1006                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1007                         return
1008                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1009                 video_url = unescapeHTML(video_url)
1010
1011                 return [{
1012                         'id':           video_id.decode('utf-8'),
1013                         'url':          video_url,
1014                         'uploader':     video_uploader,
1015                         'upload_date':  u'NA',
1016                         'title':        video_title,
1017                         'ext':          video_extension.decode('utf-8'),
1018                         'thumbnail':    video_thumbnail.decode('utf-8'),
1019                         'description':  video_description,
1020                         'thumbnail':    video_thumbnail,
1021                 }]
1022
1023
class VimeoIE(InfoExtractor):
	"""Information extractor for vimeo.com."""

	# _VALID_URL matches Vimeo URLs
	_VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
	IE_NAME = u'vimeo'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

	def _real_extract(self, url, new_video=True):
		"""Extract video info from the embedded config JSON on a Vimeo page.

		new_video is unused in this body; it mirrors the YahooIE signature.
		Returns a single-entry list of info dicts, or None after reporting
		an error through the downloader.
		"""
		# Extract ID from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url, None, std_headers)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
			return

		# Now we begin extracting as much information as we can from what we
		# retrieved. First we extract the information common to all extractors,
		# and latter we extract those that are Vimeo specific.
		self.report_extraction(video_id)

		# Extract the config JSON
		# NOTE(review): naive string slicing; assumes the page embeds exactly
		# one " = {config:...,assets:" section — breaks silently (IndexError)
		# if Vimeo changes the markup.
		config = webpage.split(' = {config:')[1].split(',assets:')[0]
		try:
			config = json.loads(config)
		except:
			# NOTE(review): bare except also swallows KeyboardInterrupt;
			# narrowing to ValueError would be safer here.
			self._downloader.trouble(u'ERROR: unable to extract info section')
			return

		# Extract title
		video_title = config["video"]["title"]

		# Extract uploader
		video_uploader = config["video"]["owner"]["name"]

		# Extract video thumbnail
		video_thumbnail = config["video"]["thumbnail"]

		# Extract video description (from the page HTML, not the config JSON)
		video_description = get_element_by_id("description", webpage.decode('utf8'))
		if video_description: video_description = clean_html(video_description)
		else: video_description = ''

		# Extract upload date
		video_upload_date = u'NA'
		mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
		if mobj is not None:
			video_upload_date = mobj.group(1)

		# Vimeo specific: extract request signature and timestamp
		sig = config['request']['signature']
		timestamp = config['request']['timestamp']

		# Vimeo specific: extract video codec and quality information
		# First consider quality, then codecs, then take everything
		# TODO bind to format param
		codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
		files = { 'hd': [], 'sd': [], 'other': []}
		for codec_name, codec_extension in codecs:
			if codec_name in config["video"]["files"]:
				if 'hd' in config["video"]["files"][codec_name]:
					files['hd'].append((codec_name, codec_extension, 'hd'))
				elif 'sd' in config["video"]["files"][codec_name]:
					files['sd'].append((codec_name, codec_extension, 'sd'))
				else:
					# Neither hd nor sd listed: fall back to the first
					# quality string the config offers for this codec
					files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

		# Take the first available (codec, extension, quality) triple,
		# preferring hd over sd over anything else
		for quality in ('hd', 'sd', 'other'):
			if len(files[quality]) > 0:
				video_quality = files[quality][0][2]
				video_codec = files[quality][0][0]
				video_extension = files[quality][0][1]
				self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
				break
		else:
			self._downloader.trouble(u'ERROR: no known codec found')
			return

		video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
					%(video_id, sig, timestamp, video_quality, video_codec.upper())

		return [{
			'id':		video_id,
			'url':		video_url,
			'uploader':	video_uploader,
			'upload_date':	video_upload_date,
			'title':	video_title,
			'ext':		video_extension,
			'thumbnail':	video_thumbnail,
			'description':	video_description,
		}]
1135
1136
1137 class GenericIE(InfoExtractor):
1138         """Generic last-resort information extractor."""
1139
1140         _VALID_URL = r'.*'
1141         IE_NAME = u'generic'
1142
1143         def __init__(self, downloader=None):
1144                 InfoExtractor.__init__(self, downloader)
1145
1146         def report_download_webpage(self, video_id):
1147                 """Report webpage download."""
1148                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1149                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1150
	def report_extraction(self, video_id):
		"""Report information extraction."""
		# Progress message only; no extraction happens here.
		self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1154
	def report_following_redirect(self, new_url):
		"""Report that a redirect to new_url is being followed."""
		self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1158                 
1159         def _test_redirect(self, url):
1160                 """Check if it is a redirect, like url shorteners, in case restart chain."""
1161                 class HeadRequest(urllib2.Request):
1162                         def get_method(self):
1163                                 return "HEAD"
1164
1165                 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1166                         """
1167                         Subclass the HTTPRedirectHandler to make it use our 
1168                         HeadRequest also on the redirected URL
1169                         """
1170                         def redirect_request(self, req, fp, code, msg, headers, newurl): 
1171                                 if code in (301, 302, 303, 307):
1172                                         newurl = newurl.replace(' ', '%20') 
1173                                         newheaders = dict((k,v) for k,v in req.headers.items()
1174                                                                           if k.lower() not in ("content-length", "content-type"))
1175                                         return HeadRequest(newurl, 
1176                                                                            headers=newheaders,
1177                                                                            origin_req_host=req.get_origin_req_host(), 
1178                                                                            unverifiable=True) 
1179                                 else: 
1180                                         raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp) 
1181
1182                 class HTTPMethodFallback(urllib2.BaseHandler):
1183                         """
1184                         Fallback to GET if HEAD is not allowed (405 HTTP error)
1185                         """
1186                         def http_error_405(self, req, fp, code, msg, headers): 
1187                                 fp.read()
1188                                 fp.close()
1189
1190                                 newheaders = dict((k,v) for k,v in req.headers.items()
1191                                                                   if k.lower() not in ("content-length", "content-type"))
1192                                 return self.parent.open(urllib2.Request(req.get_full_url(), 
1193                                                                                                  headers=newheaders, 
1194                                                                                                  origin_req_host=req.get_origin_req_host(), 
1195                                                                                                  unverifiable=True))
1196
1197                 # Build our opener
1198                 opener = urllib2.OpenerDirector() 
1199                 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1200                                                 HTTPMethodFallback, HEADRedirectHandler,
1201                                                 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1202                         opener.add_handler(handler())
1203
1204                 response = opener.open(HeadRequest(url))
1205                 new_url = response.geturl()
1206                 
1207                 if url == new_url: return False
1208                 
1209                 self.report_following_redirect(new_url)
1210                 self._downloader.download([new_url])
1211                 return True
1212
1213         def _real_extract(self, url):
1214                 if self._test_redirect(url): return
1215
1216                 video_id = url.split('/')[-1]
1217                 request = urllib2.Request(url)
1218                 try:
1219                         self.report_download_webpage(video_id)
1220                         webpage = urllib2.urlopen(request).read()
1221                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1222                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1223                         return
1224                 except ValueError, err:
1225                         # since this is the last-resort InfoExtractor, if
1226                         # this error is thrown, it'll be thrown here
1227                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1228                         return
1229
1230                 self.report_extraction(video_id)
1231                 # Start with something easy: JW Player in SWFObject
1232                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1233                 if mobj is None:
1234                         # Broaden the search a little bit
1235                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1236                 if mobj is None:
1237                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1238                         return
1239
1240                 # It's possible that one of the regexes
1241                 # matched, but returned an empty group:
1242                 if mobj.group(1) is None:
1243                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1244                         return
1245
1246                 video_url = urllib.unquote(mobj.group(1))
1247                 video_id = os.path.basename(video_url)
1248
1249                 # here's a fun little line of code for you:
1250                 video_extension = os.path.splitext(video_id)[1][1:]
1251                 video_id = os.path.splitext(video_id)[0]
1252
1253                 # it's tempting to parse this further, but you would
1254                 # have to take into account all the variations like
1255                 #   Video Title - Site Name
1256                 #   Site Name | Video Title
1257                 #   Video Title - Tagline | Site Name
1258                 # and so on and so forth; it's just not practical
1259                 mobj = re.search(r'<title>(.*)</title>', webpage)
1260                 if mobj is None:
1261                         self._downloader.trouble(u'ERROR: unable to extract title')
1262                         return
1263                 video_title = mobj.group(1).decode('utf-8')
1264
1265                 # video uploader is domain name
1266                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1267                 if mobj is None:
1268                         self._downloader.trouble(u'ERROR: unable to extract title')
1269                         return
1270                 video_uploader = mobj.group(1).decode('utf-8')
1271
1272                 return [{
1273                         'id':           video_id.decode('utf-8'),
1274                         'url':          video_url.decode('utf-8'),
1275                         'uploader':     video_uploader,
1276                         'upload_date':  u'NA',
1277                         'title':        video_title,
1278                         'ext':          video_extension.decode('utf-8'),
1279                 }]
1280
1281
1282 class YoutubeSearchIE(InfoExtractor):
1283         """Information Extractor for YouTube search queries."""
1284         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1285         _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1286         _max_youtube_results = 1000
1287         IE_NAME = u'youtube:search'
1288
1289         def __init__(self, downloader=None):
1290                 InfoExtractor.__init__(self, downloader)
1291
1292         def report_download_page(self, query, pagenum):
1293                 """Report attempt to download search page with given number."""
1294                 query = query.decode(preferredencoding())
1295                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1296
1297         def _real_extract(self, query):
1298                 mobj = re.match(self._VALID_URL, query)
1299                 if mobj is None:
1300                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1301                         return
1302
1303                 prefix, query = query.split(':')
1304                 prefix = prefix[8:]
1305                 query = query.encode('utf-8')
1306                 if prefix == '':
1307                         self._download_n_results(query, 1)
1308                         return
1309                 elif prefix == 'all':
1310                         self._download_n_results(query, self._max_youtube_results)
1311                         return
1312                 else:
1313                         try:
1314                                 n = long(prefix)
1315                                 if n <= 0:
1316                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1317                                         return
1318                                 elif n > self._max_youtube_results:
1319                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1320                                         n = self._max_youtube_results
1321                                 self._download_n_results(query, n)
1322                                 return
1323                         except ValueError: # parsing prefix as integer fails
1324                                 self._download_n_results(query, 1)
1325                                 return
1326
1327         def _download_n_results(self, query, n):
1328                 """Downloads a specified number of results for a query"""
1329
1330                 video_ids = []
1331                 pagenum = 0
1332                 limit = n
1333
1334                 while (50 * pagenum) < limit:
1335                         self.report_download_page(query, pagenum+1)
1336                         result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1337                         request = urllib2.Request(result_url)
1338                         try:
1339                                 data = urllib2.urlopen(request).read()
1340                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1341                                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1342                                 return
1343                         api_response = json.loads(data)['data']
1344
1345                         new_ids = list(video['id'] for video in api_response['items'])
1346                         video_ids += new_ids
1347
1348                         limit = min(n, api_response['totalItems'])
1349                         pagenum += 1
1350
1351                 if len(video_ids) > n:
1352                         video_ids = video_ids[:n]
1353                 for id in video_ids:
1354                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1355                 return
1356
1357
1358 class GoogleSearchIE(InfoExtractor):
1359         """Information Extractor for Google Video search queries."""
1360         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1361         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1362         _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1363         _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1364         _max_google_results = 1000
1365         IE_NAME = u'video.google:search'
1366
1367         def __init__(self, downloader=None):
1368                 InfoExtractor.__init__(self, downloader)
1369
1370         def report_download_page(self, query, pagenum):
1371                 """Report attempt to download playlist page with given number."""
1372                 query = query.decode(preferredencoding())
1373                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1374
1375         def _real_extract(self, query):
1376                 mobj = re.match(self._VALID_URL, query)
1377                 if mobj is None:
1378                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1379                         return
1380
1381                 prefix, query = query.split(':')
1382                 prefix = prefix[8:]
1383                 query = query.encode('utf-8')
1384                 if prefix == '':
1385                         self._download_n_results(query, 1)
1386                         return
1387                 elif prefix == 'all':
1388                         self._download_n_results(query, self._max_google_results)
1389                         return
1390                 else:
1391                         try:
1392                                 n = long(prefix)
1393                                 if n <= 0:
1394                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1395                                         return
1396                                 elif n > self._max_google_results:
1397                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1398                                         n = self._max_google_results
1399                                 self._download_n_results(query, n)
1400                                 return
1401                         except ValueError: # parsing prefix as integer fails
1402                                 self._download_n_results(query, 1)
1403                                 return
1404
1405         def _download_n_results(self, query, n):
1406                 """Downloads a specified number of results for a query"""
1407
1408                 video_ids = []
1409                 pagenum = 0
1410
1411                 while True:
1412                         self.report_download_page(query, pagenum)
1413                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1414                         request = urllib2.Request(result_url)
1415                         try:
1416                                 page = urllib2.urlopen(request).read()
1417                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1418                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1419                                 return
1420
1421                         # Extract video identifiers
1422                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1423                                 video_id = mobj.group(1)
1424                                 if video_id not in video_ids:
1425                                         video_ids.append(video_id)
1426                                         if len(video_ids) == n:
1427                                                 # Specified n videos reached
1428                                                 for id in video_ids:
1429                                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1430                                                 return
1431
1432                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1433                                 for id in video_ids:
1434                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1435                                 return
1436
1437                         pagenum = pagenum + 1
1438
1439
1440 class YahooSearchIE(InfoExtractor):
1441         """Information Extractor for Yahoo! Video search queries."""
1442         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1443         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1444         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1445         _MORE_PAGES_INDICATOR = r'\s*Next'
1446         _max_yahoo_results = 1000
1447         IE_NAME = u'video.yahoo:search'
1448
1449         def __init__(self, downloader=None):
1450                 InfoExtractor.__init__(self, downloader)
1451
1452         def report_download_page(self, query, pagenum):
1453                 """Report attempt to download playlist page with given number."""
1454                 query = query.decode(preferredencoding())
1455                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1456
1457         def _real_extract(self, query):
1458                 mobj = re.match(self._VALID_URL, query)
1459                 if mobj is None:
1460                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1461                         return
1462
1463                 prefix, query = query.split(':')
1464                 prefix = prefix[8:]
1465                 query = query.encode('utf-8')
1466                 if prefix == '':
1467                         self._download_n_results(query, 1)
1468                         return
1469                 elif prefix == 'all':
1470                         self._download_n_results(query, self._max_yahoo_results)
1471                         return
1472                 else:
1473                         try:
1474                                 n = long(prefix)
1475                                 if n <= 0:
1476                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1477                                         return
1478                                 elif n > self._max_yahoo_results:
1479                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1480                                         n = self._max_yahoo_results
1481                                 self._download_n_results(query, n)
1482                                 return
1483                         except ValueError: # parsing prefix as integer fails
1484                                 self._download_n_results(query, 1)
1485                                 return
1486
1487         def _download_n_results(self, query, n):
1488                 """Downloads a specified number of results for a query"""
1489
1490                 video_ids = []
1491                 already_seen = set()
1492                 pagenum = 1
1493
1494                 while True:
1495                         self.report_download_page(query, pagenum)
1496                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1497                         request = urllib2.Request(result_url)
1498                         try:
1499                                 page = urllib2.urlopen(request).read()
1500                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1501                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1502                                 return
1503
1504                         # Extract video identifiers
1505                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1506                                 video_id = mobj.group(1)
1507                                 if video_id not in already_seen:
1508                                         video_ids.append(video_id)
1509                                         already_seen.add(video_id)
1510                                         if len(video_ids) == n:
1511                                                 # Specified n videos reached
1512                                                 for id in video_ids:
1513                                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1514                                                 return
1515
1516                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1517                                 for id in video_ids:
1518                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1519                                 return
1520
1521                         pagenum = pagenum + 1
1522
1523
1524 class YoutubePlaylistIE(InfoExtractor):
1525         """Information Extractor for YouTube playlists."""
1526
1527         _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1528         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1529         _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
1530         _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1531         IE_NAME = u'youtube:playlist'
1532
1533         def __init__(self, downloader=None):
1534                 InfoExtractor.__init__(self, downloader)
1535
1536         def report_download_page(self, playlist_id, pagenum):
1537                 """Report attempt to download playlist page with given number."""
1538                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1539
1540         def _real_extract(self, url):
1541                 # Extract playlist id
1542                 mobj = re.match(self._VALID_URL, url)
1543                 if mobj is None:
1544                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1545                         return
1546
1547                 # Single video case
1548                 if mobj.group(3) is not None:
1549                         self._downloader.download([mobj.group(3)])
1550                         return
1551
1552                 # Download playlist pages
1553                 # prefix is 'p' as default for playlists but there are other types that need extra care
1554                 playlist_prefix = mobj.group(1)
1555                 if playlist_prefix == 'a':
1556                         playlist_access = 'artist'
1557                 else:
1558                         playlist_prefix = 'p'
1559                         playlist_access = 'view_play_list'
1560                 playlist_id = mobj.group(2)
1561                 video_ids = []
1562                 pagenum = 1
1563
1564                 while True:
1565                         self.report_download_page(playlist_id, pagenum)
1566                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1567                         request = urllib2.Request(url)
1568                         try:
1569                                 page = urllib2.urlopen(request).read()
1570                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1571                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1572                                 return
1573
1574                         # Extract video identifiers
1575                         ids_in_page = []
1576                         for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1577                                 if mobj.group(1) not in ids_in_page:
1578                                         ids_in_page.append(mobj.group(1))
1579                         video_ids.extend(ids_in_page)
1580
1581                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1582                                 break
1583                         pagenum = pagenum + 1
1584
1585                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1586                 playlistend = self._downloader.params.get('playlistend', -1)
1587                 if playlistend == -1:
1588                         video_ids = video_ids[playliststart:]
1589                 else:
1590                         video_ids = video_ids[playliststart:playlistend]
1591
1592                 for id in video_ids:
1593                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1594                 return
1595
1596
1597 class YoutubeChannelIE(InfoExtractor):
1598         """Information Extractor for YouTube channels."""
1599
1600         _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1601         _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1602         _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
1603         IE_NAME = u'youtube:channel'
1604
1605         def report_download_page(self, channel_id, pagenum):
1606                 """Report attempt to download channel page with given number."""
1607                 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1608
1609         def _real_extract(self, url):
1610                 # Extract channel id
1611                 mobj = re.match(self._VALID_URL, url)
1612                 if mobj is None:
1613                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1614                         return
1615
1616                 # Download channel pages
1617                 channel_id = mobj.group(1)
1618                 video_ids = []
1619                 pagenum = 1
1620
1621                 while True:
1622                         self.report_download_page(channel_id, pagenum)
1623                         url = self._TEMPLATE_URL % (channel_id, pagenum)
1624                         request = urllib2.Request(url)
1625                         try:
1626                                 page = urllib2.urlopen(request).read()
1627                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1628                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1629                                 return
1630
1631                         # Extract video identifiers
1632                         ids_in_page = []
1633                         for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1634                                 if mobj.group(1) not in ids_in_page:
1635                                         ids_in_page.append(mobj.group(1))
1636                         video_ids.extend(ids_in_page)
1637
1638                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1639                                 break
1640                         pagenum = pagenum + 1
1641
1642                 for id in video_ids:
1643                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1644                 return
1645
1646
1647 class YoutubeUserIE(InfoExtractor):
1648         """Information Extractor for YouTube users."""
1649
1650         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1651         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1652         _GDATA_PAGE_SIZE = 50
1653         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1654         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1655         IE_NAME = u'youtube:user'
1656
1657         def __init__(self, downloader=None):
1658                 InfoExtractor.__init__(self, downloader)
1659
1660         def report_download_page(self, username, start_index):
1661                 """Report attempt to download user page."""
1662                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1663                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1664
1665         def _real_extract(self, url):
1666                 # Extract username
1667                 mobj = re.match(self._VALID_URL, url)
1668                 if mobj is None:
1669                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1670                         return
1671
1672                 username = mobj.group(1)
1673
1674                 # Download video ids using YouTube Data API. Result size per
1675                 # query is limited (currently to 50 videos) so we need to query
1676                 # page by page until there are no video ids - it means we got
1677                 # all of them.
1678
1679                 video_ids = []
1680                 pagenum = 0
1681
1682                 while True:
1683                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1684                         self.report_download_page(username, start_index)
1685
1686                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1687
1688                         try:
1689                                 page = urllib2.urlopen(request).read()
1690                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1691                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1692                                 return
1693
1694                         # Extract video identifiers
1695                         ids_in_page = []
1696
1697                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1698                                 if mobj.group(1) not in ids_in_page:
1699                                         ids_in_page.append(mobj.group(1))
1700
1701                         video_ids.extend(ids_in_page)
1702
1703                         # A little optimization - if current page is not
1704                         # "full", ie. does not contain PAGE_SIZE video ids then
1705                         # we can assume that this page is the last one - there
1706                         # are no more ids on further pages - no need to query
1707                         # again.
1708
1709                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1710                                 break
1711
1712                         pagenum += 1
1713
1714                 all_ids_count = len(video_ids)
1715                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1716                 playlistend = self._downloader.params.get('playlistend', -1)
1717
1718                 if playlistend == -1:
1719                         video_ids = video_ids[playliststart:]
1720                 else:
1721                         video_ids = video_ids[playliststart:playlistend]
1722
1723                 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1724                                 (username, all_ids_count, len(video_ids)))
1725
1726                 for video_id in video_ids:
1727                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1728
1729
1730 class BlipTVUserIE(InfoExtractor):
1731         """Information Extractor for blip.tv users."""
1732
1733         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1734         _PAGE_SIZE = 12
1735         IE_NAME = u'blip.tv:user'
1736
1737         def __init__(self, downloader=None):
1738                 InfoExtractor.__init__(self, downloader)
1739
1740         def report_download_page(self, username, pagenum):
1741                 """Report attempt to download user page."""
1742                 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1743                                 (self.IE_NAME, username, pagenum))
1744
1745         def _real_extract(self, url):
1746                 # Extract username
1747                 mobj = re.match(self._VALID_URL, url)
1748                 if mobj is None:
1749                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1750                         return
1751
1752                 username = mobj.group(1)
1753
1754                 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1755
1756                 request = urllib2.Request(url)
1757
1758                 try:
1759                         page = urllib2.urlopen(request).read().decode('utf-8')
1760                         mobj = re.search(r'data-users-id="([^"]+)"', page)
1761                         page_base = page_base % mobj.group(1)
1762                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1763                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1764                         return
1765
1766
1767                 # Download video ids using BlipTV Ajax calls. Result size per
1768                 # query is limited (currently to 12 videos) so we need to query
1769                 # page by page until there are no video ids - it means we got
1770                 # all of them.
1771
1772                 video_ids = []
1773                 pagenum = 1
1774
1775                 while True:
1776                         self.report_download_page(username, pagenum)
1777
1778                         request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1779
1780                         try:
1781                                 page = urllib2.urlopen(request).read().decode('utf-8')
1782                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1783                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1784                                 return
1785
1786                         # Extract video identifiers
1787                         ids_in_page = []
1788
1789                         for mobj in re.finditer(r'href="/([^"]+)"', page):
1790                                 if mobj.group(1) not in ids_in_page:
1791                                         ids_in_page.append(unescapeHTML(mobj.group(1)))
1792
1793                         video_ids.extend(ids_in_page)
1794
1795                         # A little optimization - if current page is not
1796                         # "full", ie. does not contain PAGE_SIZE video ids then
1797                         # we can assume that this page is the last one - there
1798                         # are no more ids on further pages - no need to query
1799                         # again.
1800
1801                         if len(ids_in_page) < self._PAGE_SIZE:
1802                                 break
1803
1804                         pagenum += 1
1805
1806                 all_ids_count = len(video_ids)
1807                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1808                 playlistend = self._downloader.params.get('playlistend', -1)
1809
1810                 if playlistend == -1:
1811                         video_ids = video_ids[playliststart:]
1812                 else:
1813                         video_ids = video_ids[playliststart:playlistend]
1814
1815                 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1816                                 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1817
1818                 for video_id in video_ids:
1819                         self._downloader.download([u'http://blip.tv/'+video_id])
1820
1821
1822 class DepositFilesIE(InfoExtractor):
1823         """Information extractor for depositfiles.com"""
1824
1825         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1826         IE_NAME = u'DepositFiles'
1827
1828         def __init__(self, downloader=None):
1829                 InfoExtractor.__init__(self, downloader)
1830
1831         def report_download_webpage(self, file_id):
1832                 """Report webpage download."""
1833                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1834
1835         def report_extraction(self, file_id):
1836                 """Report information extraction."""
1837                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1838
1839         def _real_extract(self, url):
1840                 file_id = url.split('/')[-1]
1841                 # Rebuild url in english locale
1842                 url = 'http://depositfiles.com/en/files/' + file_id
1843
1844                 # Retrieve file webpage with 'Free download' button pressed
1845                 free_download_indication = { 'gateway_result' : '1' }
1846                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1847                 try:
1848                         self.report_download_webpage(file_id)
1849                         webpage = urllib2.urlopen(request).read()
1850                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1851                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1852                         return
1853
1854                 # Search for the real file URL
1855                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1856                 if (mobj is None) or (mobj.group(1) is None):
1857                         # Try to figure out reason of the error.
1858                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1859                         if (mobj is not None) and (mobj.group(1) is not None):
1860                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1861                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1862                         else:
1863                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1864                         return
1865
1866                 file_url = mobj.group(1)
1867                 file_extension = os.path.splitext(file_url)[1][1:]
1868
1869                 # Search for file title
1870                 mobj = re.search(r'<b title="(.*?)">', webpage)
1871                 if mobj is None:
1872                         self._downloader.trouble(u'ERROR: unable to extract title')
1873                         return
1874                 file_title = mobj.group(1).decode('utf-8')
1875
1876                 return [{
1877                         'id':           file_id.decode('utf-8'),
1878                         'url':          file_url.decode('utf-8'),
1879                         'uploader':     u'NA',
1880                         'upload_date':  u'NA',
1881                         'title':        file_title,
1882                         'ext':          file_extension.decode('utf-8'),
1883                 }]
1884
1885
1886 class FacebookIE(InfoExtractor):
1887         """Information Extractor for Facebook"""
1888
1889         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1890         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1891         _NETRC_MACHINE = 'facebook'
1892         _available_formats = ['video', 'highqual', 'lowqual']
1893         _video_extensions = {
1894                 'video': 'mp4',
1895                 'highqual': 'mp4',
1896                 'lowqual': 'mp4',
1897         }
1898         IE_NAME = u'facebook'
1899
1900         def __init__(self, downloader=None):
1901                 InfoExtractor.__init__(self, downloader)
1902
1903         def _reporter(self, message):
1904                 """Add header and report message."""
1905                 self._downloader.to_screen(u'[facebook] %s' % message)
1906
1907         def report_login(self):
1908                 """Report attempt to log in."""
1909                 self._reporter(u'Logging in')
1910
1911         def report_video_webpage_download(self, video_id):
1912                 """Report attempt to download video webpage."""
1913                 self._reporter(u'%s: Downloading video webpage' % video_id)
1914
1915         def report_information_extraction(self, video_id):
1916                 """Report attempt to extract video information."""
1917                 self._reporter(u'%s: Extracting video information' % video_id)
1918
1919         def _parse_page(self, video_webpage):
1920                 """Extract video information from page"""
1921                 # General data
1922                 data = {'title': r'\("video_title", "(.*?)"\)',
1923                         'description': r'<div class="datawrap">(.*?)</div>',
1924                         'owner': r'\("video_owner_name", "(.*?)"\)',
1925                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1926                         }
1927                 video_info = {}
1928                 for piece in data.keys():
1929                         mobj = re.search(data[piece], video_webpage)
1930                         if mobj is not None:
1931                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1932
1933                 # Video urls
1934                 video_urls = {}
1935                 for fmt in self._available_formats:
1936                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1937                         if mobj is not None:
1938                                 # URL is in a Javascript segment inside an escaped Unicode format within
1939                                 # the generally utf-8 page
1940                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1941                 video_info['video_urls'] = video_urls
1942
1943                 return video_info
1944
1945         def _real_initialize(self):
1946                 if self._downloader is None:
1947                         return
1948
1949                 useremail = None
1950                 password = None
1951                 downloader_params = self._downloader.params
1952
1953                 # Attempt to use provided username and password or .netrc data
1954                 if downloader_params.get('username', None) is not None:
1955                         useremail = downloader_params['username']
1956                         password = downloader_params['password']
1957                 elif downloader_params.get('usenetrc', False):
1958                         try:
1959                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1960                                 if info is not None:
1961                                         useremail = info[0]
1962                                         password = info[2]
1963                                 else:
1964                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1965                         except (IOError, netrc.NetrcParseError), err:
1966                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
1967                                 return
1968
1969                 if useremail is None:
1970                         return
1971
1972                 # Log in
1973                 login_form = {
1974                         'email': useremail,
1975                         'pass': password,
1976                         'login': 'Log+In'
1977                         }
1978                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1979                 try:
1980                         self.report_login()
1981                         login_results = urllib2.urlopen(request).read()
1982                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1983                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1984                                 return
1985                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1986                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
1987                         return
1988
1989         def _real_extract(self, url):
1990                 mobj = re.match(self._VALID_URL, url)
1991                 if mobj is None:
1992                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1993                         return
1994                 video_id = mobj.group('ID')
1995
1996                 # Get video webpage
1997                 self.report_video_webpage_download(video_id)
1998                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
1999                 try:
2000                         page = urllib2.urlopen(request)
2001                         video_webpage = page.read()
2002                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2003                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2004                         return
2005
2006                 # Start extracting information
2007                 self.report_information_extraction(video_id)
2008
2009                 # Extract information
2010                 video_info = self._parse_page(video_webpage)
2011
2012                 # uploader
2013                 if 'owner' not in video_info:
2014                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2015                         return
2016                 video_uploader = video_info['owner']
2017
2018                 # title
2019                 if 'title' not in video_info:
2020                         self._downloader.trouble(u'ERROR: unable to extract video title')
2021                         return
2022                 video_title = video_info['title']
2023                 video_title = video_title.decode('utf-8')
2024
2025                 # thumbnail image
2026                 if 'thumbnail' not in video_info:
2027                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2028                         video_thumbnail = ''
2029                 else:
2030                         video_thumbnail = video_info['thumbnail']
2031
2032                 # upload date
2033                 upload_date = u'NA'
2034                 if 'upload_date' in video_info:
2035                         upload_time = video_info['upload_date']
2036                         timetuple = email.utils.parsedate_tz(upload_time)
2037                         if timetuple is not None:
2038                                 try:
2039                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2040                                 except:
2041                                         pass
2042
2043                 # description
2044                 video_description = video_info.get('description', 'No description available.')
2045
2046                 url_map = video_info['video_urls']
2047                 if len(url_map.keys()) > 0:
2048                         # Decide which formats to download
2049                         req_format = self._downloader.params.get('format', None)
2050                         format_limit = self._downloader.params.get('format_limit', None)
2051
2052                         if format_limit is not None and format_limit in self._available_formats:
2053                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2054                         else:
2055                                 format_list = self._available_formats
2056                         existing_formats = [x for x in format_list if x in url_map]
2057                         if len(existing_formats) == 0:
2058                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2059                                 return
2060                         if req_format is None:
2061                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2062                         elif req_format == 'worst':
2063                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2064                         elif req_format == '-1':
2065                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2066                         else:
2067                                 # Specific format
2068                                 if req_format not in url_map:
2069                                         self._downloader.trouble(u'ERROR: requested format not available')
2070                                         return
2071                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2072
2073                 results = []
2074                 for format_param, video_real_url in video_url_list:
2075                         # Extension
2076                         video_extension = self._video_extensions.get(format_param, 'mp4')
2077
2078                         results.append({
2079                                 'id':           video_id.decode('utf-8'),
2080                                 'url':          video_real_url.decode('utf-8'),
2081                                 'uploader':     video_uploader.decode('utf-8'),
2082                                 'upload_date':  upload_date,
2083                                 'title':        video_title,
2084                                 'ext':          video_extension.decode('utf-8'),
2085                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2086                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2087                                 'description':  video_description.decode('utf-8'),
2088                         })
2089                 return results
2090
class BlipTVIE(InfoExtractor):
	"""Information extractor for blip.tv"""

	# Any blip.tv path; group(1) captures the path for progress reporting.
	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
	# Extracts the filename extension from a media URL.
	_URL_EXT = r'^.*\.([a-z0-9]+)$'
	IE_NAME = u'blip.tv'

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

	def report_direct_download(self, title):
		"""Report that a direct download was detected."""
		self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

	def _real_extract(self, url):
		"""Return a one-element list with the video info dict for a blip.tv URL.

		Two cases are handled: the JSON API URL may respond directly with
		the media ('video/*' Content-Type), or with JSON metadata that is
		parsed for the real media URL.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Append the JSON-API query string with '?' or '&' as appropriate.
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url.encode('utf-8'))
		self.report_extraction(mobj.group(1))
		info = None
		try:
			urlh = urllib2.urlopen(request)
			if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
				# The server returned the media itself; derive title and
				# extension from the last path component of the URL.
				basename = url.split('/')[-1]
				title,ext = os.path.splitext(basename)
				title = title.decode('UTF-8')
				ext = ext.replace('.', '')
				self.report_direct_download(title)
				info = {
					'id': title,
					'url': url,
					'title': title,
					'ext': ext,
					# The open handle is passed along so the downloader can
					# reuse this response instead of re-requesting.
					'urlhandle': urlh
				}
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
			return
		if info is None: # Regular URL
			# urlh is still open here: the try above only returns on error,
			# so in this branch it holds the unread JSON response.
			try:
				json_code = urlh.read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
				return

			try:
				# NOTE(review): `json` is not imported at the top of this
				# chunk - presumably provided via `from utils import *`;
				# verify against the utils module.
				json_data = json.loads(json_code)
				# Some responses wrap the payload in a 'Post' object.
				if 'Post' in json_data:
					data = json_data['Post']
				else:
					data = json_data

				upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
				video_url = data['media']['url']
				umobj = re.match(self._URL_EXT, video_url)
				if umobj is None:
					raise ValueError('Can not determine filename extension')
				ext = umobj.group(1)

				info = {
					'id': data['item_id'],
					'url': video_url,
					'uploader': data['display_name'],
					'upload_date': upload_date,
					'title': data['title'],
					'ext': ext,
					'format': data['media']['mimeType'],
					'thumbnail': data['thumbnailUrl'],
					'description': data['description'],
					'player_url': data['embedUrl']
				}
			except (ValueError,KeyError), err:
				self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
				return

		# Global hack: spoof the iTunes User-Agent for the subsequent media
		# download - presumably blip.tv serves different content otherwise.
		std_headers['User-Agent'] = 'iTunes/10.6.1'
		return [info]
2177
2178
2179 class MyVideoIE(InfoExtractor):
2180         """Information Extractor for myvideo.de."""
2181
2182         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2183         IE_NAME = u'myvideo'
2184
2185         def __init__(self, downloader=None):
2186                 InfoExtractor.__init__(self, downloader)
2187         
2188         def report_download_webpage(self, video_id):
2189                 """Report webpage download."""
2190                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2191
2192         def report_extraction(self, video_id):
2193                 """Report information extraction."""
2194                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2195
2196         def _real_extract(self,url):
2197                 mobj = re.match(self._VALID_URL, url)
2198                 if mobj is None:
2199                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2200                         return
2201
2202                 video_id = mobj.group(1)
2203
2204                 # Get video webpage
2205                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2206                 try:
2207                         self.report_download_webpage(video_id)
2208                         webpage = urllib2.urlopen(request).read()
2209                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2210                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
2211                         return
2212
2213                 self.report_extraction(video_id)
2214                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2215                                  webpage)
2216                 if mobj is None:
2217                         self._downloader.trouble(u'ERROR: unable to extract media URL')
2218                         return
2219                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2220
2221                 mobj = re.search('<title>([^<]+)</title>', webpage)
2222                 if mobj is None:
2223                         self._downloader.trouble(u'ERROR: unable to extract title')
2224                         return
2225
2226                 video_title = mobj.group(1)
2227
2228                 return [{
2229                         'id':           video_id,
2230                         'url':          video_url,
2231                         'uploader':     u'NA',
2232                         'upload_date':  u'NA',
2233                         'title':        video_title,
2234                         'ext':          u'flv',
2235                 }]
2236
2237 class ComedyCentralIE(InfoExtractor):
2238         """Information extractor for The Daily Show and Colbert Report """
2239
2240         _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2241         IE_NAME = u'comedycentral'
2242
2243         _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2244
2245         _video_extensions = {
2246                 '3500': 'mp4',
2247                 '2200': 'mp4',
2248                 '1700': 'mp4',
2249                 '1200': 'mp4',
2250                 '750': 'mp4',
2251                 '400': 'mp4',
2252         }
2253         _video_dimensions = {
2254                 '3500': '1280x720',
2255                 '2200': '960x540',
2256                 '1700': '768x432',
2257                 '1200': '640x360',
2258                 '750': '512x288',
2259                 '400': '384x216',
2260         }
2261
	def report_extraction(self, episode_id):
		"""Report start of information extraction for an episode."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2264
	def report_config_download(self, episode_id):
		"""Report download of the media configuration for an episode."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2267
	def report_index_download(self, episode_id):
		"""Report download of the show index for an episode."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2270
	def report_player_url(self, episode_id):
		"""Report resolution of the SWF player URL for an episode."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2273
2274
2275         def _print_formats(self, formats):
2276                 print('Available formats:')
2277                 for x in formats:
2278                         print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2279
2280
2281         def _real_extract(self, url):
2282                 mobj = re.match(self._VALID_URL, url)
2283                 if mobj is None:
2284                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2285                         return
2286
2287                 if mobj.group('shortname'):
2288                         if mobj.group('shortname') in ('tds', 'thedailyshow'):
2289                                 url = u'http://www.thedailyshow.com/full-episodes/'
2290                         else:
2291                                 url = u'http://www.colbertnation.com/full-episodes/'
2292                         mobj = re.match(self._VALID_URL, url)
2293                         assert mobj is not None
2294
2295                 dlNewest = not mobj.group('episode')
2296                 if dlNewest:
2297                         epTitle = mobj.group('showname')
2298                 else:
2299                         epTitle = mobj.group('episode')
2300
2301                 req = urllib2.Request(url)
2302                 self.report_extraction(epTitle)
2303                 try:
2304                         htmlHandle = urllib2.urlopen(req)
2305                         html = htmlHandle.read()
2306                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2307                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2308                         return
2309                 if dlNewest:
2310                         url = htmlHandle.geturl()
2311                         mobj = re.match(self._VALID_URL, url)
2312                         if mobj is None:
2313                                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2314                                 return
2315                         if mobj.group('episode') == '':
2316                                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2317                                 return
2318                         epTitle = mobj.group('episode')
2319
2320                 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2321
2322                 if len(mMovieParams) == 0:
2323                         # The Colbert Report embeds the information in a without
2324                         # a URL prefix; so extract the alternate reference
2325                         # and then add the URL prefix manually.
2326
2327                         altMovieParams = re.findall('data-mgid="([^"]*episode.*?:.*?)"', html)
2328                         if len(altMovieParams) == 0:
2329                                 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2330                                 return
2331                         else:
2332                                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2333                 
2334                 playerUrl_raw = mMovieParams[0][0]
2335                 self.report_player_url(epTitle)
2336                 try:
2337                         urlHandle = urllib2.urlopen(playerUrl_raw)
2338                         playerUrl = urlHandle.geturl()
2339                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2340                         self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
2341                         return
2342
2343                 uri = mMovieParams[0][1]
2344                 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2345                 self.report_index_download(epTitle)
2346                 try:
2347                         indexXml = urllib2.urlopen(indexUrl).read()
2348                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2349                         self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2350                         return
2351
2352                 results = []
2353
2354                 idoc = xml.etree.ElementTree.fromstring(indexXml)
2355                 itemEls = idoc.findall('.//item')
2356                 for itemEl in itemEls:
2357                         mediaId = itemEl.findall('./guid')[0].text
2358                         shortMediaId = mediaId.split(':')[-1]
2359                         showId = mediaId.split(':')[-2].replace('.com', '')
2360                         officialTitle = itemEl.findall('./title')[0].text
2361                         officialDate = itemEl.findall('./pubDate')[0].text
2362
2363                         configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2364                                                 urllib.urlencode({'uri': mediaId}))
2365                         configReq = urllib2.Request(configUrl)
2366                         self.report_config_download(epTitle)
2367                         try:
2368                                 configXml = urllib2.urlopen(configReq).read()
2369                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2370                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2371                                 return
2372
2373                         cdoc = xml.etree.ElementTree.fromstring(configXml)
2374                         turls = []
2375                         for rendition in cdoc.findall('.//rendition'):
2376                                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2377                                 turls.append(finfo)
2378
2379                         if len(turls) == 0:
2380                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2381                                 continue
2382                         
2383                         if self._downloader.params.get('listformats', None):
2384                                 self._print_formats([i[0] for i in turls])
2385                                 return
2386
2387                         # For now, just pick the highest bitrate
2388                         format,video_url = turls[-1]
2389
2390                         # Get the format arg from the arg stream
2391                         req_format = self._downloader.params.get('format', None)
2392
2393                         # Select format if we can find one
2394                         for f,v in turls:
2395                                 if f == req_format:
2396                                         format, video_url = f, v
2397                                         break
2398
2399                         # Patch to download from alternative CDN, which does not
2400                         # break on current RTMPDump builds
2401                         broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
2402                         better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
2403
2404                         if video_url.startswith(broken_cdn):
2405                                 video_url = video_url.replace(broken_cdn, better_cdn)
2406
2407                         effTitle = showId + u'-' + epTitle
2408                         info = {
2409                                 'id': shortMediaId,
2410                                 'url': video_url,
2411                                 'uploader': showId,
2412                                 'upload_date': officialDate,
2413                                 'title': effTitle,
2414                                 'ext': 'mp4',
2415                                 'format': format,
2416                                 'thumbnail': None,
2417                                 'description': officialTitle,
2418                                 'player_url': None #playerUrl
2419                         }
2420
2421                         results.append(info)
2422                         
2423                 return results
2424
2425
2426 class EscapistIE(InfoExtractor):
2427         """Information extractor for The Escapist """
2428
2429         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2430         IE_NAME = u'escapist'
2431
2432         def report_extraction(self, showName):
2433                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2434
2435         def report_config_download(self, showName):
2436                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2437
2438         def _real_extract(self, url):
2439                 mobj = re.match(self._VALID_URL, url)
2440                 if mobj is None:
2441                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2442                         return
2443                 showName = mobj.group('showname')
2444                 videoId = mobj.group('episode')
2445
2446                 self.report_extraction(showName)
2447                 try:
2448                         webPage = urllib2.urlopen(url)
2449                         webPageBytes = webPage.read()
2450                         m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2451                         webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2452                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2453                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2454                         return
2455
2456                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2457                 description = unescapeHTML(descMatch.group(1))
2458                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2459                 imgUrl = unescapeHTML(imgMatch.group(1))
2460                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2461                 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2462                 configUrlMatch = re.search('config=(.*)$', playerUrl)
2463                 configUrl = urllib2.unquote(configUrlMatch.group(1))
2464
2465                 self.report_config_download(showName)
2466                 try:
2467                         configJSON = urllib2.urlopen(configUrl).read()
2468                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2469                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2470                         return
2471
2472                 # Technically, it's JavaScript, not JSON
2473                 configJSON = configJSON.replace("'", '"')
2474
2475                 try:
2476                         config = json.loads(configJSON)
2477                 except (ValueError,), err:
2478                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2479                         return
2480
2481                 playlist = config['playlist']
2482                 videoUrl = playlist[1]['url']
2483
2484                 info = {
2485                         'id': videoId,
2486                         'url': videoUrl,
2487                         'uploader': showName,
2488                         'upload_date': None,
2489                         'title': showName,
2490                         'ext': 'flv',
2491                         'thumbnail': imgUrl,
2492                         'description': description,
2493                         'player_url': playerUrl,
2494                 }
2495
2496                 return [info]
2497
2498
2499 class CollegeHumorIE(InfoExtractor):
2500         """Information extractor for collegehumor.com"""
2501
2502         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2503         IE_NAME = u'collegehumor'
2504
2505         def report_webpage(self, video_id):
2506                 """Report information extraction."""
2507                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2508
2509         def report_extraction(self, video_id):
2510                 """Report information extraction."""
2511                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2512
2513         def _real_extract(self, url):
2514                 mobj = re.match(self._VALID_URL, url)
2515                 if mobj is None:
2516                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2517                         return
2518                 video_id = mobj.group('videoid')
2519
2520                 self.report_webpage(video_id)
2521                 request = urllib2.Request(url)
2522                 try:
2523                         webpage = urllib2.urlopen(request).read()
2524                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2525                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2526                         return
2527
2528                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2529                 if m is None:
2530                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2531                         return
2532                 internal_video_id = m.group('internalvideoid')
2533
2534                 info = {
2535                         'id': video_id,
2536                         'internal_id': internal_video_id,
2537                 }
2538
2539                 self.report_extraction(video_id)
2540                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2541                 try:
2542                         metaXml = urllib2.urlopen(xmlUrl).read()
2543                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2544                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2545                         return
2546
2547                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2548                 try:
2549                         videoNode = mdoc.findall('./video')[0]
2550                         info['description'] = videoNode.findall('./description')[0].text
2551                         info['title'] = videoNode.findall('./caption')[0].text
2552                         info['url'] = videoNode.findall('./file')[0].text
2553                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2554                         info['ext'] = info['url'].rpartition('.')[2]
2555                 except IndexError:
2556                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2557                         return
2558
2559                 return [info]
2560
2561
2562 class XVideosIE(InfoExtractor):
2563         """Information extractor for xvideos.com"""
2564
2565         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2566         IE_NAME = u'xvideos'
2567
2568         def report_webpage(self, video_id):
2569                 """Report information extraction."""
2570                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2571
2572         def report_extraction(self, video_id):
2573                 """Report information extraction."""
2574                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2575
2576         def _real_extract(self, url):
2577                 mobj = re.match(self._VALID_URL, url)
2578                 if mobj is None:
2579                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2580                         return
2581                 video_id = mobj.group(1).decode('utf-8')
2582
2583                 self.report_webpage(video_id)
2584
2585                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2586                 try:
2587                         webpage = urllib2.urlopen(request).read()
2588                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2589                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2590                         return
2591
2592                 self.report_extraction(video_id)
2593
2594
2595                 # Extract video URL
2596                 mobj = re.search(r'flv_url=(.+?)&', webpage)
2597                 if mobj is None:
2598                         self._downloader.trouble(u'ERROR: unable to extract video url')
2599                         return
2600                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2601
2602
2603                 # Extract title
2604                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2605                 if mobj is None:
2606                         self._downloader.trouble(u'ERROR: unable to extract video title')
2607                         return
2608                 video_title = mobj.group(1).decode('utf-8')
2609
2610
2611                 # Extract video thumbnail
2612                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2613                 if mobj is None:
2614                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2615                         return
2616                 video_thumbnail = mobj.group(0).decode('utf-8')
2617
2618                 info = {
2619                         'id': video_id,
2620                         'url': video_url,
2621                         'uploader': None,
2622                         'upload_date': None,
2623                         'title': video_title,
2624                         'ext': 'flv',
2625                         'thumbnail': video_thumbnail,
2626                         'description': None,
2627                 }
2628
2629                 return [info]
2630
2631
2632 class SoundcloudIE(InfoExtractor):
2633         """Information extractor for soundcloud.com
2634            To access the media, the uid of the song and a stream token
2635            must be extracted from the page source and the script must make
2636            a request to media.soundcloud.com/crossdomain.xml. Then
2637            the media can be grabbed by requesting from an url composed
2638            of the stream token and uid
2639          """
2640
2641         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2642         IE_NAME = u'soundcloud'
2643
2644         def __init__(self, downloader=None):
2645                 InfoExtractor.__init__(self, downloader)
2646
2647         def report_webpage(self, video_id):
2648                 """Report information extraction."""
2649                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2650
2651         def report_extraction(self, video_id):
2652                 """Report information extraction."""
2653                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2654
2655         def _real_extract(self, url):
2656                 mobj = re.match(self._VALID_URL, url)
2657                 if mobj is None:
2658                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2659                         return
2660
2661                 # extract uploader (which is in the url)
2662                 uploader = mobj.group(1).decode('utf-8')
2663                 # extract simple title (uploader + slug of song title)
2664                 slug_title =  mobj.group(2).decode('utf-8')
2665                 simple_title = uploader + u'-' + slug_title
2666
2667                 self.report_webpage('%s/%s' % (uploader, slug_title))
2668
2669                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2670                 try:
2671                         webpage = urllib2.urlopen(request).read()
2672                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2673                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2674                         return
2675
2676                 self.report_extraction('%s/%s' % (uploader, slug_title))
2677
2678                 # extract uid and stream token that soundcloud hands out for access
2679                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2680                 if mobj:
2681                         video_id = mobj.group(1)
2682                         stream_token = mobj.group(2)
2683
2684                 # extract unsimplified title
2685                 mobj = re.search('"title":"(.*?)",', webpage)
2686                 if mobj:
2687                         title = mobj.group(1).decode('utf-8')
2688                 else:
2689                         title = simple_title
2690
2691                 # construct media url (with uid/token)
2692                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2693                 mediaURL = mediaURL % (video_id, stream_token)
2694
2695                 # description
2696                 description = u'No description available'
2697                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2698                 if mobj:
2699                         description = mobj.group(1)
2700
2701                 # upload date
2702                 upload_date = None
2703                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2704                 if mobj:
2705                         try:
2706                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2707                         except Exception, e:
2708                                 self._downloader.to_stderr(compat_str(e))
2709
2710                 # for soundcloud, a request to a cross domain is required for cookies
2711                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2712
2713                 return [{
2714                         'id':           video_id.decode('utf-8'),
2715                         'url':          mediaURL,
2716                         'uploader':     uploader.decode('utf-8'),
2717                         'upload_date':  upload_date,
2718                         'title':        title,
2719                         'ext':          u'mp3',
2720                         'description': description.decode('utf-8')
2721                 }]
2722
2723
2724 class InfoQIE(InfoExtractor):
2725         """Information extractor for infoq.com"""
2726
2727         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2728         IE_NAME = u'infoq'
2729
2730         def report_webpage(self, video_id):
2731                 """Report information extraction."""
2732                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2733
2734         def report_extraction(self, video_id):
2735                 """Report information extraction."""
2736                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2737
2738         def _real_extract(self, url):
2739                 mobj = re.match(self._VALID_URL, url)
2740                 if mobj is None:
2741                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2742                         return
2743
2744                 self.report_webpage(url)
2745
2746                 request = urllib2.Request(url)
2747                 try:
2748                         webpage = urllib2.urlopen(request).read()
2749                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2750                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2751                         return
2752
2753                 self.report_extraction(url)
2754
2755
2756                 # Extract video URL
2757                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2758                 if mobj is None:
2759                         self._downloader.trouble(u'ERROR: unable to extract video url')
2760                         return
2761                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2762
2763
2764                 # Extract title
2765                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2766                 if mobj is None:
2767                         self._downloader.trouble(u'ERROR: unable to extract video title')
2768                         return
2769                 video_title = mobj.group(1).decode('utf-8')
2770
2771                 # Extract description
2772                 video_description = u'No description available.'
2773                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2774                 if mobj is not None:
2775                         video_description = mobj.group(1).decode('utf-8')
2776
2777                 video_filename = video_url.split('/')[-1]
2778                 video_id, extension = video_filename.split('.')
2779
2780                 info = {
2781                         'id': video_id,
2782                         'url': video_url,
2783                         'uploader': None,
2784                         'upload_date': None,
2785                         'title': video_title,
2786                         'ext': extension, # Extension is always(?) mp4, but seems to be flv
2787                         'thumbnail': None,
2788                         'description': video_description,
2789                 }
2790
2791                 return [info]
2792
2793 class MixcloudIE(InfoExtractor):
2794         """Information extractor for www.mixcloud.com"""
2795         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2796         IE_NAME = u'mixcloud'
2797
2798         def __init__(self, downloader=None):
2799                 InfoExtractor.__init__(self, downloader)
2800
2801         def report_download_json(self, file_id):
2802                 """Report JSON download."""
2803                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2804
2805         def report_extraction(self, file_id):
2806                 """Report information extraction."""
2807                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2808
2809         def get_urls(self, jsonData, fmt, bitrate='best'):
2810                 """Get urls from 'audio_formats' section in json"""
2811                 file_url = None
2812                 try:
2813                         bitrate_list = jsonData[fmt]
2814                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2815                                 bitrate = max(bitrate_list) # select highest
2816
2817                         url_list = jsonData[fmt][bitrate]
2818                 except TypeError: # we have no bitrate info.
2819                         url_list = jsonData[fmt]
2820                 return url_list
2821
2822         def check_urls(self, url_list):
2823                 """Returns 1st active url from list"""
2824                 for url in url_list:
2825                         try:
2826                                 urllib2.urlopen(url)
2827                                 return url
2828                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2829                                 url = None
2830
2831                 return None
2832
2833         def _print_formats(self, formats):
2834                 print('Available formats:')
2835                 for fmt in formats.keys():
2836                         for b in formats[fmt]:
2837                                 try:
2838                                         ext = formats[fmt][b][0]
2839                                         print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2840                                 except TypeError: # we have no bitrate info
2841                                         ext = formats[fmt][0]
2842                                         print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2843                                         break
2844
2845         def _real_extract(self, url):
2846                 mobj = re.match(self._VALID_URL, url)
2847                 if mobj is None:
2848                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2849                         return
2850                 # extract uploader & filename from url
2851                 uploader = mobj.group(1).decode('utf-8')
2852                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2853
2854                 # construct API request
2855                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2856                 # retrieve .json file with links to files
2857                 request = urllib2.Request(file_url)
2858                 try:
2859                         self.report_download_json(file_url)
2860                         jsonData = urllib2.urlopen(request).read()
2861                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2862                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2863                         return
2864
2865                 # parse JSON
2866                 json_data = json.loads(jsonData)
2867                 player_url = json_data['player_swf_url']
2868                 formats = dict(json_data['audio_formats'])
2869
2870                 req_format = self._downloader.params.get('format', None)
2871                 bitrate = None
2872
2873                 if self._downloader.params.get('listformats', None):
2874                         self._print_formats(formats)
2875                         return
2876
2877                 if req_format is None or req_format == 'best':
2878                         for format_param in formats.keys():
2879                                 url_list = self.get_urls(formats, format_param)
2880                                 # check urls
2881                                 file_url = self.check_urls(url_list)
2882                                 if file_url is not None:
2883                                         break # got it!
2884                 else:
2885                         if req_format not in formats.keys():
2886                                 self._downloader.trouble(u'ERROR: format is not available')
2887                                 return
2888
2889                         url_list = self.get_urls(formats, req_format)
2890                         file_url = self.check_urls(url_list)
2891                         format_param = req_format
2892
2893                 return [{
2894                         'id': file_id.decode('utf-8'),
2895                         'url': file_url.decode('utf-8'),
2896                         'uploader':     uploader.decode('utf-8'),
2897                         'upload_date': u'NA',
2898                         'title': json_data['name'],
2899                         'ext': file_url.split('.')[-1].decode('utf-8'),
2900                         'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2901                         'thumbnail': json_data['thumbnail_url'],
2902                         'description': json_data['description'],
2903                         'player_url': player_url.decode('utf-8'),
2904                 }]
2905
2906 class StanfordOpenClassroomIE(InfoExtractor):
2907         """Information extractor for Stanford's Open ClassRoom"""
2908
2909         _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2910         IE_NAME = u'stanfordoc'
2911
2912         def report_download_webpage(self, objid):
2913                 """Report information extraction."""
2914                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2915
2916         def report_extraction(self, video_id):
2917                 """Report information extraction."""
2918                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2919
2920         def _real_extract(self, url):
2921                 mobj = re.match(self._VALID_URL, url)
2922                 if mobj is None:
2923                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2924                         return
2925
2926                 if mobj.group('course') and mobj.group('video'): # A specific video
2927                         course = mobj.group('course')
2928                         video = mobj.group('video')
2929                         info = {
2930                                 'id': course + '_' + video,
2931                         }
2932
2933                         self.report_extraction(info['id'])
2934                         baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2935                         xmlUrl = baseUrl + video + '.xml'
2936                         try:
2937                                 metaXml = urllib2.urlopen(xmlUrl).read()
2938                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2939                                 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2940                                 return
2941                         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2942                         try:
2943                                 info['title'] = mdoc.findall('./title')[0].text
2944                                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2945                         except IndexError:
2946                                 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2947                                 return
2948                         info['ext'] = info['url'].rpartition('.')[2]
2949                         return [info]
2950                 elif mobj.group('course'): # A course page
2951                         course = mobj.group('course')
2952                         info = {
2953                                 'id': course,
2954                                 'type': 'playlist',
2955                         }
2956
2957                         self.report_download_webpage(info['id'])
2958                         try:
2959                                 coursepage = urllib2.urlopen(url).read()
2960                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2961                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2962                                 return
2963
2964                         m = re.search('<h1>([^<]+)</h1>', coursepage)
2965                         if m:
2966                                 info['title'] = unescapeHTML(m.group(1))
2967                         else:
2968                                 info['title'] = info['id']
2969
2970                         m = re.search('<description>([^<]+)</description>', coursepage)
2971                         if m:
2972                                 info['description'] = unescapeHTML(m.group(1))
2973
2974                         links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2975                         info['list'] = [
2976                                 {
2977                                         'type': 'reference',
2978                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2979                                 }
2980                                         for vpage in links]
2981                         results = []
2982                         for entry in info['list']:
2983                                 assert entry['type'] == 'reference'
2984                                 results += self.extract(entry['url'])
2985                         return results
2986                         
2987                 else: # Root page
2988                         info = {
2989                                 'id': 'Stanford OpenClassroom',
2990                                 'type': 'playlist',
2991                         }
2992
2993                         self.report_download_webpage(info['id'])
2994                         rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2995                         try:
2996                                 rootpage = urllib2.urlopen(rootURL).read()
2997                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2998                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2999                                 return
3000
3001                         info['title'] = info['id']
3002
3003                         links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3004                         info['list'] = [
3005                                 {
3006                                         'type': 'reference',
3007                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3008                                 }
3009                                         for cpage in links]
3010
3011                         results = []
3012                         for entry in info['list']:
3013                                 assert entry['type'] == 'reference'
3014                                 results += self.extract(entry['url'])
3015                         return results
3016
3017 class MTVIE(InfoExtractor):
3018         """Information extractor for MTV.com"""
3019
3020         _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3021         IE_NAME = u'mtv'
3022
3023         def report_webpage(self, video_id):
3024                 """Report information extraction."""
3025                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3026
3027         def report_extraction(self, video_id):
3028                 """Report information extraction."""
3029                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3030
3031         def _real_extract(self, url):
3032                 mobj = re.match(self._VALID_URL, url)
3033                 if mobj is None:
3034                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3035                         return
3036                 if not mobj.group('proto'):
3037                         url = 'http://' + url
3038                 video_id = mobj.group('videoid')
3039                 self.report_webpage(video_id)
3040
3041                 request = urllib2.Request(url)
3042                 try:
3043                         webpage = urllib2.urlopen(request).read()
3044                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3045                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
3046                         return
3047
3048                 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3049                 if mobj is None:
3050                         self._downloader.trouble(u'ERROR: unable to extract song name')
3051                         return
3052                 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3053                 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3054                 if mobj is None:
3055                         self._downloader.trouble(u'ERROR: unable to extract performer')
3056                         return
3057                 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3058                 video_title = performer + ' - ' + song_name 
3059
3060                 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3061                 if mobj is None:
3062                         self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3063                         return
3064                 mtvn_uri = mobj.group(1)
3065
3066                 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3067                 if mobj is None:
3068                         self._downloader.trouble(u'ERROR: unable to extract content id')
3069                         return
3070                 content_id = mobj.group(1)
3071
3072                 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3073                 self.report_extraction(video_id)
3074                 request = urllib2.Request(videogen_url)
3075                 try:
3076                         metadataXml = urllib2.urlopen(request).read()
3077                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3078                         self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3079                         return
3080
3081                 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3082                 renditions = mdoc.findall('.//rendition')
3083
3084                 # For now, always pick the highest quality.
3085                 rendition = renditions[-1]
3086
3087                 try:
3088                         _,_,ext = rendition.attrib['type'].partition('/')
3089                         format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3090                         video_url = rendition.find('./src').text
3091                 except KeyError:
3092                         self._downloader.trouble('Invalid rendition field.')
3093                         return
3094
3095                 info = {
3096                         'id': video_id,
3097                         'url': video_url,
3098                         'uploader': performer,
3099                         'title': video_title,
3100                         'ext': ext,
3101                         'format': format,
3102                 }
3103
3104                 return [info]
3105
3106
class YoukuIE(InfoExtractor):
	"""Information extractor for v.youku.com.

	Fetches the getPlayList JSON for a video id, de-obfuscates the file id
	with the per-video 'seed', and returns one info dict per flv segment.
	NOTE(review): Youku videos could only be viewed from mainland China;
	error responses are not distinguished from parse failures here.
	"""

	_VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
	IE_NAME = u'Youku'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, file_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

	def _gen_sid(self):
		"""Generate a pseudo-random session id for the getFlvPath request."""
		nowTime = int(time.time() * 1000)
		random1 = random.randint(1000,1998)
		random2 = random.randint(1000,9999)

		return "%d%d%d" %(nowTime,random1,random2)

	def _get_file_ID_mix_string(self, seed):
		"""Derive the character permutation table from the video's seed.

		Implements Youku's LCG-based shuffle of a fixed alphabet; the
		result is the lookup table used by _get_file_id.
		"""
		mixed = []
		source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
		seed = float(seed)
		for i in range(len(source)):
			# Linear congruential step fixed by Youku's player code.
			seed  =  (seed * 211 + 30031 ) % 65536
			index  =  math.floor(seed / 65536 * len(source) )
			mixed.append(source[int(index)])
			source.remove(source[int(index)])
		return mixed

	def _get_file_id(self, fileId, seed):
		"""Decode the '*'-separated obfuscated file id using the seed table."""
		mixed = self._get_file_ID_mix_string(seed)
		ids = fileId.split('*')
		realId = []
		for ch in ids:
			if ch:
				realId.append(mixed[int(ch)])
		return ''.join(realId)

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group('ID')

		info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

		request = urllib2.Request(info_url, None, std_headers)
		try:
			self.report_download_webpage(video_id)
			jsondata = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
			return

		self.report_extraction(video_id)
		try:
			config = json.loads(jsondata)

			video_title =  config['data'][0]['title']
			seed = config['data'][0]['seed']

			format = self._downloader.params.get('format', None)
			supported_format = config['data'][0]['streamfileids'].keys()

			if format is None or format == 'best':
				if 'hd2' in supported_format:
					format = 'hd2'
				else:
					format = 'flv'
				ext = u'flv'
			elif format == 'worst':
				format = 'mp4'
				ext = u'mp4'
			else:
				# Any other requested format falls back to flv.
				format = 'flv'
				ext = u'flv'

			fileid = config['data'][0]['streamfileids'][format]
			# One download key per segment of the chosen format.
			keys = [seg['k'] for seg in config['data'][0]['segs'][format]]

			#TODO check error responses
			#youku only could be viewed from mainland china
		# BUGFIX: was a bare `except:`, which also swallowed
		# KeyboardInterrupt/SystemExit; json.loads raises ValueError,
		# the lookups raise KeyError/IndexError/TypeError.
		except (ValueError, KeyError, IndexError, TypeError):
			self._downloader.trouble(u'ERROR: unable to extract info section')
			return

		files_info=[]
		sid = self._gen_sid()
		fileid = self._get_file_id(fileid, seed)

		#column 8,9 of fileid represent the segment number
		#fileid[7:9] should be changed
		for index, key in enumerate(keys):

			temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
			download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

			info = {
				'id': '%s_part%02d' % (video_id, index),
				'url': download_url,
				'uploader': None,
				'title': video_title,
				'ext': ext,
			}
			files_info.append(info)

		return files_info
3226
3227
3228 class XNXXIE(InfoExtractor):
3229         """Information extractor for xnxx.com"""
3230
3231         _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
3232         IE_NAME = u'xnxx'
3233         VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3234         VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3235         VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3236
3237         def report_webpage(self, video_id):
3238                 """Report information extraction"""
3239                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3240
3241         def report_extraction(self, video_id):
3242                 """Report information extraction"""
3243                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3244
3245         def _real_extract(self, url):
3246                 mobj = re.match(self._VALID_URL, url)
3247                 if mobj is None:
3248                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3249                         return
3250                 video_id = mobj.group(1).decode('utf-8')
3251
3252                 self.report_webpage(video_id)
3253
3254                 # Get webpage content
3255                 try:
3256                         webpage = urllib2.urlopen(url).read()
3257                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3258                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3259                         return
3260
3261                 result = re.search(self.VIDEO_URL_RE, webpage)
3262                 if result is None:
3263                         self._downloader.trouble(u'ERROR: unable to extract video url')
3264                         return
3265                 video_url = urllib.unquote(result.group(1).decode('utf-8'))
3266
3267                 result = re.search(self.VIDEO_TITLE_RE, webpage)
3268                 if result is None:
3269                         self._downloader.trouble(u'ERROR: unable to extract video title')
3270                         return
3271                 video_title = result.group(1).decode('utf-8')
3272
3273                 result = re.search(self.VIDEO_THUMB_RE, webpage)
3274                 if result is None:
3275                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3276                         return
3277                 video_thumbnail = result.group(1).decode('utf-8')
3278
3279                 return [{
3280                         'id': video_id,
3281                         'url': video_url,
3282                         'uploader': None,
3283                         'upload_date': None,
3284                         'title': video_title,
3285                         'ext': 'flv',
3286                         'thumbnail': video_thumbnail,
3287                         'description': None,
3288                 }]
3289
3290
3291 class GooglePlusIE(InfoExtractor):
3292         """Information extractor for plus.google.com."""
3293
3294         _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
3295         IE_NAME = u'plus.google'
3296
3297         def __init__(self, downloader=None):
3298                 InfoExtractor.__init__(self, downloader)
3299
3300         def report_extract_entry(self, url):
3301                 """Report downloading extry"""
3302                 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))
3303
3304         def report_date(self, upload_date):
3305                 """Report downloading extry"""
3306                 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3307
3308         def report_uploader(self, uploader):
3309                 """Report downloading extry"""
3310                 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))
3311
3312         def report_title(self, video_title):
3313                 """Report downloading extry"""
3314                 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))
3315
3316         def report_extract_vid_page(self, video_page):
3317                 """Report information extraction."""
3318                 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
3319
3320         def _real_extract(self, url):
3321                 # Extract id from URL
3322                 mobj = re.match(self._VALID_URL, url)
3323                 if mobj is None:
3324                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3325                         return
3326
3327                 post_url = mobj.group(0)
3328                 video_id = mobj.group(2)
3329
3330                 video_extension = 'flv'
3331
3332                 # Step 1, Retrieve post webpage to extract further information
3333                 self.report_extract_entry(post_url)
3334                 request = urllib2.Request(post_url)
3335                 try:
3336                         webpage = urllib2.urlopen(request).read()
3337                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3338                         self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3339                         return
3340
3341                 # Extract update date
3342                 upload_date = u'NA'
3343                 pattern = 'title="Timestamp">(.*?)</a>'
3344                 mobj = re.search(pattern, webpage)
3345                 if mobj:
3346                         upload_date = mobj.group(1)
3347                         # Convert timestring to a format suitable for filename
3348                         upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3349                         upload_date = upload_date.strftime('%Y%m%d')
3350                 self.report_date(upload_date)
3351
3352                 # Extract uploader
3353                 uploader = u'NA'
3354                 pattern = r'rel\="author".*?>(.*?)</a>'
3355                 mobj = re.search(pattern, webpage)
3356                 if mobj:
3357                         uploader = mobj.group(1)
3358                 self.report_uploader(uploader)
3359
3360                 # Extract title
3361                 # Get the first line for title
3362                 video_title = u'NA'
3363                 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3364                 mobj = re.search(pattern, webpage)
3365                 if mobj:
3366                         video_title = mobj.group(1)
3367                 self.report_title(video_title)
3368
3369                 # Step 2, Stimulate clicking the image box to launch video
3370                 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3371                 mobj = re.search(pattern, webpage)
3372                 if mobj is None:
3373                         self._downloader.trouble(u'ERROR: unable to extract video page URL')
3374
3375                 video_page = mobj.group(1)
3376                 request = urllib2.Request(video_page)
3377                 try:
3378                         webpage = urllib2.urlopen(request).read()
3379                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3380                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3381                         return
3382                 self.report_extract_vid_page(video_page)
3383
3384
3385                 # Extract video links on video page
3386                 """Extract video links of all sizes"""
3387                 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3388                 mobj = re.findall(pattern, webpage)
3389                 if len(mobj) == 0:
3390                         self._downloader.trouble(u'ERROR: unable to extract video links')
3391
3392                 # Sort in resolution
3393                 links = sorted(mobj)
3394
3395                 # Choose the lowest of the sort, i.e. highest resolution
3396                 video_url = links[-1]
3397                 # Only get the url. The resolution part in the tuple has no use anymore
3398                 video_url = video_url[-1]
3399                 # Treat escaped \u0026 style hex
3400                 video_url = unicode(video_url, "unicode_escape")
3401
3402
3403                 return [{
3404                         'id':           video_id.decode('utf-8'),
3405                         'url':          video_url,
3406                         'uploader':     uploader.decode('utf-8'),
3407                         'upload_date':  upload_date.decode('utf-8'),
3408                         'title':        video_title.decode('utf-8'),
3409                         'ext':          video_extension.decode('utf-8'),
3410                 }]