Make all IEs return 'upload_date' and 'uploader', even if only u'NA'
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import datetime
5 import HTMLParser
6 import httplib
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import urllib
13 import urllib2
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 from urlparse import parse_qs
19
20 try:
21         import cStringIO as StringIO
22 except ImportError:
23         import StringIO
24
25 from utils import *
26
27
class InfoExtractor(object):
        """Base class for all information extractors (IEs).

        An information extractor takes a URL and produces metadata about the
        video(s) it refers to: the real media URL, the title, the uploader
        and so on.  The resulting dictionaries are handed to the
        FileDownloader, which may download the video to disk among other
        outcomes.

        Mandatory dictionary fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader, unescaped.
        upload_date:    Video upload date (YYYYMMDD).
        title:          Video title, unescaped.
        ext:            Video filename extension.

        Optional fields:

        format:         The video format, defaults to ext (used for --get-format)
        thumbnail:      Full URL to a video thumbnail image.
        description:    One-line video description.
        player_url:     SWF Player URL (used for rtmpdump).

        All fields should be Unicode strings.

        Subclasses must define a _VALID_URL regexp and override
        _real_initialize() and _real_extract(); they should normally also be
        registered in the list of extractors.  _real_extract() must return a
        *list* of information dictionaries as described above.
        """

        _ready = False
        _downloader = None

        def __init__(self, downloader=None):
                """Create the extractor; a downloader may be attached now or later."""
                self._ready = False
                self.set_downloader(downloader)

        def suitable(self, url):
                """Return True if this IE can handle the given URL."""
                return bool(re.match(self._VALID_URL, url))

        def initialize(self):
                """Perform one-time setup (authentication, etc.) on first use."""
                if self._ready:
                        return
                self._real_initialize()
                self._ready = True

        def extract(self, url):
                """Initialize if needed, then return a list of info dictionaries."""
                self.initialize()
                return self._real_extract(url)

        def set_downloader(self, downloader):
                """Attach the downloader used for output and error reporting."""
                self._downloader = downloader

        def _real_initialize(self):
                """Real initialization process. Redefine in subclasses."""
                pass

        def _real_extract(self, url):
                """Real extraction process. Redefine in subclasses."""
                pass
100
class YoutubeIE(InfoExtractor):
        """Information extractor for youtube.com.

        Accepts plain watch URLs, embed/e/v URLs, youtu.be short links and
        naked video IDs (see _VALID_URL).  _real_extract() returns one info
        dictionary per selected format.
        """

        # Note: must be matched with re.VERBOSE (see suitable()/_real_extract()).
        _VALID_URL = r"""^
                         (
                             (?:https?://)?                                       # http(s):// (optional)
                             (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                                tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                             (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                             (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                             (?:                                                  # the various things that can precede the ID:
                                 (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                                 |(?:                                             # or the v= param in all its forms
                                     (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                     (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                     (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
                                     v=
                                 )
                             )?                                                   # optional -> youtube.com/xxxx is OK
                         )?                                                       # all until now is optional -> you can pass the naked ID
                         ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                         (?(1).+)?                                                # if we found the ID, everything can follow
                         $"""
        # Forces the site language to English so the scraping regexes below
        # (e.g. the upload-date span) match reliably.
        _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
        _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
        _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
        # Captures the original URL from redirect-style links (age gate etc.).
        _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
        _NETRC_MACHINE = 'youtube'
        # Listed in order of quality
        _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
        _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
        # itag -> container extension; unknown itags default to 'flv' at lookup.
        _video_extensions = {
                '13': '3gp',
                '17': 'mp4',
                '18': 'mp4',
                '22': 'mp4',
                '37': 'mp4',
                '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
                '43': 'webm',
                '44': 'webm',
                '45': 'webm',
                '46': 'webm',
        }
        # itag -> dimensions string used by --list-formats and the 'format'
        # field (written height-first in this table).
        _video_dimensions = {
                '5': '240x400',
                '6': '???',
                '13': '???',
                '17': '144x176',
                '18': '360x640',
                '22': '720x1280',
                '34': '360x640',
                '35': '480x854',
                '37': '1080x1920',
                '38': '3072x4096',
                '43': '360x640',
                '44': '480x854',
                '45': '720x1280',
                '46': '1080x1920',
        }
        IE_NAME = u'youtube'

        def suitable(self, url):
                """Receives a URL and returns True if suitable for this IE."""
                # Overridden from the base class because _VALID_URL needs re.VERBOSE.
                return re.match(self._VALID_URL, url, re.VERBOSE) is not None

        def report_lang(self):
                """Report attempt to set language."""
                self._downloader.to_screen(u'[youtube] Setting language')

        def report_login(self):
                """Report attempt to log in."""
                self._downloader.to_screen(u'[youtube] Logging in')

        def report_age_confirmation(self):
                """Report attempt to confirm age."""
                self._downloader.to_screen(u'[youtube] Confirming age')

        def report_video_webpage_download(self, video_id):
                """Report attempt to download video webpage."""
                self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

        def report_video_info_webpage_download(self, video_id):
                """Report attempt to download video info webpage."""
                self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

        def report_video_subtitles_download(self, video_id):
                """Report attempt to download video subtitles."""
                self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

        def report_information_extraction(self, video_id):
                """Report attempt to extract video information."""
                self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

        def report_unavailable_format(self, video_id, format):
                """Report that the requested format is not available."""
                self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

        def report_rtmp_download(self):
                """Indicate the download will use the RTMP protocol."""
                self._downloader.to_screen(u'[youtube] RTMP download detected')

        def _closed_captions_xml_to_srt(self, xml_string):
                """Convert YouTube's timedtext XML into SubRip (SRT) text.

                xml_string is the already-decoded XML document; returns a str
                containing one numbered SRT cue per <text> element.
                """
                srt = ''
                texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
                # TODO parse xml instead of regex
                for n, (start, dur_tag, dur, caption) in enumerate(texts):
                        if not dur: dur = '4'  # default cue duration when no dur= attribute
                        start = float(start)
                        end = start + float(dur)
                        # SRT timestamps are HH:MM:SS,mmm
                        start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
                        end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
                        caption = unescapeHTML(caption)
                        caption = unescapeHTML(caption) # double cycle, intentional
                        srt += str(n+1) + '\n'
                        srt += start + ' --> ' + end + '\n'
                        srt += caption + '\n\n'
                return srt

        def _print_formats(self, formats):
                """Print each available itag with its extension and dimensions."""
                print('Available formats:')
                for x in formats:
                        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

        def _real_initialize(self):
                """Set the site language, optionally log in, and confirm age."""
                if self._downloader is None:
                        return

                username = None
                password = None
                downloader_params = self._downloader.params

                # Attempt to use provided username and password or .netrc data
                if downloader_params.get('username', None) is not None:
                        username = downloader_params['username']
                        password = downloader_params['password']
                elif downloader_params.get('usenetrc', False):
                        try:
                                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                                if info is not None:
                                        username = info[0]
                                        password = info[2]
                                else:
                                        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
                        except (IOError, netrc.NetrcParseError), err:
                                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                                return

                # Set language
                request = urllib2.Request(self._LANG_URL)
                try:
                        self.report_lang()
                        urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
                        return

                # No authentication to be performed
                if username is None:
                        return

                # Log in
                login_form = {
                                'current_form': 'loginForm',
                                'next':         '/',
                                'action_login': 'Log In',
                                'username':     username,
                                'password':     password,
                                }
                request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
                try:
                        self.report_login()
                        login_results = urllib2.urlopen(request).read()
                        # If the login form is still present, authentication failed.
                        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                                return
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
                        return

                # Confirm age
                age_form = {
                                'next_url':             '/',
                                'action_confirm':       'Confirm',
                                }
                request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
                try:
                        self.report_age_confirmation()
                        age_results = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
                        return

        def _real_extract(self, url):
                """Extract video information for a YouTube URL.

                Returns a list of info dictionaries (one per selected format),
                or None after reporting trouble to the downloader.
                """
                # Extract original video URL from URL with redirection, like age verification, using next_url parameter
                mobj = re.search(self._NEXT_URL_RE, url)
                if mobj:
                        url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

                # Extract video id from URL
                mobj = re.match(self._VALID_URL, url, re.VERBOSE)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
                        return
                video_id = mobj.group(2)

                # Get video webpage
                self.report_video_webpage_download(video_id)
                request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
                try:
                        video_webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
                        return

                # Attempt to extract SWF player URL
                mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
                if mobj is not None:
                        # Un-escape the JSON-escaped URL (strip the backslashes).
                        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
                else:
                        player_url = None

                # Get video info
                self.report_video_info_webpage_download(video_id)
                # Try several 'el' variants; some videos only answer on one of them.
                for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                        video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                                        % (video_id, el_type))
                        request = urllib2.Request(video_info_url)
                        try:
                                video_info_webpage = urllib2.urlopen(request).read()
                                video_info = parse_qs(video_info_webpage)
                                if 'token' in video_info:
                                        break
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                                return
                if 'token' not in video_info:
                        if 'reason' in video_info:
                                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
                        else:
                                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
                        return

                # Check for "rental" videos
                if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
                        self._downloader.trouble(u'ERROR: "rental" videos not supported')
                        return

                # Start extracting information
                self.report_information_extraction(video_id)

                # uploader
                if 'author' not in video_info:
                        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
                        return
                video_uploader = urllib.unquote_plus(video_info['author'][0])

                # title
                if 'title' not in video_info:
                        self._downloader.trouble(u'ERROR: unable to extract video title')
                        return
                video_title = urllib.unquote_plus(video_info['title'][0])
                video_title = video_title.decode('utf-8')

                # thumbnail image
                if 'thumbnail_url' not in video_info:
                        self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
                        video_thumbnail = ''
                else:   # don't panic if we can't find it
                        video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

                # upload date
                upload_date = u'NA'  # fallback when the date cannot be scraped
                mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
                if mobj is not None:
                        # Normalize '/', ',' and '-' separators to single spaces before parsing.
                        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
                        format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
                        # NOTE(review): no break after a successful parse; later
                        # iterations fail on the already-normalized YYYYMMDD value
                        # and are silently swallowed by the bare except.
                        for expression in format_expressions:
                                try:
                                        upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                                except:
                                        pass

                # description
                video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
                if video_description: video_description = clean_html(video_description)
                else: video_description = ''

                # closed captions
                video_subtitles = None
                if self._downloader.params.get('writesubtitles', False):
                        try:
                                self.report_video_subtitles_download(video_id)
                                request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                                try:
                                        srt_list = urllib2.urlopen(request).read()
                                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                        raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                                # Build lang_code -> track name map for the available caption tracks.
                                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                                if not srt_lang_list:
                                        raise Trouble(u'WARNING: video has no closed captions')
                                # Pick the requested language, falling back to English,
                                # then to an arbitrary available track.
                                if self._downloader.params.get('subtitleslang', False):
                                        srt_lang = self._downloader.params.get('subtitleslang')
                                elif 'en' in srt_lang_list:
                                        srt_lang = 'en'
                                else:
                                        srt_lang = srt_lang_list.keys()[0]
                                if not srt_lang in srt_lang_list:
                                        raise Trouble(u'WARNING: no closed captions found in the specified language')
                                request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                                try:
                                        srt_xml = urllib2.urlopen(request).read()
                                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                        raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                                if not srt_xml:
                                        raise Trouble(u'WARNING: unable to download video subtitles')
                                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
                        except Trouble as trouble:
                                # Subtitle problems are warnings only; extraction continues.
                                # NOTE(review): trouble[0] relies on Python 2 exception
                                # indexing (Exception.__getitem__) — Python 2 only.
                                self._downloader.trouble(trouble[0])

                if 'length_seconds' not in video_info:
                        self._downloader.trouble(u'WARNING: unable to extract video duration')
                        video_duration = ''
                else:
                        video_duration = urllib.unquote_plus(video_info['length_seconds'][0])

                # token
                # NOTE(review): video_token is extracted but never used below.
                video_token = urllib.unquote_plus(video_info['token'][0])

                # Decide which formats to download
                req_format = self._downloader.params.get('format', None)

                if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
                        self.report_rtmp_download()
                        # RTMP streams carry a single URL and no itag.
                        video_url_list = [(None, video_info['conn'][0])]
                elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
                        url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
                        url_data = [parse_qs(uds) for uds in url_data_strs]
                        url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
                        # NOTE(review): assumes every stream dict carries 'sig';
                        # a stream without it would raise KeyError here.
                        url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

                        format_limit = self._downloader.params.get('format_limit', None)
                        available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
                        if format_limit is not None and format_limit in available_formats:
                                # Only consider formats at or below the quality limit.
                                format_list = available_formats[available_formats.index(format_limit):]
                        else:
                                format_list = available_formats
                        existing_formats = [x for x in format_list if x in url_map]
                        if len(existing_formats) == 0:
                                self._downloader.trouble(u'ERROR: no known formats available for video')
                                return
                        if self._downloader.params.get('listformats', None):
                                self._print_formats(existing_formats)
                                return
                        if req_format is None or req_format == 'best':
                                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
                        elif req_format == 'worst':
                                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
                        elif req_format in ('-1', 'all'):
                                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                        else:
                                # Specific formats. We pick the first in a slash-delimeted sequence.
                                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                                req_formats = req_format.split('/')
                                video_url_list = None
                                for rf in req_formats:
                                        if rf in url_map:
                                                video_url_list = [(rf, url_map[rf])]
                                                break
                                if video_url_list is None:
                                        self._downloader.trouble(u'ERROR: requested format not available')
                                        return
                else:
                        self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
                        return

                results = []
                for format_param, video_real_url in video_url_list:
                        # Extension
                        video_extension = self._video_extensions.get(format_param, 'flv')

                        # NOTE(review): auto-numbered '{}' fields require
                        # Python 2.7+; older 2.x would need '{0} - {1}'.
                        video_format = '{} - {}'.format(format_param.decode('utf-8') if format_param else video_extension.decode('utf-8'),
                                                            self._video_dimensions.get(format_param, '???'))

                        results.append({
                                'id':           video_id.decode('utf-8'),
                                'url':          video_real_url.decode('utf-8'),
                                'uploader':     video_uploader.decode('utf-8'),
                                'upload_date':  upload_date,
                                'title':        video_title,
                                'ext':          video_extension.decode('utf-8'),
                                'format':       video_format,
                                'thumbnail':    video_thumbnail.decode('utf-8'),
                                'description':  video_description,
                                'player_url':   player_url,
                                'subtitles':    video_subtitles,
                                'duration':             video_duration
                        })
                return results
500
501
502 class MetacafeIE(InfoExtractor):
503         """Information Extractor for metacafe.com."""
504
505         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
506         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
507         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
508         IE_NAME = u'metacafe'
509
510         def __init__(self, downloader=None):
511                 InfoExtractor.__init__(self, downloader)
512
513         def report_disclaimer(self):
514                 """Report disclaimer retrieval."""
515                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
516
517         def report_age_confirmation(self):
518                 """Report attempt to confirm age."""
519                 self._downloader.to_screen(u'[metacafe] Confirming age')
520
521         def report_download_webpage(self, video_id):
522                 """Report webpage download."""
523                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
524
525         def report_extraction(self, video_id):
526                 """Report information extraction."""
527                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
528
        def _real_initialize(self):
                """Fetch the disclaimer page and POST the family-filter/age
                confirmation so subsequent requests are not blocked.

                Errors are reported via the downloader and abort initialization.
                """
                # Retrieve disclaimer
                request = urllib2.Request(self._DISCLAIMER)
                try:
                        self.report_disclaimer()
                        # Body is read only to complete the request; content unused.
                        disclaimer = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
                        return

                # Confirm age
                disclaimer_form = {
                        'filters': '0',
                        'submit': "Continue - I'm over 18",
                        }
                request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
                try:
                        self.report_age_confirmation()
                        disclaimer = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
                        return
551
552         def _real_extract(self, url):
553                 # Extract id and simplified title from URL
554                 mobj = re.match(self._VALID_URL, url)
555                 if mobj is None:
556                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
557                         return
558
559                 video_id = mobj.group(1)
560
561                 # Check if video comes from YouTube
562                 mobj2 = re.match(r'^yt-(.*)$', video_id)
563                 if mobj2 is not None:
564                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
565                         return
566
567                 # Retrieve video webpage to extract further information
568                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
569                 try:
570                         self.report_download_webpage(video_id)
571                         webpage = urllib2.urlopen(request).read()
572                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
573                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
574                         return
575
576                 # Extract URL, uploader and title from webpage
577                 self.report_extraction(video_id)
578                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
579                 if mobj is not None:
580                         mediaURL = urllib.unquote(mobj.group(1))
581                         video_extension = mediaURL[-3:]
582
583                         # Extract gdaKey if available
584                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
585                         if mobj is None:
586                                 video_url = mediaURL
587                         else:
588                                 gdaKey = mobj.group(1)
589                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
590                 else:
591                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
592                         if mobj is None:
593                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
594                                 return
595                         vardict = parse_qs(mobj.group(1))
596                         if 'mediaData' not in vardict:
597                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
598                                 return
599                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
600                         if mobj is None:
601                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
602                                 return
603                         mediaURL = mobj.group(1).replace('\\/', '/')
604                         video_extension = mediaURL[-3:]
605                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
606
607                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
608                 if mobj is None:
609                         self._downloader.trouble(u'ERROR: unable to extract title')
610                         return
611                 video_title = mobj.group(1).decode('utf-8')
612
613                 mobj = re.search(r'submitter=(.*?);', webpage)
614                 if mobj is None:
615                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
616                         return
617                 video_uploader = mobj.group(1)
618
619                 return [{
620                         'id':           video_id.decode('utf-8'),
621                         'url':          video_url.decode('utf-8'),
622                         'uploader':     video_uploader.decode('utf-8'),
623                         'upload_date':  u'NA',
624                         'title':        video_title,
625                         'ext':          video_extension.decode('utf-8'),
626                 }]
627
628
629 class DailymotionIE(InfoExtractor):
630         """Information Extractor for Dailymotion"""
631
632         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
633         IE_NAME = u'dailymotion'
634
635         def __init__(self, downloader=None):
636                 InfoExtractor.__init__(self, downloader)
637
638         def report_download_webpage(self, video_id):
639                 """Report webpage download."""
640                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
641
642         def report_extraction(self, video_id):
643                 """Report information extraction."""
644                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
645
646         def _real_extract(self, url):
647                 # Extract id and simplified title from URL
648                 mobj = re.match(self._VALID_URL, url)
649                 if mobj is None:
650                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
651                         return
652
653                 video_id = mobj.group(1).split('_')[0].split('?')[0]
654
655                 video_extension = 'mp4'
656
657                 # Retrieve video webpage to extract further information
658                 request = urllib2.Request(url)
659                 request.add_header('Cookie', 'family_filter=off')
660                 try:
661                         self.report_download_webpage(video_id)
662                         webpage = urllib2.urlopen(request).read()
663                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
664                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
665                         return
666
667                 # Extract URL, uploader and title from webpage
668                 self.report_extraction(video_id)
669                 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
670                 if mobj is None:
671                         self._downloader.trouble(u'ERROR: unable to extract media URL')
672                         return
673                 flashvars = urllib.unquote(mobj.group(1))
674
675                 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
676                         if key in flashvars:
677                                 max_quality = key
678                                 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
679                                 break
680                 else:
681                         self._downloader.trouble(u'ERROR: unable to extract video URL')
682                         return
683
684                 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
685                 if mobj is None:
686                         self._downloader.trouble(u'ERROR: unable to extract video URL')
687                         return
688
689                 video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')
690
691                 # TODO: support choosing qualities
692
693                 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
694                 if mobj is None:
695                         self._downloader.trouble(u'ERROR: unable to extract title')
696                         return
697                 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
698
699                 video_uploader = u'NA'
700                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
701                 if mobj is None:
702                         # lookin for official user
703                         mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
704                         if mobj_official is None:
705                                 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
706                         else:
707                                 video_uploader = mobj_official.group(1)
708                 else:
709                         video_uploader = mobj.group(1)
710
711                 video_upload_date = u'NA'
712                 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
713                 if mobj is not None:
714                         video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
715
716                 return [{
717                         'id':           video_id.decode('utf-8'),
718                         'url':          video_url.decode('utf-8'),
719                         'uploader':     video_uploader.decode('utf-8'),
720                         'upload_date':  video_upload_date,
721                         'title':        video_title,
722                         'ext':          video_extension.decode('utf-8'),
723                 }]
724
725
726 class GoogleIE(InfoExtractor):
727         """Information extractor for video.google.com."""
728
729         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
730         IE_NAME = u'video.google'
731
732         def __init__(self, downloader=None):
733                 InfoExtractor.__init__(self, downloader)
734
735         def report_download_webpage(self, video_id):
736                 """Report webpage download."""
737                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
738
739         def report_extraction(self, video_id):
740                 """Report information extraction."""
741                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
742
743         def _real_extract(self, url):
744                 # Extract id from URL
745                 mobj = re.match(self._VALID_URL, url)
746                 if mobj is None:
747                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
748                         return
749
750                 video_id = mobj.group(1)
751
752                 video_extension = 'mp4'
753
754                 # Retrieve video webpage to extract further information
755                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
756                 try:
757                         self.report_download_webpage(video_id)
758                         webpage = urllib2.urlopen(request).read()
759                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
760                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
761                         return
762
763                 # Extract URL, uploader, and title from webpage
764                 self.report_extraction(video_id)
765                 mobj = re.search(r"download_url:'([^']+)'", webpage)
766                 if mobj is None:
767                         video_extension = 'flv'
768                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
769                 if mobj is None:
770                         self._downloader.trouble(u'ERROR: unable to extract media URL')
771                         return
772                 mediaURL = urllib.unquote(mobj.group(1))
773                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
774                 mediaURL = mediaURL.replace('\\x26', '\x26')
775
776                 video_url = mediaURL
777
778                 mobj = re.search(r'<title>(.*)</title>', webpage)
779                 if mobj is None:
780                         self._downloader.trouble(u'ERROR: unable to extract title')
781                         return
782                 video_title = mobj.group(1).decode('utf-8')
783
784                 # Extract video description
785                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
786                 if mobj is None:
787                         self._downloader.trouble(u'ERROR: unable to extract video description')
788                         return
789                 video_description = mobj.group(1).decode('utf-8')
790                 if not video_description:
791                         video_description = 'No description available.'
792
793                 # Extract video thumbnail
794                 if self._downloader.params.get('forcethumbnail', False):
795                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
796                         try:
797                                 webpage = urllib2.urlopen(request).read()
798                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
799                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
800                                 return
801                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
802                         if mobj is None:
803                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
804                                 return
805                         video_thumbnail = mobj.group(1)
806                 else:   # we need something to pass to process_info
807                         video_thumbnail = ''
808
809                 return [{
810                         'id':           video_id.decode('utf-8'),
811                         'url':          video_url.decode('utf-8'),
812                         'uploader':     u'NA',
813                         'upload_date':  u'NA',
814                         'title':        video_title,
815                         'ext':          video_extension.decode('utf-8'),
816                 }]
817
818
819 class PhotobucketIE(InfoExtractor):
820         """Information extractor for photobucket.com."""
821
822         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
823         IE_NAME = u'photobucket'
824
825         def __init__(self, downloader=None):
826                 InfoExtractor.__init__(self, downloader)
827
828         def report_download_webpage(self, video_id):
829                 """Report webpage download."""
830                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
831
832         def report_extraction(self, video_id):
833                 """Report information extraction."""
834                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
835
836         def _real_extract(self, url):
837                 # Extract id from URL
838                 mobj = re.match(self._VALID_URL, url)
839                 if mobj is None:
840                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
841                         return
842
843                 video_id = mobj.group(1)
844
845                 video_extension = 'flv'
846
847                 # Retrieve video webpage to extract further information
848                 request = urllib2.Request(url)
849                 try:
850                         self.report_download_webpage(video_id)
851                         webpage = urllib2.urlopen(request).read()
852                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
853                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
854                         return
855
856                 # Extract URL, uploader, and title from webpage
857                 self.report_extraction(video_id)
858                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
859                 if mobj is None:
860                         self._downloader.trouble(u'ERROR: unable to extract media URL')
861                         return
862                 mediaURL = urllib.unquote(mobj.group(1))
863
864                 video_url = mediaURL
865
866                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
867                 if mobj is None:
868                         self._downloader.trouble(u'ERROR: unable to extract title')
869                         return
870                 video_title = mobj.group(1).decode('utf-8')
871
872                 video_uploader = mobj.group(2).decode('utf-8')
873
874                 return [{
875                         'id':           video_id.decode('utf-8'),
876                         'url':          video_url.decode('utf-8'),
877                         'uploader':     video_uploader,
878                         'upload_date':  u'NA',
879                         'title':        video_title,
880                         'ext':          video_extension.decode('utf-8'),
881                 }]
882
883
884 class YahooIE(InfoExtractor):
885         """Information extractor for video.yahoo.com."""
886
887         # _VALID_URL matches all Yahoo! Video URLs
888         # _VPAGE_URL matches only the extractable '/watch/' URLs
889         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
890         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
891         IE_NAME = u'video.yahoo'
892
893         def __init__(self, downloader=None):
894                 InfoExtractor.__init__(self, downloader)
895
896         def report_download_webpage(self, video_id):
897                 """Report webpage download."""
898                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
899
900         def report_extraction(self, video_id):
901                 """Report information extraction."""
902                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
903
904         def _real_extract(self, url, new_video=True):
905                 # Extract ID from URL
906                 mobj = re.match(self._VALID_URL, url)
907                 if mobj is None:
908                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
909                         return
910
911                 video_id = mobj.group(2)
912                 video_extension = 'flv'
913
914                 # Rewrite valid but non-extractable URLs as
915                 # extractable English language /watch/ URLs
916                 if re.match(self._VPAGE_URL, url) is None:
917                         request = urllib2.Request(url)
918                         try:
919                                 webpage = urllib2.urlopen(request).read()
920                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
921                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
922                                 return
923
924                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
925                         if mobj is None:
926                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
927                                 return
928                         yahoo_id = mobj.group(1)
929
930                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
931                         if mobj is None:
932                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
933                                 return
934                         yahoo_vid = mobj.group(1)
935
936                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
937                         return self._real_extract(url, new_video=False)
938
939                 # Retrieve video webpage to extract further information
940                 request = urllib2.Request(url)
941                 try:
942                         self.report_download_webpage(video_id)
943                         webpage = urllib2.urlopen(request).read()
944                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
945                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
946                         return
947
948                 # Extract uploader and title from webpage
949                 self.report_extraction(video_id)
950                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
951                 if mobj is None:
952                         self._downloader.trouble(u'ERROR: unable to extract video title')
953                         return
954                 video_title = mobj.group(1).decode('utf-8')
955
956                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
957                 if mobj is None:
958                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
959                         return
960                 video_uploader = mobj.group(1).decode('utf-8')
961
962                 # Extract video thumbnail
963                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
964                 if mobj is None:
965                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
966                         return
967                 video_thumbnail = mobj.group(1).decode('utf-8')
968
969                 # Extract video description
970                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
971                 if mobj is None:
972                         self._downloader.trouble(u'ERROR: unable to extract video description')
973                         return
974                 video_description = mobj.group(1).decode('utf-8')
975                 if not video_description:
976                         video_description = 'No description available.'
977
978                 # Extract video height and width
979                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
980                 if mobj is None:
981                         self._downloader.trouble(u'ERROR: unable to extract video height')
982                         return
983                 yv_video_height = mobj.group(1)
984
985                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
986                 if mobj is None:
987                         self._downloader.trouble(u'ERROR: unable to extract video width')
988                         return
989                 yv_video_width = mobj.group(1)
990
991                 # Retrieve video playlist to extract media URL
992                 # I'm not completely sure what all these options are, but we
993                 # seem to need most of them, otherwise the server sends a 401.
994                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
995                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
996                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
997                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
998                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
999                 try:
1000                         self.report_download_webpage(video_id)
1001                         webpage = urllib2.urlopen(request).read()
1002                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1003                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1004                         return
1005
1006                 # Extract media URL from playlist XML
1007                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1008                 if mobj is None:
1009                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1010                         return
1011                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1012                 video_url = unescapeHTML(video_url)
1013
1014                 return [{
1015                         'id':           video_id.decode('utf-8'),
1016                         'url':          video_url,
1017                         'uploader':     video_uploader,
1018                         'upload_date':  u'NA',
1019                         'title':        video_title,
1020                         'ext':          video_extension.decode('utf-8'),
1021                         'thumbnail':    video_thumbnail.decode('utf-8'),
1022                         'description':  video_description,
1023                 }]
1024
1025
1026 class VimeoIE(InfoExtractor):
1027         """Information extractor for vimeo.com."""
1028
1029         # _VALID_URL matches Vimeo URLs
1030         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
1031         IE_NAME = u'vimeo'
1032
1033         def __init__(self, downloader=None):
1034                 InfoExtractor.__init__(self, downloader)
1035
1036         def report_download_webpage(self, video_id):
1037                 """Report webpage download."""
1038                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1039
1040         def report_extraction(self, video_id):
1041                 """Report information extraction."""
1042                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1043
1044         def _real_extract(self, url, new_video=True):
1045                 # Extract ID from URL
1046                 mobj = re.match(self._VALID_URL, url)
1047                 if mobj is None:
1048                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1049                         return
1050
1051                 video_id = mobj.group(1)
1052
1053                 # Retrieve video webpage to extract further information
1054                 request = urllib2.Request(url, None, std_headers)
1055                 try:
1056                         self.report_download_webpage(video_id)
1057                         webpage = urllib2.urlopen(request).read()
1058                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1059                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1060                         return
1061
1062                 # Now we begin extracting as much information as we can from what we
1063                 # retrieved. First we extract the information common to all extractors,
1064                 # and latter we extract those that are Vimeo specific.
1065                 self.report_extraction(video_id)
1066
1067                 # Extract the config JSON
1068                 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1069                 try:
1070                         config = json.loads(config)
1071                 except:
1072                         self._downloader.trouble(u'ERROR: unable to extract info section')
1073                         return
1074                 
1075                 # Extract title
1076                 video_title = config["video"]["title"]
1077
1078                 # Extract uploader
1079                 video_uploader = config["video"]["owner"]["name"]
1080
1081                 # Extract video thumbnail
1082                 video_thumbnail = config["video"]["thumbnail"]
1083
1084                 # Extract video description
1085                 video_description = get_element_by_id("description", webpage.decode('utf8'))
1086                 if video_description: video_description = clean_html(video_description)
1087                 else: video_description = ''
1088
1089                 # Extract upload date
1090                 video_upload_date = u'NA'
1091                 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1092                 if mobj is not None:
1093                         video_upload_date = mobj.group(1)
1094
1095                 # Vimeo specific: extract request signature and timestamp
1096                 sig = config['request']['signature']
1097                 timestamp = config['request']['timestamp']
1098
1099                 # Vimeo specific: extract video codec and quality information
1100                 # First consider quality, then codecs, then take everything
1101                 # TODO bind to format param
1102                 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1103                 files = { 'hd': [], 'sd': [], 'other': []}
1104                 for codec_name, codec_extension in codecs:
1105                         if codec_name in config["video"]["files"]:
1106                                 if 'hd' in config["video"]["files"][codec_name]:
1107                                         files['hd'].append((codec_name, codec_extension, 'hd'))
1108                                 elif 'sd' in config["video"]["files"][codec_name]:
1109                                         files['sd'].append((codec_name, codec_extension, 'sd'))
1110                                 else:
1111                                         files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1112
1113                 for quality in ('hd', 'sd', 'other'):
1114                         if len(files[quality]) > 0:
1115                                 video_quality = files[quality][0][2]
1116                                 video_codec = files[quality][0][0]
1117                                 video_extension = files[quality][0][1]
1118                                 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1119                                 break
1120                 else:
1121                         self._downloader.trouble(u'ERROR: no known codec found')
1122                         return
1123
1124                 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1125                                         %(video_id, sig, timestamp, video_quality, video_codec.upper())
1126
1127                 return [{
1128                         'id':           video_id,
1129                         'url':          video_url,
1130                         'uploader':     video_uploader,
1131                         'upload_date':  video_upload_date,
1132                         'title':        video_title,
1133                         'ext':          video_extension,
1134                         'thumbnail':    video_thumbnail,
1135                         'description':  video_description,
1136                 }]
1137
1138
1139 class GenericIE(InfoExtractor):
1140         """Generic last-resort information extractor."""
1141
1142         _VALID_URL = r'.*'
1143         IE_NAME = u'generic'
1144
	def __init__(self, downloader=None):
		# No extractor-specific state: delegate straight to the base class.
		InfoExtractor.__init__(self, downloader)
1147
1148         def report_download_webpage(self, video_id):
1149                 """Report webpage download."""
1150                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1151                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1152
1153         def report_extraction(self, video_id):
1154                 """Report information extraction."""
1155                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1156
1157         def report_following_redirect(self, new_url):
1158                 """Report information extraction."""
1159                 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1160                 
1161         def _test_redirect(self, url):
1162                 """Check if it is a redirect, like url shorteners, in case restart chain."""
1163                 class HeadRequest(urllib2.Request):
1164                         def get_method(self):
1165                                 return "HEAD"
1166
1167                 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1168                         """
1169                         Subclass the HTTPRedirectHandler to make it use our 
1170                         HeadRequest also on the redirected URL
1171                         """
1172                         def redirect_request(self, req, fp, code, msg, headers, newurl): 
1173                                 if code in (301, 302, 303, 307):
1174                                         newurl = newurl.replace(' ', '%20') 
1175                                         newheaders = dict((k,v) for k,v in req.headers.items()
1176                                                                           if k.lower() not in ("content-length", "content-type"))
1177                                         return HeadRequest(newurl, 
1178                                                                            headers=newheaders,
1179                                                                            origin_req_host=req.get_origin_req_host(), 
1180                                                                            unverifiable=True) 
1181                                 else: 
1182                                         raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp) 
1183
1184                 class HTTPMethodFallback(urllib2.BaseHandler):
1185                         """
1186                         Fallback to GET if HEAD is not allowed (405 HTTP error)
1187                         """
1188                         def http_error_405(self, req, fp, code, msg, headers): 
1189                                 fp.read()
1190                                 fp.close()
1191
1192                                 newheaders = dict((k,v) for k,v in req.headers.items()
1193                                                                   if k.lower() not in ("content-length", "content-type"))
1194                                 return self.parent.open(urllib2.Request(req.get_full_url(), 
1195                                                                                                  headers=newheaders, 
1196                                                                                                  origin_req_host=req.get_origin_req_host(), 
1197                                                                                                  unverifiable=True))
1198
1199                 # Build our opener
1200                 opener = urllib2.OpenerDirector() 
1201                 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1202                                                 HTTPMethodFallback, HEADRedirectHandler,
1203                                                 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1204                         opener.add_handler(handler())
1205
1206                 response = opener.open(HeadRequest(url))
1207                 new_url = response.geturl()
1208                 
1209                 if url == new_url: return False
1210                 
1211                 self.report_following_redirect(new_url)
1212                 self._downloader.download([new_url])
1213                 return True
1214
1215         def _real_extract(self, url):
1216                 if self._test_redirect(url): return
1217
1218                 video_id = url.split('/')[-1]
1219                 request = urllib2.Request(url)
1220                 try:
1221                         self.report_download_webpage(video_id)
1222                         webpage = urllib2.urlopen(request).read()
1223                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1224                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1225                         return
1226                 except ValueError, err:
1227                         # since this is the last-resort InfoExtractor, if
1228                         # this error is thrown, it'll be thrown here
1229                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1230                         return
1231
1232                 self.report_extraction(video_id)
1233                 # Start with something easy: JW Player in SWFObject
1234                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1235                 if mobj is None:
1236                         # Broaden the search a little bit
1237                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1238                 if mobj is None:
1239                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1240                         return
1241
1242                 # It's possible that one of the regexes
1243                 # matched, but returned an empty group:
1244                 if mobj.group(1) is None:
1245                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1246                         return
1247
1248                 video_url = urllib.unquote(mobj.group(1))
1249                 video_id = os.path.basename(video_url)
1250
1251                 # here's a fun little line of code for you:
1252                 video_extension = os.path.splitext(video_id)[1][1:]
1253                 video_id = os.path.splitext(video_id)[0]
1254
1255                 # it's tempting to parse this further, but you would
1256                 # have to take into account all the variations like
1257                 #   Video Title - Site Name
1258                 #   Site Name | Video Title
1259                 #   Video Title - Tagline | Site Name
1260                 # and so on and so forth; it's just not practical
1261                 mobj = re.search(r'<title>(.*)</title>', webpage)
1262                 if mobj is None:
1263                         self._downloader.trouble(u'ERROR: unable to extract title')
1264                         return
1265                 video_title = mobj.group(1).decode('utf-8')
1266
1267                 # video uploader is domain name
1268                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1269                 if mobj is None:
1270                         self._downloader.trouble(u'ERROR: unable to extract title')
1271                         return
1272                 video_uploader = mobj.group(1).decode('utf-8')
1273
1274                 return [{
1275                         'id':           video_id.decode('utf-8'),
1276                         'url':          video_url.decode('utf-8'),
1277                         'uploader':     video_uploader,
1278                         'upload_date':  u'NA',
1279                         'title':        video_title,
1280                         'ext':          video_extension.decode('utf-8'),
1281                 }]
1282
1283
1284 class YoutubeSearchIE(InfoExtractor):
1285         """Information Extractor for YouTube search queries."""
1286         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1287         _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1288         _max_youtube_results = 1000
1289         IE_NAME = u'youtube:search'
1290
1291         def __init__(self, downloader=None):
1292                 InfoExtractor.__init__(self, downloader)
1293
1294         def report_download_page(self, query, pagenum):
1295                 """Report attempt to download search page with given number."""
1296                 query = query.decode(preferredencoding())
1297                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1298
1299         def _real_extract(self, query):
1300                 mobj = re.match(self._VALID_URL, query)
1301                 if mobj is None:
1302                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1303                         return
1304
1305                 prefix, query = query.split(':')
1306                 prefix = prefix[8:]
1307                 query = query.encode('utf-8')
1308                 if prefix == '':
1309                         self._download_n_results(query, 1)
1310                         return
1311                 elif prefix == 'all':
1312                         self._download_n_results(query, self._max_youtube_results)
1313                         return
1314                 else:
1315                         try:
1316                                 n = long(prefix)
1317                                 if n <= 0:
1318                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1319                                         return
1320                                 elif n > self._max_youtube_results:
1321                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1322                                         n = self._max_youtube_results
1323                                 self._download_n_results(query, n)
1324                                 return
1325                         except ValueError: # parsing prefix as integer fails
1326                                 self._download_n_results(query, 1)
1327                                 return
1328
1329         def _download_n_results(self, query, n):
1330                 """Downloads a specified number of results for a query"""
1331
1332                 video_ids = []
1333                 pagenum = 0
1334                 limit = n
1335
1336                 while (50 * pagenum) < limit:
1337                         self.report_download_page(query, pagenum+1)
1338                         result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1339                         request = urllib2.Request(result_url)
1340                         try:
1341                                 data = urllib2.urlopen(request).read()
1342                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1343                                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1344                                 return
1345                         api_response = json.loads(data)['data']
1346
1347                         new_ids = list(video['id'] for video in api_response['items'])
1348                         video_ids += new_ids
1349
1350                         limit = min(n, api_response['totalItems'])
1351                         pagenum += 1
1352
1353                 if len(video_ids) > n:
1354                         video_ids = video_ids[:n]
1355                 for id in video_ids:
1356                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1357                 return
1358
1359
1360 class GoogleSearchIE(InfoExtractor):
1361         """Information Extractor for Google Video search queries."""
1362         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1363         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1364         _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1365         _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1366         _max_google_results = 1000
1367         IE_NAME = u'video.google:search'
1368
1369         def __init__(self, downloader=None):
1370                 InfoExtractor.__init__(self, downloader)
1371
1372         def report_download_page(self, query, pagenum):
1373                 """Report attempt to download playlist page with given number."""
1374                 query = query.decode(preferredencoding())
1375                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1376
1377         def _real_extract(self, query):
1378                 mobj = re.match(self._VALID_URL, query)
1379                 if mobj is None:
1380                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1381                         return
1382
1383                 prefix, query = query.split(':')
1384                 prefix = prefix[8:]
1385                 query = query.encode('utf-8')
1386                 if prefix == '':
1387                         self._download_n_results(query, 1)
1388                         return
1389                 elif prefix == 'all':
1390                         self._download_n_results(query, self._max_google_results)
1391                         return
1392                 else:
1393                         try:
1394                                 n = long(prefix)
1395                                 if n <= 0:
1396                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1397                                         return
1398                                 elif n > self._max_google_results:
1399                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1400                                         n = self._max_google_results
1401                                 self._download_n_results(query, n)
1402                                 return
1403                         except ValueError: # parsing prefix as integer fails
1404                                 self._download_n_results(query, 1)
1405                                 return
1406
1407         def _download_n_results(self, query, n):
1408                 """Downloads a specified number of results for a query"""
1409
1410                 video_ids = []
1411                 pagenum = 0
1412
1413                 while True:
1414                         self.report_download_page(query, pagenum)
1415                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1416                         request = urllib2.Request(result_url)
1417                         try:
1418                                 page = urllib2.urlopen(request).read()
1419                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1420                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1421                                 return
1422
1423                         # Extract video identifiers
1424                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1425                                 video_id = mobj.group(1)
1426                                 if video_id not in video_ids:
1427                                         video_ids.append(video_id)
1428                                         if len(video_ids) == n:
1429                                                 # Specified n videos reached
1430                                                 for id in video_ids:
1431                                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1432                                                 return
1433
1434                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1435                                 for id in video_ids:
1436                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1437                                 return
1438
1439                         pagenum = pagenum + 1
1440
1441
1442 class YahooSearchIE(InfoExtractor):
1443         """Information Extractor for Yahoo! Video search queries."""
1444         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1445         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1446         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1447         _MORE_PAGES_INDICATOR = r'\s*Next'
1448         _max_yahoo_results = 1000
1449         IE_NAME = u'video.yahoo:search'
1450
1451         def __init__(self, downloader=None):
1452                 InfoExtractor.__init__(self, downloader)
1453
1454         def report_download_page(self, query, pagenum):
1455                 """Report attempt to download playlist page with given number."""
1456                 query = query.decode(preferredencoding())
1457                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1458
1459         def _real_extract(self, query):
1460                 mobj = re.match(self._VALID_URL, query)
1461                 if mobj is None:
1462                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1463                         return
1464
1465                 prefix, query = query.split(':')
1466                 prefix = prefix[8:]
1467                 query = query.encode('utf-8')
1468                 if prefix == '':
1469                         self._download_n_results(query, 1)
1470                         return
1471                 elif prefix == 'all':
1472                         self._download_n_results(query, self._max_yahoo_results)
1473                         return
1474                 else:
1475                         try:
1476                                 n = long(prefix)
1477                                 if n <= 0:
1478                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1479                                         return
1480                                 elif n > self._max_yahoo_results:
1481                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1482                                         n = self._max_yahoo_results
1483                                 self._download_n_results(query, n)
1484                                 return
1485                         except ValueError: # parsing prefix as integer fails
1486                                 self._download_n_results(query, 1)
1487                                 return
1488
1489         def _download_n_results(self, query, n):
1490                 """Downloads a specified number of results for a query"""
1491
1492                 video_ids = []
1493                 already_seen = set()
1494                 pagenum = 1
1495
1496                 while True:
1497                         self.report_download_page(query, pagenum)
1498                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1499                         request = urllib2.Request(result_url)
1500                         try:
1501                                 page = urllib2.urlopen(request).read()
1502                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1503                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1504                                 return
1505
1506                         # Extract video identifiers
1507                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1508                                 video_id = mobj.group(1)
1509                                 if video_id not in already_seen:
1510                                         video_ids.append(video_id)
1511                                         already_seen.add(video_id)
1512                                         if len(video_ids) == n:
1513                                                 # Specified n videos reached
1514                                                 for id in video_ids:
1515                                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1516                                                 return
1517
1518                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1519                                 for id in video_ids:
1520                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1521                                 return
1522
1523                         pagenum = pagenum + 1
1524
1525
1526 class YoutubePlaylistIE(InfoExtractor):
1527         """Information Extractor for YouTube playlists."""
1528
1529         _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1530         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1531         _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
1532         _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1533         IE_NAME = u'youtube:playlist'
1534
1535         def __init__(self, downloader=None):
1536                 InfoExtractor.__init__(self, downloader)
1537
1538         def report_download_page(self, playlist_id, pagenum):
1539                 """Report attempt to download playlist page with given number."""
1540                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1541
1542         def _real_extract(self, url):
1543                 # Extract playlist id
1544                 mobj = re.match(self._VALID_URL, url)
1545                 if mobj is None:
1546                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1547                         return
1548
1549                 # Single video case
1550                 if mobj.group(3) is not None:
1551                         self._downloader.download([mobj.group(3)])
1552                         return
1553
1554                 # Download playlist pages
1555                 # prefix is 'p' as default for playlists but there are other types that need extra care
1556                 playlist_prefix = mobj.group(1)
1557                 if playlist_prefix == 'a':
1558                         playlist_access = 'artist'
1559                 else:
1560                         playlist_prefix = 'p'
1561                         playlist_access = 'view_play_list'
1562                 playlist_id = mobj.group(2)
1563                 video_ids = []
1564                 pagenum = 1
1565
1566                 while True:
1567                         self.report_download_page(playlist_id, pagenum)
1568                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1569                         request = urllib2.Request(url)
1570                         try:
1571                                 page = urllib2.urlopen(request).read()
1572                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1573                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1574                                 return
1575
1576                         # Extract video identifiers
1577                         ids_in_page = []
1578                         for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1579                                 if mobj.group(1) not in ids_in_page:
1580                                         ids_in_page.append(mobj.group(1))
1581                         video_ids.extend(ids_in_page)
1582
1583                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1584                                 break
1585                         pagenum = pagenum + 1
1586
1587                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1588                 playlistend = self._downloader.params.get('playlistend', -1)
1589                 if playlistend == -1:
1590                         video_ids = video_ids[playliststart:]
1591                 else:
1592                         video_ids = video_ids[playliststart:playlistend]
1593
1594                 for id in video_ids:
1595                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1596                 return
1597
1598
1599 class YoutubeChannelIE(InfoExtractor):
1600         """Information Extractor for YouTube channels."""
1601
1602         _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1603         _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1604         _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
1605         IE_NAME = u'youtube:channel'
1606
1607         def report_download_page(self, channel_id, pagenum):
1608                 """Report attempt to download channel page with given number."""
1609                 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1610
1611         def _real_extract(self, url):
1612                 # Extract channel id
1613                 mobj = re.match(self._VALID_URL, url)
1614                 if mobj is None:
1615                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1616                         return
1617
1618                 # Download channel pages
1619                 channel_id = mobj.group(1)
1620                 video_ids = []
1621                 pagenum = 1
1622
1623                 while True:
1624                         self.report_download_page(channel_id, pagenum)
1625                         url = self._TEMPLATE_URL % (channel_id, pagenum)
1626                         request = urllib2.Request(url)
1627                         try:
1628                                 page = urllib2.urlopen(request).read()
1629                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1630                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1631                                 return
1632
1633                         # Extract video identifiers
1634                         ids_in_page = []
1635                         for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1636                                 if mobj.group(1) not in ids_in_page:
1637                                         ids_in_page.append(mobj.group(1))
1638                         video_ids.extend(ids_in_page)
1639
1640                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1641                                 break
1642                         pagenum = pagenum + 1
1643
1644                 for id in video_ids:
1645                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1646                 return
1647
1648
1649 class YoutubeUserIE(InfoExtractor):
1650         """Information Extractor for YouTube users."""
1651
1652         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1653         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1654         _GDATA_PAGE_SIZE = 50
1655         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1656         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1657         IE_NAME = u'youtube:user'
1658
1659         def __init__(self, downloader=None):
1660                 InfoExtractor.__init__(self, downloader)
1661
1662         def report_download_page(self, username, start_index):
1663                 """Report attempt to download user page."""
1664                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1665                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1666
1667         def _real_extract(self, url):
1668                 # Extract username
1669                 mobj = re.match(self._VALID_URL, url)
1670                 if mobj is None:
1671                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1672                         return
1673
1674                 username = mobj.group(1)
1675
1676                 # Download video ids using YouTube Data API. Result size per
1677                 # query is limited (currently to 50 videos) so we need to query
1678                 # page by page until there are no video ids - it means we got
1679                 # all of them.
1680
1681                 video_ids = []
1682                 pagenum = 0
1683
1684                 while True:
1685                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1686                         self.report_download_page(username, start_index)
1687
1688                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1689
1690                         try:
1691                                 page = urllib2.urlopen(request).read()
1692                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1693                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1694                                 return
1695
1696                         # Extract video identifiers
1697                         ids_in_page = []
1698
1699                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1700                                 if mobj.group(1) not in ids_in_page:
1701                                         ids_in_page.append(mobj.group(1))
1702
1703                         video_ids.extend(ids_in_page)
1704
1705                         # A little optimization - if current page is not
1706                         # "full", ie. does not contain PAGE_SIZE video ids then
1707                         # we can assume that this page is the last one - there
1708                         # are no more ids on further pages - no need to query
1709                         # again.
1710
1711                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1712                                 break
1713
1714                         pagenum += 1
1715
1716                 all_ids_count = len(video_ids)
1717                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1718                 playlistend = self._downloader.params.get('playlistend', -1)
1719
1720                 if playlistend == -1:
1721                         video_ids = video_ids[playliststart:]
1722                 else:
1723                         video_ids = video_ids[playliststart:playlistend]
1724
1725                 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1726                                 (username, all_ids_count, len(video_ids)))
1727
1728                 for video_id in video_ids:
1729                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1730
1731
1732 class BlipTVUserIE(InfoExtractor):
1733         """Information Extractor for blip.tv users."""
1734
1735         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1736         _PAGE_SIZE = 12
1737         IE_NAME = u'blip.tv:user'
1738
1739         def __init__(self, downloader=None):
1740                 InfoExtractor.__init__(self, downloader)
1741
1742         def report_download_page(self, username, pagenum):
1743                 """Report attempt to download user page."""
1744                 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1745                                 (self.IE_NAME, username, pagenum))
1746
1747         def _real_extract(self, url):
1748                 # Extract username
1749                 mobj = re.match(self._VALID_URL, url)
1750                 if mobj is None:
1751                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1752                         return
1753
1754                 username = mobj.group(1)
1755
1756                 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1757
1758                 request = urllib2.Request(url)
1759
1760                 try:
1761                         page = urllib2.urlopen(request).read().decode('utf-8')
1762                         mobj = re.search(r'data-users-id="([^"]+)"', page)
1763                         page_base = page_base % mobj.group(1)
1764                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1765                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1766                         return
1767
1768
1769                 # Download video ids using BlipTV Ajax calls. Result size per
1770                 # query is limited (currently to 12 videos) so we need to query
1771                 # page by page until there are no video ids - it means we got
1772                 # all of them.
1773
1774                 video_ids = []
1775                 pagenum = 1
1776
1777                 while True:
1778                         self.report_download_page(username, pagenum)
1779
1780                         request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1781
1782                         try:
1783                                 page = urllib2.urlopen(request).read().decode('utf-8')
1784                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1785                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1786                                 return
1787
1788                         # Extract video identifiers
1789                         ids_in_page = []
1790
1791                         for mobj in re.finditer(r'href="/([^"]+)"', page):
1792                                 if mobj.group(1) not in ids_in_page:
1793                                         ids_in_page.append(unescapeHTML(mobj.group(1)))
1794
1795                         video_ids.extend(ids_in_page)
1796
1797                         # A little optimization - if current page is not
1798                         # "full", ie. does not contain PAGE_SIZE video ids then
1799                         # we can assume that this page is the last one - there
1800                         # are no more ids on further pages - no need to query
1801                         # again.
1802
1803                         if len(ids_in_page) < self._PAGE_SIZE:
1804                                 break
1805
1806                         pagenum += 1
1807
1808                 all_ids_count = len(video_ids)
1809                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1810                 playlistend = self._downloader.params.get('playlistend', -1)
1811
1812                 if playlistend == -1:
1813                         video_ids = video_ids[playliststart:]
1814                 else:
1815                         video_ids = video_ids[playliststart:playlistend]
1816
1817                 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1818                                 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1819
1820                 for video_id in video_ids:
1821                         self._downloader.download([u'http://blip.tv/'+video_id])
1822
1823
1824 class DepositFilesIE(InfoExtractor):
1825         """Information extractor for depositfiles.com"""
1826
1827         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1828         IE_NAME = u'DepositFiles'
1829
1830         def __init__(self, downloader=None):
1831                 InfoExtractor.__init__(self, downloader)
1832
1833         def report_download_webpage(self, file_id):
1834                 """Report webpage download."""
1835                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1836
1837         def report_extraction(self, file_id):
1838                 """Report information extraction."""
1839                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1840
1841         def _real_extract(self, url):
1842                 file_id = url.split('/')[-1]
1843                 # Rebuild url in english locale
1844                 url = 'http://depositfiles.com/en/files/' + file_id
1845
1846                 # Retrieve file webpage with 'Free download' button pressed
1847                 free_download_indication = { 'gateway_result' : '1' }
1848                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1849                 try:
1850                         self.report_download_webpage(file_id)
1851                         webpage = urllib2.urlopen(request).read()
1852                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1853                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1854                         return
1855
1856                 # Search for the real file URL
1857                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1858                 if (mobj is None) or (mobj.group(1) is None):
1859                         # Try to figure out reason of the error.
1860                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1861                         if (mobj is not None) and (mobj.group(1) is not None):
1862                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1863                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1864                         else:
1865                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1866                         return
1867
1868                 file_url = mobj.group(1)
1869                 file_extension = os.path.splitext(file_url)[1][1:]
1870
1871                 # Search for file title
1872                 mobj = re.search(r'<b title="(.*?)">', webpage)
1873                 if mobj is None:
1874                         self._downloader.trouble(u'ERROR: unable to extract title')
1875                         return
1876                 file_title = mobj.group(1).decode('utf-8')
1877
1878                 return [{
1879                         'id':           file_id.decode('utf-8'),
1880                         'url':          file_url.decode('utf-8'),
1881                         'uploader':     u'NA',
1882                         'upload_date':  u'NA',
1883                         'title':        file_title,
1884                         'ext':          file_extension.decode('utf-8'),
1885                 }]
1886
1887
1888 class FacebookIE(InfoExtractor):
1889         """Information Extractor for Facebook"""
1890
1891         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1892         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1893         _NETRC_MACHINE = 'facebook'
1894         _available_formats = ['video', 'highqual', 'lowqual']
1895         _video_extensions = {
1896                 'video': 'mp4',
1897                 'highqual': 'mp4',
1898                 'lowqual': 'mp4',
1899         }
1900         IE_NAME = u'facebook'
1901
1902         def __init__(self, downloader=None):
1903                 InfoExtractor.__init__(self, downloader)
1904
1905         def _reporter(self, message):
1906                 """Add header and report message."""
1907                 self._downloader.to_screen(u'[facebook] %s' % message)
1908
1909         def report_login(self):
1910                 """Report attempt to log in."""
1911                 self._reporter(u'Logging in')
1912
1913         def report_video_webpage_download(self, video_id):
1914                 """Report attempt to download video webpage."""
1915                 self._reporter(u'%s: Downloading video webpage' % video_id)
1916
1917         def report_information_extraction(self, video_id):
1918                 """Report attempt to extract video information."""
1919                 self._reporter(u'%s: Extracting video information' % video_id)
1920
1921         def _parse_page(self, video_webpage):
1922                 """Extract video information from page"""
1923                 # General data
1924                 data = {'title': r'\("video_title", "(.*?)"\)',
1925                         'description': r'<div class="datawrap">(.*?)</div>',
1926                         'owner': r'\("video_owner_name", "(.*?)"\)',
1927                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1928                         }
1929                 video_info = {}
1930                 for piece in data.keys():
1931                         mobj = re.search(data[piece], video_webpage)
1932                         if mobj is not None:
1933                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1934
1935                 # Video urls
1936                 video_urls = {}
1937                 for fmt in self._available_formats:
1938                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1939                         if mobj is not None:
1940                                 # URL is in a Javascript segment inside an escaped Unicode format within
1941                                 # the generally utf-8 page
1942                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1943                 video_info['video_urls'] = video_urls
1944
1945                 return video_info
1946
1947         def _real_initialize(self):
1948                 if self._downloader is None:
1949                         return
1950
1951                 useremail = None
1952                 password = None
1953                 downloader_params = self._downloader.params
1954
1955                 # Attempt to use provided username and password or .netrc data
1956                 if downloader_params.get('username', None) is not None:
1957                         useremail = downloader_params['username']
1958                         password = downloader_params['password']
1959                 elif downloader_params.get('usenetrc', False):
1960                         try:
1961                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1962                                 if info is not None:
1963                                         useremail = info[0]
1964                                         password = info[2]
1965                                 else:
1966                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1967                         except (IOError, netrc.NetrcParseError), err:
1968                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
1969                                 return
1970
1971                 if useremail is None:
1972                         return
1973
1974                 # Log in
1975                 login_form = {
1976                         'email': useremail,
1977                         'pass': password,
1978                         'login': 'Log+In'
1979                         }
1980                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1981                 try:
1982                         self.report_login()
1983                         login_results = urllib2.urlopen(request).read()
1984                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1985                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1986                                 return
1987                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1988                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
1989                         return
1990
1991         def _real_extract(self, url):
1992                 mobj = re.match(self._VALID_URL, url)
1993                 if mobj is None:
1994                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1995                         return
1996                 video_id = mobj.group('ID')
1997
1998                 # Get video webpage
1999                 self.report_video_webpage_download(video_id)
2000                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2001                 try:
2002                         page = urllib2.urlopen(request)
2003                         video_webpage = page.read()
2004                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2005                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2006                         return
2007
2008                 # Start extracting information
2009                 self.report_information_extraction(video_id)
2010
2011                 # Extract information
2012                 video_info = self._parse_page(video_webpage)
2013
2014                 # uploader
2015                 if 'owner' not in video_info:
2016                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2017                         return
2018                 video_uploader = video_info['owner']
2019
2020                 # title
2021                 if 'title' not in video_info:
2022                         self._downloader.trouble(u'ERROR: unable to extract video title')
2023                         return
2024                 video_title = video_info['title']
2025                 video_title = video_title.decode('utf-8')
2026
2027                 # thumbnail image
2028                 if 'thumbnail' not in video_info:
2029                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2030                         video_thumbnail = ''
2031                 else:
2032                         video_thumbnail = video_info['thumbnail']
2033
2034                 # upload date
2035                 upload_date = u'NA'
2036                 if 'upload_date' in video_info:
2037                         upload_time = video_info['upload_date']
2038                         timetuple = email.utils.parsedate_tz(upload_time)
2039                         if timetuple is not None:
2040                                 try:
2041                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2042                                 except:
2043                                         pass
2044
2045                 # description
2046                 video_description = video_info.get('description', 'No description available.')
2047
2048                 url_map = video_info['video_urls']
2049                 if len(url_map.keys()) > 0:
2050                         # Decide which formats to download
2051                         req_format = self._downloader.params.get('format', None)
2052                         format_limit = self._downloader.params.get('format_limit', None)
2053
2054                         if format_limit is not None and format_limit in self._available_formats:
2055                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2056                         else:
2057                                 format_list = self._available_formats
2058                         existing_formats = [x for x in format_list if x in url_map]
2059                         if len(existing_formats) == 0:
2060                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2061                                 return
2062                         if req_format is None:
2063                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2064                         elif req_format == 'worst':
2065                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2066                         elif req_format == '-1':
2067                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2068                         else:
2069                                 # Specific format
2070                                 if req_format not in url_map:
2071                                         self._downloader.trouble(u'ERROR: requested format not available')
2072                                         return
2073                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2074
2075                 results = []
2076                 for format_param, video_real_url in video_url_list:
2077                         # Extension
2078                         video_extension = self._video_extensions.get(format_param, 'mp4')
2079
2080                         results.append({
2081                                 'id':           video_id.decode('utf-8'),
2082                                 'url':          video_real_url.decode('utf-8'),
2083                                 'uploader':     video_uploader.decode('utf-8'),
2084                                 'upload_date':  upload_date,
2085                                 'title':        video_title,
2086                                 'ext':          video_extension.decode('utf-8'),
2087                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2088                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2089                                 'description':  video_description.decode('utf-8'),
2090                         })
2091                 return results
2092
class BlipTVIE(InfoExtractor):
	"""Information extractor for blip.tv"""

	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
	# Captures the lowercase alphanumeric filename extension of a media URL.
	_URL_EXT = r'^.*\.([a-z0-9]+)$'
	IE_NAME = u'blip.tv'

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

	def report_direct_download(self, title):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

	def _real_extract(self, url):
		"""Return a one-element list with the info dict for *url*.

		Two paths are handled:
		  * the skin=json request answers with a video/* Content-Type:
		    treated as a direct download; a minimal info dict is built
		    from the URL itself with u'NA' uploader/upload_date;
		  * otherwise the response body is parsed as JSON for the full
		    metadata.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Append the JSON-skin parameters with '?' or '&' as appropriate.
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url.encode('utf-8'))
		self.report_extraction(mobj.group(1))
		info = None
		try:
			urlh = urllib2.urlopen(request)
			if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
				basename = url.split('/')[-1]
				title,ext = os.path.splitext(basename)
				title = title.decode('UTF-8')
				ext = ext.replace('.', '')
				self.report_direct_download(title)
				# 'urlhandle' hands the already-open response to the caller
				# so the payload is not fetched twice.
				info = {
					'id': title,
					'url': url,
					'uploader': u'NA',
					'upload_date': u'NA',
					'title': title,
					'ext': ext,
					'urlhandle': urlh
				}
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
			return
		if info is None: # Regular URL
			try:
				# Reuse the response opened above; its body is the JSON
				# document.
				json_code = urlh.read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
				return

			try:
				# NOTE(review): 'json' is not among the imports visible in
				# this chunk - presumably provided elsewhere in the file;
				# confirm.
				json_data = json.loads(json_code)
				if 'Post' in json_data:
					data = json_data['Post']
				else:
					data = json_data

				# 'datestamp' is parsed per '%m-%d-%y %H:%M%p' and
				# re-emitted in the YYYYMMDD form the info dict expects.
				upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
				video_url = data['media']['url']
				umobj = re.match(self._URL_EXT, video_url)
				if umobj is None:
					raise ValueError('Can not determine filename extension')
				ext = umobj.group(1)

				info = {
					'id': data['item_id'],
					'url': video_url,
					'uploader': data['display_name'],
					'upload_date': upload_date,
					'title': data['title'],
					'ext': ext,
					'format': data['media']['mimeType'],
					'thumbnail': data['thumbnailUrl'],
					'description': data['description'],
					'player_url': data['embedUrl']
				}
			except (ValueError,KeyError), err:
				self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
				return

		# NOTE(review): global side effect on std_headers - presumably
		# makes blip.tv serve a directly downloadable file to the iTunes
		# user agent; confirm before changing.
		std_headers['User-Agent'] = 'iTunes/10.6.1'
		return [info]
2181
2182
2183 class MyVideoIE(InfoExtractor):
2184         """Information Extractor for myvideo.de."""
2185
2186         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2187         IE_NAME = u'myvideo'
2188
2189         def __init__(self, downloader=None):
2190                 InfoExtractor.__init__(self, downloader)
2191         
2192         def report_download_webpage(self, video_id):
2193                 """Report webpage download."""
2194                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2195
2196         def report_extraction(self, video_id):
2197                 """Report information extraction."""
2198                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2199
2200         def _real_extract(self,url):
2201                 mobj = re.match(self._VALID_URL, url)
2202                 if mobj is None:
2203                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2204                         return
2205
2206                 video_id = mobj.group(1)
2207
2208                 # Get video webpage
2209                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2210                 try:
2211                         self.report_download_webpage(video_id)
2212                         webpage = urllib2.urlopen(request).read()
2213                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2214                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
2215                         return
2216
2217                 self.report_extraction(video_id)
2218                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2219                                  webpage)
2220                 if mobj is None:
2221                         self._downloader.trouble(u'ERROR: unable to extract media URL')
2222                         return
2223                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2224
2225                 mobj = re.search('<title>([^<]+)</title>', webpage)
2226                 if mobj is None:
2227                         self._downloader.trouble(u'ERROR: unable to extract title')
2228                         return
2229
2230                 video_title = mobj.group(1)
2231
2232                 return [{
2233                         'id':           video_id,
2234                         'url':          video_url,
2235                         'uploader':     u'NA',
2236                         'upload_date':  u'NA',
2237                         'title':        video_title,
2238                         'ext':          u'flv',
2239                 }]
2240
2241 class ComedyCentralIE(InfoExtractor):
2242         """Information extractor for The Daily Show and Colbert Report """
2243
2244         _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2245         IE_NAME = u'comedycentral'
2246
2247         _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2248
2249         _video_extensions = {
2250                 '3500': 'mp4',
2251                 '2200': 'mp4',
2252                 '1700': 'mp4',
2253                 '1200': 'mp4',
2254                 '750': 'mp4',
2255                 '400': 'mp4',
2256         }
2257         _video_dimensions = {
2258                 '3500': '1280x720',
2259                 '2200': '960x540',
2260                 '1700': '768x432',
2261                 '1200': '640x360',
2262                 '750': '512x288',
2263                 '400': '384x216',
2264         }
2265
2266         def report_extraction(self, episode_id):
2267                 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2268
2269         def report_config_download(self, episode_id):
2270                 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2271
2272         def report_index_download(self, episode_id):
2273                 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2274
2275         def report_player_url(self, episode_id):
2276                 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2277
2278
2279         def _print_formats(self, formats):
2280                 print('Available formats:')
2281                 for x in formats:
2282                         print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2283
2284
2285         def _real_extract(self, url):
2286                 mobj = re.match(self._VALID_URL, url)
2287                 if mobj is None:
2288                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2289                         return
2290
2291                 if mobj.group('shortname'):
2292                         if mobj.group('shortname') in ('tds', 'thedailyshow'):
2293                                 url = u'http://www.thedailyshow.com/full-episodes/'
2294                         else:
2295                                 url = u'http://www.colbertnation.com/full-episodes/'
2296                         mobj = re.match(self._VALID_URL, url)
2297                         assert mobj is not None
2298
2299                 dlNewest = not mobj.group('episode')
2300                 if dlNewest:
2301                         epTitle = mobj.group('showname')
2302                 else:
2303                         epTitle = mobj.group('episode')
2304
2305                 req = urllib2.Request(url)
2306                 self.report_extraction(epTitle)
2307                 try:
2308                         htmlHandle = urllib2.urlopen(req)
2309                         html = htmlHandle.read()
2310                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2311                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2312                         return
2313                 if dlNewest:
2314                         url = htmlHandle.geturl()
2315                         mobj = re.match(self._VALID_URL, url)
2316                         if mobj is None:
2317                                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2318                                 return
2319                         if mobj.group('episode') == '':
2320                                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2321                                 return
2322                         epTitle = mobj.group('episode')
2323
2324                 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2325
2326                 if len(mMovieParams) == 0:
2327                         # The Colbert Report embeds the information in a without
2328                         # a URL prefix; so extract the alternate reference
2329                         # and then add the URL prefix manually.
2330
2331                         altMovieParams = re.findall('data-mgid="([^"]*episode.*?:.*?)"', html)
2332                         if len(altMovieParams) == 0:
2333                                 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2334                                 return
2335                         else:
2336                                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2337                 
2338                 playerUrl_raw = mMovieParams[0][0]
2339                 self.report_player_url(epTitle)
2340                 try:
2341                         urlHandle = urllib2.urlopen(playerUrl_raw)
2342                         playerUrl = urlHandle.geturl()
2343                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2344                         self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
2345                         return
2346
2347                 uri = mMovieParams[0][1]
2348                 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2349                 self.report_index_download(epTitle)
2350                 try:
2351                         indexXml = urllib2.urlopen(indexUrl).read()
2352                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2353                         self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2354                         return
2355
2356                 results = []
2357
2358                 idoc = xml.etree.ElementTree.fromstring(indexXml)
2359                 itemEls = idoc.findall('.//item')
2360                 for itemEl in itemEls:
2361                         mediaId = itemEl.findall('./guid')[0].text
2362                         shortMediaId = mediaId.split(':')[-1]
2363                         showId = mediaId.split(':')[-2].replace('.com', '')
2364                         officialTitle = itemEl.findall('./title')[0].text
2365                         officialDate = itemEl.findall('./pubDate')[0].text
2366
2367                         configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2368                                                 urllib.urlencode({'uri': mediaId}))
2369                         configReq = urllib2.Request(configUrl)
2370                         self.report_config_download(epTitle)
2371                         try:
2372                                 configXml = urllib2.urlopen(configReq).read()
2373                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2374                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2375                                 return
2376
2377                         cdoc = xml.etree.ElementTree.fromstring(configXml)
2378                         turls = []
2379                         for rendition in cdoc.findall('.//rendition'):
2380                                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2381                                 turls.append(finfo)
2382
2383                         if len(turls) == 0:
2384                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2385                                 continue
2386                         
2387                         if self._downloader.params.get('listformats', None):
2388                                 self._print_formats([i[0] for i in turls])
2389                                 return
2390
2391                         # For now, just pick the highest bitrate
2392                         format,video_url = turls[-1]
2393
2394                         # Get the format arg from the arg stream
2395                         req_format = self._downloader.params.get('format', None)
2396
2397                         # Select format if we can find one
2398                         for f,v in turls:
2399                                 if f == req_format:
2400                                         format, video_url = f, v
2401                                         break
2402
2403                         # Patch to download from alternative CDN, which does not
2404                         # break on current RTMPDump builds
2405                         broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
2406                         better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
2407
2408                         if video_url.startswith(broken_cdn):
2409                                 video_url = video_url.replace(broken_cdn, better_cdn)
2410
2411                         effTitle = showId + u'-' + epTitle
2412                         info = {
2413                                 'id': shortMediaId,
2414                                 'url': video_url,
2415                                 'uploader': showId,
2416                                 'upload_date': officialDate,
2417                                 'title': effTitle,
2418                                 'ext': 'mp4',
2419                                 'format': format,
2420                                 'thumbnail': None,
2421                                 'description': officialTitle,
2422                                 'player_url': None #playerUrl
2423                         }
2424
2425                         results.append(info)
2426                         
2427                 return results
2428
2429
2430 class EscapistIE(InfoExtractor):
2431         """Information extractor for The Escapist """
2432
2433         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2434         IE_NAME = u'escapist'
2435
2436         def report_extraction(self, showName):
2437                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2438
2439         def report_config_download(self, showName):
2440                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2441
2442         def _real_extract(self, url):
2443                 mobj = re.match(self._VALID_URL, url)
2444                 if mobj is None:
2445                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2446                         return
2447                 showName = mobj.group('showname')
2448                 videoId = mobj.group('episode')
2449
2450                 self.report_extraction(showName)
2451                 try:
2452                         webPage = urllib2.urlopen(url)
2453                         webPageBytes = webPage.read()
2454                         m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2455                         webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2456                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2457                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2458                         return
2459
2460                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2461                 description = unescapeHTML(descMatch.group(1))
2462                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2463                 imgUrl = unescapeHTML(imgMatch.group(1))
2464                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2465                 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2466                 configUrlMatch = re.search('config=(.*)$', playerUrl)
2467                 configUrl = urllib2.unquote(configUrlMatch.group(1))
2468
2469                 self.report_config_download(showName)
2470                 try:
2471                         configJSON = urllib2.urlopen(configUrl).read()
2472                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2473                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2474                         return
2475
2476                 # Technically, it's JavaScript, not JSON
2477                 configJSON = configJSON.replace("'", '"')
2478
2479                 try:
2480                         config = json.loads(configJSON)
2481                 except (ValueError,), err:
2482                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2483                         return
2484
2485                 playlist = config['playlist']
2486                 videoUrl = playlist[1]['url']
2487
2488                 info = {
2489                         'id': videoId,
2490                         'url': videoUrl,
2491                         'uploader': showName,
2492                         'upload_date': u'NA',
2493                         'title': showName,
2494                         'ext': 'flv',
2495                         'thumbnail': imgUrl,
2496                         'description': description,
2497                         'player_url': playerUrl,
2498                 }
2499
2500                 return [info]
2501
2502
2503 class CollegeHumorIE(InfoExtractor):
2504         """Information extractor for collegehumor.com"""
2505
2506         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2507         IE_NAME = u'collegehumor'
2508
2509         def report_webpage(self, video_id):
2510                 """Report information extraction."""
2511                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2512
2513         def report_extraction(self, video_id):
2514                 """Report information extraction."""
2515                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2516
2517         def _real_extract(self, url):
2518                 mobj = re.match(self._VALID_URL, url)
2519                 if mobj is None:
2520                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2521                         return
2522                 video_id = mobj.group('videoid')
2523
2524                 self.report_webpage(video_id)
2525                 request = urllib2.Request(url)
2526                 try:
2527                         webpage = urllib2.urlopen(request).read()
2528                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2529                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2530                         return
2531
2532                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2533                 if m is None:
2534                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2535                         return
2536                 internal_video_id = m.group('internalvideoid')
2537
2538                 info = {
2539                         'id': video_id,
2540                         'internal_id': internal_video_id,
2541                         'uploader': u'NA',
2542                         'upload_date': u'NA',
2543                 }
2544
2545                 self.report_extraction(video_id)
2546                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2547                 try:
2548                         metaXml = urllib2.urlopen(xmlUrl).read()
2549                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2550                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2551                         return
2552
2553                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2554                 try:
2555                         videoNode = mdoc.findall('./video')[0]
2556                         info['description'] = videoNode.findall('./description')[0].text
2557                         info['title'] = videoNode.findall('./caption')[0].text
2558                         info['url'] = videoNode.findall('./file')[0].text
2559                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2560                         info['ext'] = info['url'].rpartition('.')[2]
2561                 except IndexError:
2562                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2563                         return
2564
2565                 return [info]
2566
2567
2568 class XVideosIE(InfoExtractor):
2569         """Information extractor for xvideos.com"""
2570
2571         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2572         IE_NAME = u'xvideos'
2573
2574         def report_webpage(self, video_id):
2575                 """Report information extraction."""
2576                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2577
2578         def report_extraction(self, video_id):
2579                 """Report information extraction."""
2580                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2581
2582         def _real_extract(self, url):
2583                 mobj = re.match(self._VALID_URL, url)
2584                 if mobj is None:
2585                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2586                         return
2587                 video_id = mobj.group(1).decode('utf-8')
2588
2589                 self.report_webpage(video_id)
2590
2591                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2592                 try:
2593                         webpage = urllib2.urlopen(request).read()
2594                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2595                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2596                         return
2597
2598                 self.report_extraction(video_id)
2599
2600
2601                 # Extract video URL
2602                 mobj = re.search(r'flv_url=(.+?)&', webpage)
2603                 if mobj is None:
2604                         self._downloader.trouble(u'ERROR: unable to extract video url')
2605                         return
2606                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2607
2608
2609                 # Extract title
2610                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2611                 if mobj is None:
2612                         self._downloader.trouble(u'ERROR: unable to extract video title')
2613                         return
2614                 video_title = mobj.group(1).decode('utf-8')
2615
2616
2617                 # Extract video thumbnail
2618                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2619                 if mobj is None:
2620                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2621                         return
2622                 video_thumbnail = mobj.group(0).decode('utf-8')
2623
2624                 info = {
2625                         'id': video_id,
2626                         'url': video_url,
2627                         'uploader': u'NA',
2628                         'upload_date': u'NA',
2629                         'title': video_title,
2630                         'ext': 'flv',
2631                         'thumbnail': video_thumbnail,
2632                         'description': None,
2633                 }
2634
2635                 return [info]
2636
2637
2638 class SoundcloudIE(InfoExtractor):
2639         """Information extractor for soundcloud.com
2640            To access the media, the uid of the song and a stream token
2641            must be extracted from the page source and the script must make
2642            a request to media.soundcloud.com/crossdomain.xml. Then
2643            the media can be grabbed by requesting from an url composed
2644            of the stream token and uid
2645          """
2646
2647         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2648         IE_NAME = u'soundcloud'
2649
2650         def __init__(self, downloader=None):
2651                 InfoExtractor.__init__(self, downloader)
2652
2653         def report_webpage(self, video_id):
2654                 """Report information extraction."""
2655                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2656
2657         def report_extraction(self, video_id):
2658                 """Report information extraction."""
2659                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2660
2661         def _real_extract(self, url):
2662                 mobj = re.match(self._VALID_URL, url)
2663                 if mobj is None:
2664                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2665                         return
2666
2667                 # extract uploader (which is in the url)
2668                 uploader = mobj.group(1).decode('utf-8')
2669                 # extract simple title (uploader + slug of song title)
2670                 slug_title =  mobj.group(2).decode('utf-8')
2671                 simple_title = uploader + u'-' + slug_title
2672
2673                 self.report_webpage('%s/%s' % (uploader, slug_title))
2674
2675                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2676                 try:
2677                         webpage = urllib2.urlopen(request).read()
2678                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2679                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2680                         return
2681
2682                 self.report_extraction('%s/%s' % (uploader, slug_title))
2683
2684                 # extract uid and stream token that soundcloud hands out for access
2685                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2686                 if mobj:
2687                         video_id = mobj.group(1)
2688                         stream_token = mobj.group(2)
2689
2690                 # extract unsimplified title
2691                 mobj = re.search('"title":"(.*?)",', webpage)
2692                 if mobj:
2693                         title = mobj.group(1).decode('utf-8')
2694                 else:
2695                         title = simple_title
2696
2697                 # construct media url (with uid/token)
2698                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2699                 mediaURL = mediaURL % (video_id, stream_token)
2700
2701                 # description
2702                 description = u'No description available'
2703                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2704                 if mobj:
2705                         description = mobj.group(1)
2706
2707                 # upload date
2708                 upload_date = u'NA'
2709                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2710                 if mobj:
2711                         try:
2712                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2713                         except Exception, e:
2714                                 self._downloader.to_stderr(compat_str(e))
2715
2716                 # for soundcloud, a request to a cross domain is required for cookies
2717                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2718
2719                 return [{
2720                         'id':           video_id.decode('utf-8'),
2721                         'url':          mediaURL,
2722                         'uploader':     uploader.decode('utf-8'),
2723                         'upload_date':  upload_date,
2724                         'title':        title,
2725                         'ext':          u'mp3',
2726                         'description': description.decode('utf-8')
2727                 }]
2728
2729
2730 class InfoQIE(InfoExtractor):
2731         """Information extractor for infoq.com"""
2732
2733         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2734         IE_NAME = u'infoq'
2735
2736         def report_webpage(self, video_id):
2737                 """Report information extraction."""
2738                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2739
2740         def report_extraction(self, video_id):
2741                 """Report information extraction."""
2742                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2743
2744         def _real_extract(self, url):
2745                 mobj = re.match(self._VALID_URL, url)
2746                 if mobj is None:
2747                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2748                         return
2749
2750                 self.report_webpage(url)
2751
2752                 request = urllib2.Request(url)
2753                 try:
2754                         webpage = urllib2.urlopen(request).read()
2755                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2756                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2757                         return
2758
2759                 self.report_extraction(url)
2760
2761
2762                 # Extract video URL
2763                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2764                 if mobj is None:
2765                         self._downloader.trouble(u'ERROR: unable to extract video url')
2766                         return
2767                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2768
2769
2770                 # Extract title
2771                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2772                 if mobj is None:
2773                         self._downloader.trouble(u'ERROR: unable to extract video title')
2774                         return
2775                 video_title = mobj.group(1).decode('utf-8')
2776
2777                 # Extract description
2778                 video_description = u'No description available.'
2779                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2780                 if mobj is not None:
2781                         video_description = mobj.group(1).decode('utf-8')
2782
2783                 video_filename = video_url.split('/')[-1]
2784                 video_id, extension = video_filename.split('.')
2785
2786                 info = {
2787                         'id': video_id,
2788                         'url': video_url,
2789                         'uploader': u'NA',
2790                         'upload_date': u'NA',
2791                         'title': video_title,
2792                         'ext': extension, # Extension is always(?) mp4, but seems to be flv
2793                         'thumbnail': None,
2794                         'description': video_description,
2795                 }
2796
2797                 return [info]
2798
2799 class MixcloudIE(InfoExtractor):
2800         """Information extractor for www.mixcloud.com"""
2801         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2802         IE_NAME = u'mixcloud'
2803
2804         def __init__(self, downloader=None):
2805                 InfoExtractor.__init__(self, downloader)
2806
2807         def report_download_json(self, file_id):
2808                 """Report JSON download."""
2809                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2810
2811         def report_extraction(self, file_id):
2812                 """Report information extraction."""
2813                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2814
2815         def get_urls(self, jsonData, fmt, bitrate='best'):
2816                 """Get urls from 'audio_formats' section in json"""
2817                 file_url = None
2818                 try:
2819                         bitrate_list = jsonData[fmt]
2820                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2821                                 bitrate = max(bitrate_list) # select highest
2822
2823                         url_list = jsonData[fmt][bitrate]
2824                 except TypeError: # we have no bitrate info.
2825                         url_list = jsonData[fmt]
2826                 return url_list
2827
2828         def check_urls(self, url_list):
2829                 """Returns 1st active url from list"""
2830                 for url in url_list:
2831                         try:
2832                                 urllib2.urlopen(url)
2833                                 return url
2834                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2835                                 url = None
2836
2837                 return None
2838
2839         def _print_formats(self, formats):
2840                 print('Available formats:')
2841                 for fmt in formats.keys():
2842                         for b in formats[fmt]:
2843                                 try:
2844                                         ext = formats[fmt][b][0]
2845                                         print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2846                                 except TypeError: # we have no bitrate info
2847                                         ext = formats[fmt][0]
2848                                         print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2849                                         break
2850
2851         def _real_extract(self, url):
2852                 mobj = re.match(self._VALID_URL, url)
2853                 if mobj is None:
2854                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2855                         return
2856                 # extract uploader & filename from url
2857                 uploader = mobj.group(1).decode('utf-8')
2858                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2859
2860                 # construct API request
2861                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2862                 # retrieve .json file with links to files
2863                 request = urllib2.Request(file_url)
2864                 try:
2865                         self.report_download_json(file_url)
2866                         jsonData = urllib2.urlopen(request).read()
2867                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2868                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2869                         return
2870
2871                 # parse JSON
2872                 json_data = json.loads(jsonData)
2873                 player_url = json_data['player_swf_url']
2874                 formats = dict(json_data['audio_formats'])
2875
2876                 req_format = self._downloader.params.get('format', None)
2877                 bitrate = None
2878
2879                 if self._downloader.params.get('listformats', None):
2880                         self._print_formats(formats)
2881                         return
2882
2883                 if req_format is None or req_format == 'best':
2884                         for format_param in formats.keys():
2885                                 url_list = self.get_urls(formats, format_param)
2886                                 # check urls
2887                                 file_url = self.check_urls(url_list)
2888                                 if file_url is not None:
2889                                         break # got it!
2890                 else:
2891                         if req_format not in formats.keys():
2892                                 self._downloader.trouble(u'ERROR: format is not available')
2893                                 return
2894
2895                         url_list = self.get_urls(formats, req_format)
2896                         file_url = self.check_urls(url_list)
2897                         format_param = req_format
2898
2899                 return [{
2900                         'id': file_id.decode('utf-8'),
2901                         'url': file_url.decode('utf-8'),
2902                         'uploader':     uploader.decode('utf-8'),
2903                         'upload_date': u'NA',
2904                         'title': json_data['name'],
2905                         'ext': file_url.split('.')[-1].decode('utf-8'),
2906                         'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2907                         'thumbnail': json_data['thumbnail_url'],
2908                         'description': json_data['description'],
2909                         'player_url': player_url.decode('utf-8'),
2910                 }]
2911
2912 class StanfordOpenClassroomIE(InfoExtractor):
2913         """Information extractor for Stanford's Open ClassRoom"""
2914
2915         _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2916         IE_NAME = u'stanfordoc'
2917
2918         def report_download_webpage(self, objid):
2919                 """Report information extraction."""
2920                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2921
2922         def report_extraction(self, video_id):
2923                 """Report information extraction."""
2924                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2925
2926         def _real_extract(self, url):
2927                 mobj = re.match(self._VALID_URL, url)
2928                 if mobj is None:
2929                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2930                         return
2931
2932                 if mobj.group('course') and mobj.group('video'): # A specific video
2933                         course = mobj.group('course')
2934                         video = mobj.group('video')
2935                         info = {
2936                                 'id': course + '_' + video,
2937                                 'uploader': u'NA',
2938                                 'upload_date': u'NA',
2939                         }
2940
2941                         self.report_extraction(info['id'])
2942                         baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2943                         xmlUrl = baseUrl + video + '.xml'
2944                         try:
2945                                 metaXml = urllib2.urlopen(xmlUrl).read()
2946                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2947                                 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2948                                 return
2949                         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2950                         try:
2951                                 info['title'] = mdoc.findall('./title')[0].text
2952                                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2953                         except IndexError:
2954                                 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2955                                 return
2956                         info['ext'] = info['url'].rpartition('.')[2]
2957                         return [info]
2958                 elif mobj.group('course'): # A course page
2959                         course = mobj.group('course')
2960                         info = {
2961                                 'id': course,
2962                                 'type': 'playlist',
2963                                 'uploader': u'NA',
2964                                 'upload_date': u'NA',
2965                         }
2966
2967                         self.report_download_webpage(info['id'])
2968                         try:
2969                                 coursepage = urllib2.urlopen(url).read()
2970                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2971                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2972                                 return
2973
2974                         m = re.search('<h1>([^<]+)</h1>', coursepage)
2975                         if m:
2976                                 info['title'] = unescapeHTML(m.group(1))
2977                         else:
2978                                 info['title'] = info['id']
2979
2980                         m = re.search('<description>([^<]+)</description>', coursepage)
2981                         if m:
2982                                 info['description'] = unescapeHTML(m.group(1))
2983
2984                         links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2985                         info['list'] = [
2986                                 {
2987                                         'type': 'reference',
2988                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2989                                 }
2990                                         for vpage in links]
2991                         results = []
2992                         for entry in info['list']:
2993                                 assert entry['type'] == 'reference'
2994                                 results += self.extract(entry['url'])
2995                         return results
2996                         
2997                 else: # Root page
2998                         info = {
2999                                 'id': 'Stanford OpenClassroom',
3000                                 'type': 'playlist',
3001                                 'uploader': u'NA',
3002                                 'upload_date': u'NA',
3003                         }
3004
3005                         self.report_download_webpage(info['id'])
3006                         rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3007                         try:
3008                                 rootpage = urllib2.urlopen(rootURL).read()
3009                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3010                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3011                                 return
3012
3013                         info['title'] = info['id']
3014
3015                         links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3016                         info['list'] = [
3017                                 {
3018                                         'type': 'reference',
3019                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3020                                 }
3021                                         for cpage in links]
3022
3023                         results = []
3024                         for entry in info['list']:
3025                                 assert entry['type'] == 'reference'
3026                                 results += self.extract(entry['url'])
3027                         return results
3028
3029 class MTVIE(InfoExtractor):
3030         """Information extractor for MTV.com"""
3031
3032         _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3033         IE_NAME = u'mtv'
3034
3035         def report_webpage(self, video_id):
3036                 """Report information extraction."""
3037                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3038
3039         def report_extraction(self, video_id):
3040                 """Report information extraction."""
3041                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3042
3043         def _real_extract(self, url):
3044                 mobj = re.match(self._VALID_URL, url)
3045                 if mobj is None:
3046                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3047                         return
3048                 if not mobj.group('proto'):
3049                         url = 'http://' + url
3050                 video_id = mobj.group('videoid')
3051                 self.report_webpage(video_id)
3052
3053                 request = urllib2.Request(url)
3054                 try:
3055                         webpage = urllib2.urlopen(request).read()
3056                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3057                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
3058                         return
3059
3060                 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3061                 if mobj is None:
3062                         self._downloader.trouble(u'ERROR: unable to extract song name')
3063                         return
3064                 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3065                 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3066                 if mobj is None:
3067                         self._downloader.trouble(u'ERROR: unable to extract performer')
3068                         return
3069                 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3070                 video_title = performer + ' - ' + song_name 
3071
3072                 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3073                 if mobj is None:
3074                         self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3075                         return
3076                 mtvn_uri = mobj.group(1)
3077
3078                 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3079                 if mobj is None:
3080                         self._downloader.trouble(u'ERROR: unable to extract content id')
3081                         return
3082                 content_id = mobj.group(1)
3083
3084                 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3085                 self.report_extraction(video_id)
3086                 request = urllib2.Request(videogen_url)
3087                 try:
3088                         metadataXml = urllib2.urlopen(request).read()
3089                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3090                         self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3091                         return
3092
3093                 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3094                 renditions = mdoc.findall('.//rendition')
3095
3096                 # For now, always pick the highest quality.
3097                 rendition = renditions[-1]
3098
3099                 try:
3100                         _,_,ext = rendition.attrib['type'].partition('/')
3101                         format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3102                         video_url = rendition.find('./src').text
3103                 except KeyError:
3104                         self._downloader.trouble('Invalid rendition field.')
3105                         return
3106
3107                 info = {
3108                         'id': video_id,
3109                         'url': video_url,
3110                         'uploader': performer,
3111                         'upload_date': u'NA',
3112                         'title': video_title,
3113                         'ext': ext,
3114                         'format': format,
3115                 }
3116
3117                 return [info]
3118
3119
class YoukuIE(InfoExtractor):
        """Information extractor for v.youku.com"""

        _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
        IE_NAME = u'Youku'

        def __init__(self, downloader=None):
                InfoExtractor.__init__(self, downloader)

        def report_download_webpage(self, file_id):
                """Report webpage download."""
                self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

        def report_extraction(self, file_id):
                """Report information extraction."""
                self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

        def _gen_sid(self):
                """Generate a pseudo-random session id for the getFlvPath request."""
                nowTime = int(time.time() * 1000)
                random1 = random.randint(1000,1998)
                random2 = random.randint(1000,9999)

                return "%d%d%d" %(nowTime,random1,random2)

        def _get_file_ID_mix_string(self, seed):
                """Build the character permutation used to decode stream file ids.

                The server-provided numeric seed drives a linear-congruential
                shuffle of a fixed alphabet; the resulting list maps the
                '*'-separated indices in 'streamfileids' back to characters.
                """
                mixed = []
                source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
                seed = float(seed)
                for i in range(len(source)):
                        seed  =  (seed * 211 + 30031 ) % 65536
                        index  =  math.floor(seed / 65536 * len(source) )
                        mixed.append(source[int(index)])
                        source.remove(source[int(index)])
                #return ''.join(mixed)
                return mixed

        def _get_file_id(self, fileId, seed):
                """Decode the obfuscated file id using the seed-derived alphabet."""
                mixed = self._get_file_ID_mix_string(seed)
                ids = fileId.split('*')
                realId = []
                for ch in ids:
                        if ch:
                                realId.append(mixed[int(ch)])
                return ''.join(realId)

        def _real_extract(self, url):
                """Extract one info dictionary per video segment.

                Youku serves videos in numbered segments; a separate download
                URL is built for each, so a list of per-segment info dicts is
                returned (or None after trouble() on failure).
                """
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
                        return
                video_id = mobj.group('ID')

                info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

                request = urllib2.Request(info_url, None, std_headers)
                try:
                        self.report_download_webpage(video_id)
                        jsondata = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                        return

                self.report_extraction(video_id)
                try:
                        config = json.loads(jsondata)

                        video_title =  config['data'][0]['title']
                        seed = config['data'][0]['seed']

                        format = self._downloader.params.get('format', None)
                        supported_format = config['data'][0]['streamfileids'].keys()

                        if format is None or format == 'best':
                                if 'hd2' in supported_format:
                                        format = 'hd2'
                                else:
                                        format = 'flv'
                                ext = u'flv'
                        elif format == 'worst':
                                format = 'mp4'
                                ext = u'mp4'
                        else:
                                format = 'flv'
                                ext = u'flv'


                        fileid = config['data'][0]['streamfileids'][format]
                        seg_number = len(config['data'][0]['segs'][format])

                        keys=[]
                        for i in xrange(seg_number):
                                keys.append(config['data'][0]['segs'][format][i]['k'])

                        #TODO check error
                        #youku only could be viewed from mainland china
                except (ValueError, KeyError, IndexError):
                        # was a bare 'except:', which also swallowed
                        # KeyboardInterrupt/SystemExit; only the JSON decode
                        # (ValueError) and the dict/list accesses above can
                        # legitimately fail here.
                        self._downloader.trouble(u'ERROR: unable to extract info section')
                        return

                files_info=[]
                sid = self._gen_sid()
                fileid = self._get_file_id(fileid, seed)

                #column 8,9 of fileid represent the segment number
                #fileid[7:9] should be changed
                for index, key in enumerate(keys):

                        temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
                        download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

                        info = {
                                'id': '%s_part%02d' % (video_id, index),
                                'url': download_url,
                                'uploader': u'NA',
                                'upload_date': u'NA',
                                'title': video_title,
                                'ext': ext,
                        }
                        files_info.append(info)

                return files_info
3240
3241
3242 class XNXXIE(InfoExtractor):
3243         """Information extractor for xnxx.com"""
3244
3245         _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
3246         IE_NAME = u'xnxx'
3247         VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3248         VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3249         VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3250
3251         def report_webpage(self, video_id):
3252                 """Report information extraction"""
3253                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3254
3255         def report_extraction(self, video_id):
3256                 """Report information extraction"""
3257                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3258
3259         def _real_extract(self, url):
3260                 mobj = re.match(self._VALID_URL, url)
3261                 if mobj is None:
3262                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3263                         return
3264                 video_id = mobj.group(1).decode('utf-8')
3265
3266                 self.report_webpage(video_id)
3267
3268                 # Get webpage content
3269                 try:
3270                         webpage = urllib2.urlopen(url).read()
3271                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3272                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3273                         return
3274
3275                 result = re.search(self.VIDEO_URL_RE, webpage)
3276                 if result is None:
3277                         self._downloader.trouble(u'ERROR: unable to extract video url')
3278                         return
3279                 video_url = urllib.unquote(result.group(1).decode('utf-8'))
3280
3281                 result = re.search(self.VIDEO_TITLE_RE, webpage)
3282                 if result is None:
3283                         self._downloader.trouble(u'ERROR: unable to extract video title')
3284                         return
3285                 video_title = result.group(1).decode('utf-8')
3286
3287                 result = re.search(self.VIDEO_THUMB_RE, webpage)
3288                 if result is None:
3289                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3290                         return
3291                 video_thumbnail = result.group(1).decode('utf-8')
3292
3293                 return [{
3294                         'id': video_id,
3295                         'url': video_url,
3296                         'uploader': u'NA',
3297                         'upload_date': u'NA',
3298                         'title': video_title,
3299                         'ext': 'flv',
3300                         'thumbnail': video_thumbnail,
3301                         'description': None,
3302                 }]
3303
3304
3305 class GooglePlusIE(InfoExtractor):
3306         """Information extractor for plus.google.com."""
3307
3308         _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
3309         IE_NAME = u'plus.google'
3310
3311         def __init__(self, downloader=None):
3312                 InfoExtractor.__init__(self, downloader)
3313
3314         def report_extract_entry(self, url):
3315                 """Report downloading extry"""
3316                 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))
3317
3318         def report_date(self, upload_date):
3319                 """Report downloading extry"""
3320                 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3321
3322         def report_uploader(self, uploader):
3323                 """Report downloading extry"""
3324                 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))
3325
3326         def report_title(self, video_title):
3327                 """Report downloading extry"""
3328                 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))
3329
3330         def report_extract_vid_page(self, video_page):
3331                 """Report information extraction."""
3332                 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
3333
3334         def _real_extract(self, url):
3335                 # Extract id from URL
3336                 mobj = re.match(self._VALID_URL, url)
3337                 if mobj is None:
3338                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3339                         return
3340
3341                 post_url = mobj.group(0)
3342                 video_id = mobj.group(2)
3343
3344                 video_extension = 'flv'
3345
3346                 # Step 1, Retrieve post webpage to extract further information
3347                 self.report_extract_entry(post_url)
3348                 request = urllib2.Request(post_url)
3349                 try:
3350                         webpage = urllib2.urlopen(request).read()
3351                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3352                         self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3353                         return
3354
3355                 # Extract update date
3356                 upload_date = u'NA'
3357                 pattern = 'title="Timestamp">(.*?)</a>'
3358                 mobj = re.search(pattern, webpage)
3359                 if mobj:
3360                         upload_date = mobj.group(1)
3361                         # Convert timestring to a format suitable for filename
3362                         upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3363                         upload_date = upload_date.strftime('%Y%m%d')
3364                 self.report_date(upload_date)
3365
3366                 # Extract uploader
3367                 uploader = u'NA'
3368                 pattern = r'rel\="author".*?>(.*?)</a>'
3369                 mobj = re.search(pattern, webpage)
3370                 if mobj:
3371                         uploader = mobj.group(1)
3372                 self.report_uploader(uploader)
3373
3374                 # Extract title
3375                 # Get the first line for title
3376                 video_title = u'NA'
3377                 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3378                 mobj = re.search(pattern, webpage)
3379                 if mobj:
3380                         video_title = mobj.group(1)
3381                 self.report_title(video_title)
3382
3383                 # Step 2, Stimulate clicking the image box to launch video
3384                 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3385                 mobj = re.search(pattern, webpage)
3386                 if mobj is None:
3387                         self._downloader.trouble(u'ERROR: unable to extract video page URL')
3388
3389                 video_page = mobj.group(1)
3390                 request = urllib2.Request(video_page)
3391                 try:
3392                         webpage = urllib2.urlopen(request).read()
3393                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3394                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3395                         return
3396                 self.report_extract_vid_page(video_page)
3397
3398
3399                 # Extract video links on video page
3400                 """Extract video links of all sizes"""
3401                 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3402                 mobj = re.findall(pattern, webpage)
3403                 if len(mobj) == 0:
3404                         self._downloader.trouble(u'ERROR: unable to extract video links')
3405
3406                 # Sort in resolution
3407                 links = sorted(mobj)
3408
3409                 # Choose the lowest of the sort, i.e. highest resolution
3410                 video_url = links[-1]
3411                 # Only get the url. The resolution part in the tuple has no use anymore
3412                 video_url = video_url[-1]
3413                 # Treat escaped \u0026 style hex
3414                 video_url = unicode(video_url, "unicode_escape")
3415
3416
3417                 return [{
3418                         'id':           video_id.decode('utf-8'),
3419                         'url':          video_url,
3420                         'uploader':     uploader.decode('utf-8'),
3421                         'upload_date':  upload_date.decode('utf-8'),
3422                         'title':        video_title.decode('utf-8'),
3423                         'ext':          video_extension.decode('utf-8'),
3424                 }]