Document info_dict['subtitles'] and info_dict['urlhandle'].
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import datetime
5 import HTMLParser
6 import httplib
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import urllib
13 import urllib2
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 from urlparse import parse_qs
19
20 try:
21         import cStringIO as StringIO
22 except ImportError:
23         import StringIO
24
25 from utils import *
26
27
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor receives a URL and produces a list of
	dictionaries describing the video(s) that URL refers to: the real
	media URL, title, uploader and so on. The FileDownloader consumes
	these dictionaries and decides what to do with them (usually,
	download the media to disk).

	Mandatory dictionary fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader, unescaped.
	upload_date:    Video upload date (YYYYMMDD).
	title:          Video title, unescaped.
	ext:            Video filename extension.

	Optional fields:

	format:         The video format, defaults to ext (used for --get-format)
	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.
	player_url:     SWF Player URL (used for rtmpdump).
	subtitles:      The .srt file contents.
	urlhandle:      [internal] The urlHandle to be used to download the file,
	                like returned by urllib2.urlopen

	All fields should be Unicode strings.

	Subclasses must define a _VALID_URL regexp and override
	_real_initialize() and _real_extract(); _real_extract() must return
	a *list* of dictionaries shaped as above. New subclasses should also
	be registered in the extractor list.
	"""

	# One-shot initialization flag, flipped by initialize().
	_ready = False
	# FileDownloader instance (or None) used for output and options.
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Receives a URL and returns True if suitable for this IE."""
		return bool(re.match(self._VALID_URL, url))

	def initialize(self):
		"""Initializes an instance (authentication, etc) exactly once."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
103
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Verbose regex (matched with re.VERBOSE below). Group 1 covers the
	# optional scheme/host/path prefix; group 2 is the video ID itself.
	_VALID_URL = r"""^
	                 (
	                     (?:https?://)?                                       # http(s):// (optional)
	                     (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
	                        tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
	                     (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
	                     (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
	                     (?:                                                  # the various things that can precede the ID:
	                         (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
	                         |(?:                                             # or the v= param in all its forms
	                             (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
	                             (?:\?|\#!?)                                  # the params delimiter ? or # or #!
	                             (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
	                             v=
	                         )
	                     )?                                                   # optional -> youtube.com/xxxx is OK
	                 )?                                                       # all until now is optional -> you can pass the naked ID
	                 ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
	                 (?(1).+)?                                                # if we found the ID, everything can follow
	                 $"""
	# Forces English pages so the scraping regexes below (upload date,
	# description) see predictable markup.
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Captures the next_url= parameter of age-verification redirect URLs.
	_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
	_available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
	# itag -> container extension; anything not listed falls back to 'flv'.
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'44': 'webm',
		'45': 'webm',
		'46': 'webm',
	}
	# itag -> "HEIGHTxWIDTH" string shown by --list-formats (note: height first).
	_video_dimensions = {
		'5': '240x400',
		'6': '???',
		'13': '???',
		'17': '144x176',
		'18': '360x640',
		'22': '720x1280',
		'34': '360x640',
		'35': '480x854',
		'37': '1080x1920',
		'38': '3072x4096',
		'43': '360x640',
		'44': '480x854',
		'45': '720x1280',
		'46': '1080x1920',
	}
	IE_NAME = u'youtube'

	def suitable(self, url):
		"""Receives a URL and returns True if suitable for this IE."""
		# Overridden because _VALID_URL is a commented (verbose) pattern.
		return re.match(self._VALID_URL, url, re.VERBOSE) is not None

	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_screen(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_screen(u'[youtube] Logging in')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[youtube] Confirming age')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

	def report_video_subtitles_download(self, video_id):
		"""Report attempt to download video subtitles."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

	def report_unavailable_format(self, video_id, format):
		"""Report extracted video URL."""
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_screen(u'[youtube] RTMP download detected')

	def _closed_captions_xml_to_srt(self, xml_string):
		"""Convert YouTube's timedtext XML into SubRip (.srt) text."""
		srt = ''
		texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
		# TODO parse xml instead of regex
		for n, (start, dur_tag, dur, caption) in enumerate(texts):
			if not dur: dur = '4'  # captions without an explicit duration get 4 seconds
			start = float(start)
			end = start + float(dur)
			# SRT timestamps are HH:MM:SS,mmm (comma before the milliseconds)
			start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
			end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
			caption = unescapeHTML(caption)
			caption = unescapeHTML(caption) # double cycle, intentional
			srt += str(n+1) + '\n'  # SRT cues are numbered from 1
			srt += start + ' --> ' + end + '\n'
			srt += caption + '\n\n'
		return srt

	def _print_formats(self, formats):
		# Used by --list-formats: itag, extension and dimensions per format.
		print('Available formats:')
		for x in formats:
			print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

	def _real_initialize(self):
		"""Set English language, optionally log in and confirm age."""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				# Bad .netrc is only a warning: extraction still works anonymously.
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
				return

		# Set language
		# NOTE(review): relies on cookies being persisted by a cookie-enabled
		# global urllib2 opener installed elsewhere — confirm against the
		# program setup code.
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':         '/',
				'action_login': 'Log In',
				'username':     username,
				'password':     password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present, the credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
			return

		# Confirm age
		age_form = {
				'next_url':             '/',
				'action_confirm':       'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
			return

	def _real_extract(self, url):
		"""Download the watch page and get_video_info, pick the format(s)
		to download and return a list of info dictionaries."""
		# Extract original video URL from URL with redirection, like age verification, using next_url parameter
		mobj = re.search(self._NEXT_URL_RE, url)
		if mobj:
			url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url, re.VERBOSE)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
		try:
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
			return

		# Attempt to extract SWF player URL
		mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			# Un-escape the JSON-style backslash escaping in the matched URL.
			player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
		else:
			player_url = None

		# Get video info: try several 'el' contexts until one yields a token.
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					% (video_id, el_type))
			request = urllib2.Request(video_info_url)
			try:
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
					break
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
				return
		if 'token' not in video_info:
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
			else:
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
			return

		# Check for "rental" videos
		if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
			self._downloader.trouble(u'ERROR: "rental" videos not supported')
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')

		# thumbnail image
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# upload date: scraped from the watch page and normalized to YYYYMMDD.
		upload_date = u'NA'
		mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
		if mobj is not None:
			upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
			format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
			for expression in format_expressions:
				try:
					upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
				except:
					# NOTE(review): bare except and no break — after one format
					# matches, the remaining patterns simply fail and are
					# swallowed here. Works, but fragile.
					pass

		# description
		video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
		if video_description: video_description = clean_html(video_description)
		else: video_description = ''

		# closed captions (stored in info_dict['subtitles'] as .srt text)
		video_subtitles = None
		if self._downloader.params.get('writesubtitles', False):
			try:
				self.report_video_subtitles_download(video_id)
				request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
				try:
					srt_list = urllib2.urlopen(request).read()
				except (urllib2.URLError, httplib.HTTPException, socket.error), err:
					raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
				# Map language code -> track name for every available track.
				srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
				srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
				if not srt_lang_list:
					raise Trouble(u'WARNING: video has no closed captions')
				# Language preference: --subtitles-lang, then English, then any.
				if self._downloader.params.get('subtitleslang', False):
					srt_lang = self._downloader.params.get('subtitleslang')
				elif 'en' in srt_lang_list:
					srt_lang = 'en'
				else:
					# NOTE(review): dict.keys()[0] is Python-2-only and picks an
					# arbitrary language (dict ordering).
					srt_lang = srt_lang_list.keys()[0]
				if not srt_lang in srt_lang_list:
					raise Trouble(u'WARNING: no closed captions found in the specified language')
				request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
				try:
					srt_xml = urllib2.urlopen(request).read()
				except (urllib2.URLError, httplib.HTTPException, socket.error), err:
					raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
				if not srt_xml:
					raise Trouble(u'WARNING: unable to download video subtitles')
				video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
			except Trouble as trouble:
				# Subtitle problems are warnings, never fatal for the download.
				# NOTE(review): trouble[0] (indexing the exception) is
				# Python-2-only; Python 3 would need trouble.args[0].
				self._downloader.trouble(trouble[0])

		if 'length_seconds' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video duration')
			video_duration = ''
		else:
			video_duration = urllib.unquote_plus(video_info['length_seconds'][0])

		# token
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		req_format = self._downloader.params.get('format', None)

		if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]
		elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
			url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
			url_data = [parse_qs(uds) for uds in url_data_strs]
			url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
			# NOTE(review): assumes every stream entry carries a 'sig' field;
			# the filter above only checks 'itag'/'url', so a missing 'sig'
			# raises KeyError here. Consider filtering on 'sig' too.
			url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

			format_limit = self._downloader.params.get('format_limit', None)
			available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
			if format_limit is not None and format_limit in available_formats:
				format_list = available_formats[available_formats.index(format_limit):]
			else:
				format_list = available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if self._downloader.params.get('listformats', None):
				self._print_formats(existing_formats)
				return
			if req_format is None or req_format == 'best':
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == 'worst':
				video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
			elif req_format in ('-1', 'all'):
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			else:
				# Specific formats. We pick the first in a slash-delimeted sequence.
				# For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
				req_formats = req_format.split('/')
				video_url_list = None
				for rf in req_formats:
					if rf in url_map:
						video_url_list = [(rf, url_map[rf])]
						break
				if video_url_list is None:
					self._downloader.trouble(u'ERROR: requested format not available')
					return
		else:
			self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
			return

		# Build one info dictionary per selected format.
		results = []
		for format_param, video_real_url in video_url_list:
			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			# NOTE(review): '{} - {}' (auto-numbered fields) needs Python 2.7+;
			# format_param is None for RTMP downloads, hence the fallback.
			video_format = '{} - {}'.format(format_param.decode('utf-8') if format_param else video_extension.decode('utf-8'),
			                                    self._video_dimensions.get(format_param, '???'))

			results.append({
				'id':           video_id.decode('utf-8'),
				'url':          video_real_url.decode('utf-8'),
				'uploader':     video_uploader.decode('utf-8'),
				'upload_date':  upload_date,
				'title':        video_title,
				'ext':          video_extension.decode('utf-8'),
				'format':       video_format,
				'thumbnail':    video_thumbnail.decode('utf-8'),
				'description':  video_description,
				'player_url':   player_url,
				'subtitles':    video_subtitles,
				# NOTE(review): 'duration' is emitted but not listed in the
				# InfoExtractor docstring of optional fields.
				'duration':             video_duration
			})
		return results
503
504
505 class MetacafeIE(InfoExtractor):
506         """Information Extractor for metacafe.com."""
507
508         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
509         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
510         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
511         IE_NAME = u'metacafe'
512
	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		InfoExtractor.__init__(self, downloader)
515
	# Progress-reporting helpers: each routes a status line through the
	# downloader's screen output, prefixed with the extractor name.
	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
531
	def _real_initialize(self):
		"""Fetch the family-filter disclaimer page, then POST the age
		confirmation so subsequent requests see unfiltered content."""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER)
		try:
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
			return

		# Confirm age
		disclaimer_form = {
			'filters': '0',
			'submit': "Continue - I'm over 18",
			}
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
		try:
			self.report_age_confirmation()
			# NOTE(review): the response body is discarded; only the session
			# side effect (presumably a cookie) matters — confirm the global
			# opener persists cookies.
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
			return
554
555         def _real_extract(self, url):
556                 # Extract id and simplified title from URL
557                 mobj = re.match(self._VALID_URL, url)
558                 if mobj is None:
559                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
560                         return
561
562                 video_id = mobj.group(1)
563
564                 # Check if video comes from YouTube
565                 mobj2 = re.match(r'^yt-(.*)$', video_id)
566                 if mobj2 is not None:
567                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
568                         return
569
570                 # Retrieve video webpage to extract further information
571                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
572                 try:
573                         self.report_download_webpage(video_id)
574                         webpage = urllib2.urlopen(request).read()
575                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
576                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
577                         return
578
579                 # Extract URL, uploader and title from webpage
580                 self.report_extraction(video_id)
581                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
582                 if mobj is not None:
583                         mediaURL = urllib.unquote(mobj.group(1))
584                         video_extension = mediaURL[-3:]
585
586                         # Extract gdaKey if available
587                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
588                         if mobj is None:
589                                 video_url = mediaURL
590                         else:
591                                 gdaKey = mobj.group(1)
592                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
593                 else:
594                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
595                         if mobj is None:
596                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
597                                 return
598                         vardict = parse_qs(mobj.group(1))
599                         if 'mediaData' not in vardict:
600                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
601                                 return
602                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
603                         if mobj is None:
604                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
605                                 return
606                         mediaURL = mobj.group(1).replace('\\/', '/')
607                         video_extension = mediaURL[-3:]
608                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
609
610                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
611                 if mobj is None:
612                         self._downloader.trouble(u'ERROR: unable to extract title')
613                         return
614                 video_title = mobj.group(1).decode('utf-8')
615
616                 mobj = re.search(r'submitter=(.*?);', webpage)
617                 if mobj is None:
618                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
619                         return
620                 video_uploader = mobj.group(1)
621
622                 return [{
623                         'id':           video_id.decode('utf-8'),
624                         'url':          video_url.decode('utf-8'),
625                         'uploader':     video_uploader.decode('utf-8'),
626                         'upload_date':  u'NA',
627                         'title':        video_title,
628                         'ext':          video_extension.decode('utf-8'),
629                 }]
630
631
class DailymotionIE(InfoExtractor):
	"""Information Extractor for Dailymotion"""

	# Group 1 captures the id-plus-slug path component after /video/.
	_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
	IE_NAME = u'dailymotion'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		"""Extract the info dictionary for a Dailymotion video URL.

		Returns a one-element list containing the info dict, or None
		(after reporting trouble through the downloader) on failure.
		"""
		# Extract id and simplified title from URL.  The captured path
		# component looks like '<id>_<slug>'; keep only the id part and
		# drop any query string.
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1).split('_')[0].split('?')[0]

		video_extension = 'mp4'

		# Retrieve video webpage to extract further information.  The
		# cookie disables the family filter so restricted videos are
		# served as well.
		request = urllib2.Request(url)
		request.add_header('Cookie', 'family_filter=off')
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'\s*var flashvars = (.*)', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		flashvars = urllib.unquote(mobj.group(1))

		# Quality keys in order of preference (best first); take the
		# first one present in flashvars.  The for/else 'else' runs
		# only if no key matched at all.
		for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
			if key in flashvars:
				max_quality = key
				self._downloader.to_screen(u'[dailymotion] Using %s' % key)
				break
		else:
			self._downloader.trouble(u'ERROR: unable to extract video URL')
			return

		mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video URL')
			return

		# The flashvars blob escapes slashes as '\/'.
		video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')

		# TODO: support choosing qualities

		mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

		# Uploader: first try the regular owner span, then fall back to
		# the rel="author" span used for official users; if neither is
		# found, keep the u'NA' default and just warn.
		video_uploader = u'NA'
		mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
		if mobj is None:
			# lookin for official user
			mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
			if mobj_official is None:
				self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
			else:
				video_uploader = mobj_official.group(1)
		else:
			video_uploader = mobj.group(1)

		# Upload date: page shows DD-MM-YYYY; reorder to YYYYMMDD as
		# documented for the info dict.
		video_upload_date = u'NA'
		mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
		if mobj is not None:
			video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

		return [{
			'id':		video_id.decode('utf-8'),
			'url':		video_url.decode('utf-8'),
			'uploader':	video_uploader.decode('utf-8'),
			'upload_date':	video_upload_date,
			'title':	video_title,
			'ext':		video_extension.decode('utf-8'),
		}]
727
728
729 class GoogleIE(InfoExtractor):
730         """Information extractor for video.google.com."""
731
732         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
733         IE_NAME = u'video.google'
734
735         def __init__(self, downloader=None):
736                 InfoExtractor.__init__(self, downloader)
737
738         def report_download_webpage(self, video_id):
739                 """Report webpage download."""
740                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
741
742         def report_extraction(self, video_id):
743                 """Report information extraction."""
744                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
745
746         def _real_extract(self, url):
747                 # Extract id from URL
748                 mobj = re.match(self._VALID_URL, url)
749                 if mobj is None:
750                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
751                         return
752
753                 video_id = mobj.group(1)
754
755                 video_extension = 'mp4'
756
757                 # Retrieve video webpage to extract further information
758                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
759                 try:
760                         self.report_download_webpage(video_id)
761                         webpage = urllib2.urlopen(request).read()
762                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
763                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
764                         return
765
766                 # Extract URL, uploader, and title from webpage
767                 self.report_extraction(video_id)
768                 mobj = re.search(r"download_url:'([^']+)'", webpage)
769                 if mobj is None:
770                         video_extension = 'flv'
771                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
772                 if mobj is None:
773                         self._downloader.trouble(u'ERROR: unable to extract media URL')
774                         return
775                 mediaURL = urllib.unquote(mobj.group(1))
776                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
777                 mediaURL = mediaURL.replace('\\x26', '\x26')
778
779                 video_url = mediaURL
780
781                 mobj = re.search(r'<title>(.*)</title>', webpage)
782                 if mobj is None:
783                         self._downloader.trouble(u'ERROR: unable to extract title')
784                         return
785                 video_title = mobj.group(1).decode('utf-8')
786
787                 # Extract video description
788                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
789                 if mobj is None:
790                         self._downloader.trouble(u'ERROR: unable to extract video description')
791                         return
792                 video_description = mobj.group(1).decode('utf-8')
793                 if not video_description:
794                         video_description = 'No description available.'
795
796                 # Extract video thumbnail
797                 if self._downloader.params.get('forcethumbnail', False):
798                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
799                         try:
800                                 webpage = urllib2.urlopen(request).read()
801                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
802                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
803                                 return
804                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
805                         if mobj is None:
806                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
807                                 return
808                         video_thumbnail = mobj.group(1)
809                 else:   # we need something to pass to process_info
810                         video_thumbnail = ''
811
812                 return [{
813                         'id':           video_id.decode('utf-8'),
814                         'url':          video_url.decode('utf-8'),
815                         'uploader':     u'NA',
816                         'upload_date':  u'NA',
817                         'title':        video_title,
818                         'ext':          video_extension.decode('utf-8'),
819                 }]
820
821
822 class PhotobucketIE(InfoExtractor):
823         """Information extractor for photobucket.com."""
824
825         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
826         IE_NAME = u'photobucket'
827
828         def __init__(self, downloader=None):
829                 InfoExtractor.__init__(self, downloader)
830
831         def report_download_webpage(self, video_id):
832                 """Report webpage download."""
833                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
834
835         def report_extraction(self, video_id):
836                 """Report information extraction."""
837                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
838
839         def _real_extract(self, url):
840                 # Extract id from URL
841                 mobj = re.match(self._VALID_URL, url)
842                 if mobj is None:
843                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
844                         return
845
846                 video_id = mobj.group(1)
847
848                 video_extension = 'flv'
849
850                 # Retrieve video webpage to extract further information
851                 request = urllib2.Request(url)
852                 try:
853                         self.report_download_webpage(video_id)
854                         webpage = urllib2.urlopen(request).read()
855                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
856                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
857                         return
858
859                 # Extract URL, uploader, and title from webpage
860                 self.report_extraction(video_id)
861                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
862                 if mobj is None:
863                         self._downloader.trouble(u'ERROR: unable to extract media URL')
864                         return
865                 mediaURL = urllib.unquote(mobj.group(1))
866
867                 video_url = mediaURL
868
869                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
870                 if mobj is None:
871                         self._downloader.trouble(u'ERROR: unable to extract title')
872                         return
873                 video_title = mobj.group(1).decode('utf-8')
874
875                 video_uploader = mobj.group(2).decode('utf-8')
876
877                 return [{
878                         'id':           video_id.decode('utf-8'),
879                         'url':          video_url.decode('utf-8'),
880                         'uploader':     video_uploader,
881                         'upload_date':  u'NA',
882                         'title':        video_title,
883                         'ext':          video_extension.decode('utf-8'),
884                 }]
885
886
887 class YahooIE(InfoExtractor):
888         """Information extractor for video.yahoo.com."""
889
890         # _VALID_URL matches all Yahoo! Video URLs
891         # _VPAGE_URL matches only the extractable '/watch/' URLs
892         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
893         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
894         IE_NAME = u'video.yahoo'
895
896         def __init__(self, downloader=None):
897                 InfoExtractor.__init__(self, downloader)
898
899         def report_download_webpage(self, video_id):
900                 """Report webpage download."""
901                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
902
903         def report_extraction(self, video_id):
904                 """Report information extraction."""
905                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
906
907         def _real_extract(self, url, new_video=True):
908                 # Extract ID from URL
909                 mobj = re.match(self._VALID_URL, url)
910                 if mobj is None:
911                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
912                         return
913
914                 video_id = mobj.group(2)
915                 video_extension = 'flv'
916
917                 # Rewrite valid but non-extractable URLs as
918                 # extractable English language /watch/ URLs
919                 if re.match(self._VPAGE_URL, url) is None:
920                         request = urllib2.Request(url)
921                         try:
922                                 webpage = urllib2.urlopen(request).read()
923                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
924                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
925                                 return
926
927                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
928                         if mobj is None:
929                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
930                                 return
931                         yahoo_id = mobj.group(1)
932
933                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
934                         if mobj is None:
935                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
936                                 return
937                         yahoo_vid = mobj.group(1)
938
939                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
940                         return self._real_extract(url, new_video=False)
941
942                 # Retrieve video webpage to extract further information
943                 request = urllib2.Request(url)
944                 try:
945                         self.report_download_webpage(video_id)
946                         webpage = urllib2.urlopen(request).read()
947                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
948                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
949                         return
950
951                 # Extract uploader and title from webpage
952                 self.report_extraction(video_id)
953                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
954                 if mobj is None:
955                         self._downloader.trouble(u'ERROR: unable to extract video title')
956                         return
957                 video_title = mobj.group(1).decode('utf-8')
958
959                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
960                 if mobj is None:
961                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
962                         return
963                 video_uploader = mobj.group(1).decode('utf-8')
964
965                 # Extract video thumbnail
966                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
967                 if mobj is None:
968                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
969                         return
970                 video_thumbnail = mobj.group(1).decode('utf-8')
971
972                 # Extract video description
973                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
974                 if mobj is None:
975                         self._downloader.trouble(u'ERROR: unable to extract video description')
976                         return
977                 video_description = mobj.group(1).decode('utf-8')
978                 if not video_description:
979                         video_description = 'No description available.'
980
981                 # Extract video height and width
982                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
983                 if mobj is None:
984                         self._downloader.trouble(u'ERROR: unable to extract video height')
985                         return
986                 yv_video_height = mobj.group(1)
987
988                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
989                 if mobj is None:
990                         self._downloader.trouble(u'ERROR: unable to extract video width')
991                         return
992                 yv_video_width = mobj.group(1)
993
994                 # Retrieve video playlist to extract media URL
995                 # I'm not completely sure what all these options are, but we
996                 # seem to need most of them, otherwise the server sends a 401.
997                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
998                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
999                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1000                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1001                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1002                 try:
1003                         self.report_download_webpage(video_id)
1004                         webpage = urllib2.urlopen(request).read()
1005                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1006                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1007                         return
1008
1009                 # Extract media URL from playlist XML
1010                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1011                 if mobj is None:
1012                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1013                         return
1014                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1015                 video_url = unescapeHTML(video_url)
1016
1017                 return [{
1018                         'id':           video_id.decode('utf-8'),
1019                         'url':          video_url,
1020                         'uploader':     video_uploader,
1021                         'upload_date':  u'NA',
1022                         'title':        video_title,
1023                         'ext':          video_extension.decode('utf-8'),
1024                         'thumbnail':    video_thumbnail.decode('utf-8'),
1025                         'description':  video_description,
1026                 }]
1027
1028
class VimeoIE(InfoExtractor):
	"""Information extractor for vimeo.com."""

	# _VALID_URL matches Vimeo URLs
	_VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
	IE_NAME = u'vimeo'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

	def _real_extract(self, url, new_video=True):
		"""Extract the info dictionary for a Vimeo video URL.

		Returns a one-element list containing the info dict, or None
		(after reporting trouble through the downloader) on failure.
		"""
		# Extract ID from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url, None, std_headers)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
			return

		# Now we begin extracting as much information as we can from what we
		# retrieved. First we extract the information common to all extractors,
		# and latter we extract those that are Vimeo specific.
		self.report_extraction(video_id)

		# Extract the config JSON by slicing it out of the page's inline
		# JavaScript.  NOTE(review): this substring surgery assumes the
		# markers ' = {config:' and ',assets:' both appear exactly as
		# written; an IndexError here would escape the except below only
		# if the first marker is missing.
		config = webpage.split(' = {config:')[1].split(',assets:')[0]
		try:
			config = json.loads(config)
		# NOTE(review): bare except hides the actual parse error; a
		# ValueError catch would be more precise.
		except:
			self._downloader.trouble(u'ERROR: unable to extract info section')
			return

		# Extract title
		video_title = config["video"]["title"]

		# Extract uploader
		video_uploader = config["video"]["owner"]["name"]

		# Extract video thumbnail
		video_thumbnail = config["video"]["thumbnail"]

		# Extract video description from the page HTML (not the config),
		# stripping the markup.
		video_description = get_element_by_id("description", webpage.decode('utf8'))
		if video_description: video_description = clean_html(video_description)
		else: video_description = ''

		# Extract upload date from the hidden clip-date span, if present.
		video_upload_date = u'NA'
		mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
		if mobj is not None:
			video_upload_date = mobj.group(1)

		# Vimeo specific: extract request signature and timestamp
		sig = config['request']['signature']
		timestamp = config['request']['timestamp']

		# Vimeo specific: extract video codec and quality information
		# First consider quality, then codecs, then take everything
		# TODO bind to format param
		codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
		files = { 'hd': [], 'sd': [], 'other': []}
		for codec_name, codec_extension in codecs:
			if codec_name in config["video"]["files"]:
				if 'hd' in config["video"]["files"][codec_name]:
					files['hd'].append((codec_name, codec_extension, 'hd'))
				elif 'sd' in config["video"]["files"][codec_name]:
					files['sd'].append((codec_name, codec_extension, 'sd'))
				else:
					files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

		# Pick the best available quality bucket; the for/else 'else'
		# fires only when every bucket is empty.
		for quality in ('hd', 'sd', 'other'):
			if len(files[quality]) > 0:
				video_quality = files[quality][0][2]
				video_codec = files[quality][0][0]
				video_extension = files[quality][0][1]
				self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
				break
		else:
			self._downloader.trouble(u'ERROR: no known codec found')
			return

		# Build the play_redirect URL from the signature/timestamp pair.
		video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
					%(video_id, sig, timestamp, video_quality, video_codec.upper())

		return [{
			'id':		video_id,
			'url':		video_url,
			'uploader':	video_uploader,
			'upload_date':	video_upload_date,
			'title':	video_title,
			'ext':		video_extension,
			'thumbnail':	video_thumbnail,
			'description':	video_description,
		}]
1140
1141
1142 class GenericIE(InfoExtractor):
1143         """Generic last-resort information extractor."""
1144
1145         _VALID_URL = r'.*'
1146         IE_NAME = u'generic'
1147
	def __init__(self, downloader=None):
		# Delegate to the shared InfoExtractor initialization.
		InfoExtractor.__init__(self, downloader)
1150
1151         def report_download_webpage(self, video_id):
1152                 """Report webpage download."""
1153                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1154                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1155
1156         def report_extraction(self, video_id):
1157                 """Report information extraction."""
1158                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1159
1160         def report_following_redirect(self, new_url):
1161                 """Report information extraction."""
1162                 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1163                 
1164         def _test_redirect(self, url):
1165                 """Check if it is a redirect, like url shorteners, in case restart chain."""
1166                 class HeadRequest(urllib2.Request):
1167                         def get_method(self):
1168                                 return "HEAD"
1169
1170                 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1171                         """
1172                         Subclass the HTTPRedirectHandler to make it use our 
1173                         HeadRequest also on the redirected URL
1174                         """
1175                         def redirect_request(self, req, fp, code, msg, headers, newurl): 
1176                                 if code in (301, 302, 303, 307):
1177                                         newurl = newurl.replace(' ', '%20') 
1178                                         newheaders = dict((k,v) for k,v in req.headers.items()
1179                                                                           if k.lower() not in ("content-length", "content-type"))
1180                                         return HeadRequest(newurl, 
1181                                                                            headers=newheaders,
1182                                                                            origin_req_host=req.get_origin_req_host(), 
1183                                                                            unverifiable=True) 
1184                                 else: 
1185                                         raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp) 
1186
1187                 class HTTPMethodFallback(urllib2.BaseHandler):
1188                         """
1189                         Fallback to GET if HEAD is not allowed (405 HTTP error)
1190                         """
1191                         def http_error_405(self, req, fp, code, msg, headers): 
1192                                 fp.read()
1193                                 fp.close()
1194
1195                                 newheaders = dict((k,v) for k,v in req.headers.items()
1196                                                                   if k.lower() not in ("content-length", "content-type"))
1197                                 return self.parent.open(urllib2.Request(req.get_full_url(), 
1198                                                                                                  headers=newheaders, 
1199                                                                                                  origin_req_host=req.get_origin_req_host(), 
1200                                                                                                  unverifiable=True))
1201
1202                 # Build our opener
1203                 opener = urllib2.OpenerDirector() 
1204                 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1205                                                 HTTPMethodFallback, HEADRedirectHandler,
1206                                                 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1207                         opener.add_handler(handler())
1208
1209                 response = opener.open(HeadRequest(url))
1210                 new_url = response.geturl()
1211                 
1212                 if url == new_url: return False
1213                 
1214                 self.report_following_redirect(new_url)
1215                 self._downloader.download([new_url])
1216                 return True
1217
1218         def _real_extract(self, url):
1219                 if self._test_redirect(url): return
1220
1221                 video_id = url.split('/')[-1]
1222                 request = urllib2.Request(url)
1223                 try:
1224                         self.report_download_webpage(video_id)
1225                         webpage = urllib2.urlopen(request).read()
1226                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1227                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1228                         return
1229                 except ValueError, err:
1230                         # since this is the last-resort InfoExtractor, if
1231                         # this error is thrown, it'll be thrown here
1232                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1233                         return
1234
1235                 self.report_extraction(video_id)
1236                 # Start with something easy: JW Player in SWFObject
1237                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1238                 if mobj is None:
1239                         # Broaden the search a little bit
1240                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1241                 if mobj is None:
1242                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1243                         return
1244
1245                 # It's possible that one of the regexes
1246                 # matched, but returned an empty group:
1247                 if mobj.group(1) is None:
1248                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1249                         return
1250
1251                 video_url = urllib.unquote(mobj.group(1))
1252                 video_id = os.path.basename(video_url)
1253
1254                 # here's a fun little line of code for you:
1255                 video_extension = os.path.splitext(video_id)[1][1:]
1256                 video_id = os.path.splitext(video_id)[0]
1257
1258                 # it's tempting to parse this further, but you would
1259                 # have to take into account all the variations like
1260                 #   Video Title - Site Name
1261                 #   Site Name | Video Title
1262                 #   Video Title - Tagline | Site Name
1263                 # and so on and so forth; it's just not practical
1264                 mobj = re.search(r'<title>(.*)</title>', webpage)
1265                 if mobj is None:
1266                         self._downloader.trouble(u'ERROR: unable to extract title')
1267                         return
1268                 video_title = mobj.group(1).decode('utf-8')
1269
1270                 # video uploader is domain name
1271                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1272                 if mobj is None:
1273                         self._downloader.trouble(u'ERROR: unable to extract title')
1274                         return
1275                 video_uploader = mobj.group(1).decode('utf-8')
1276
1277                 return [{
1278                         'id':           video_id.decode('utf-8'),
1279                         'url':          video_url.decode('utf-8'),
1280                         'uploader':     video_uploader,
1281                         'upload_date':  u'NA',
1282                         'title':        video_title,
1283                         'ext':          video_extension.decode('utf-8'),
1284                 }]
1285
1286
1287 class YoutubeSearchIE(InfoExtractor):
1288         """Information Extractor for YouTube search queries."""
1289         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1290         _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1291         _max_youtube_results = 1000
1292         IE_NAME = u'youtube:search'
1293
1294         def __init__(self, downloader=None):
1295                 InfoExtractor.__init__(self, downloader)
1296
1297         def report_download_page(self, query, pagenum):
1298                 """Report attempt to download search page with given number."""
1299                 query = query.decode(preferredencoding())
1300                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1301
1302         def _real_extract(self, query):
1303                 mobj = re.match(self._VALID_URL, query)
1304                 if mobj is None:
1305                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1306                         return
1307
1308                 prefix, query = query.split(':')
1309                 prefix = prefix[8:]
1310                 query = query.encode('utf-8')
1311                 if prefix == '':
1312                         self._download_n_results(query, 1)
1313                         return
1314                 elif prefix == 'all':
1315                         self._download_n_results(query, self._max_youtube_results)
1316                         return
1317                 else:
1318                         try:
1319                                 n = long(prefix)
1320                                 if n <= 0:
1321                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1322                                         return
1323                                 elif n > self._max_youtube_results:
1324                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1325                                         n = self._max_youtube_results
1326                                 self._download_n_results(query, n)
1327                                 return
1328                         except ValueError: # parsing prefix as integer fails
1329                                 self._download_n_results(query, 1)
1330                                 return
1331
1332         def _download_n_results(self, query, n):
1333                 """Downloads a specified number of results for a query"""
1334
1335                 video_ids = []
1336                 pagenum = 0
1337                 limit = n
1338
1339                 while (50 * pagenum) < limit:
1340                         self.report_download_page(query, pagenum+1)
1341                         result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1342                         request = urllib2.Request(result_url)
1343                         try:
1344                                 data = urllib2.urlopen(request).read()
1345                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1346                                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1347                                 return
1348                         api_response = json.loads(data)['data']
1349
1350                         new_ids = list(video['id'] for video in api_response['items'])
1351                         video_ids += new_ids
1352
1353                         limit = min(n, api_response['totalItems'])
1354                         pagenum += 1
1355
1356                 if len(video_ids) > n:
1357                         video_ids = video_ids[:n]
1358                 for id in video_ids:
1359                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1360                 return
1361
1362
1363 class GoogleSearchIE(InfoExtractor):
1364         """Information Extractor for Google Video search queries."""
1365         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1366         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1367         _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1368         _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1369         _max_google_results = 1000
1370         IE_NAME = u'video.google:search'
1371
1372         def __init__(self, downloader=None):
1373                 InfoExtractor.__init__(self, downloader)
1374
1375         def report_download_page(self, query, pagenum):
1376                 """Report attempt to download playlist page with given number."""
1377                 query = query.decode(preferredencoding())
1378                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1379
1380         def _real_extract(self, query):
1381                 mobj = re.match(self._VALID_URL, query)
1382                 if mobj is None:
1383                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1384                         return
1385
1386                 prefix, query = query.split(':')
1387                 prefix = prefix[8:]
1388                 query = query.encode('utf-8')
1389                 if prefix == '':
1390                         self._download_n_results(query, 1)
1391                         return
1392                 elif prefix == 'all':
1393                         self._download_n_results(query, self._max_google_results)
1394                         return
1395                 else:
1396                         try:
1397                                 n = long(prefix)
1398                                 if n <= 0:
1399                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1400                                         return
1401                                 elif n > self._max_google_results:
1402                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1403                                         n = self._max_google_results
1404                                 self._download_n_results(query, n)
1405                                 return
1406                         except ValueError: # parsing prefix as integer fails
1407                                 self._download_n_results(query, 1)
1408                                 return
1409
1410         def _download_n_results(self, query, n):
1411                 """Downloads a specified number of results for a query"""
1412
1413                 video_ids = []
1414                 pagenum = 0
1415
1416                 while True:
1417                         self.report_download_page(query, pagenum)
1418                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1419                         request = urllib2.Request(result_url)
1420                         try:
1421                                 page = urllib2.urlopen(request).read()
1422                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1423                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1424                                 return
1425
1426                         # Extract video identifiers
1427                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1428                                 video_id = mobj.group(1)
1429                                 if video_id not in video_ids:
1430                                         video_ids.append(video_id)
1431                                         if len(video_ids) == n:
1432                                                 # Specified n videos reached
1433                                                 for id in video_ids:
1434                                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1435                                                 return
1436
1437                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1438                                 for id in video_ids:
1439                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1440                                 return
1441
1442                         pagenum = pagenum + 1
1443
1444
1445 class YahooSearchIE(InfoExtractor):
1446         """Information Extractor for Yahoo! Video search queries."""
1447         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1448         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1449         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1450         _MORE_PAGES_INDICATOR = r'\s*Next'
1451         _max_yahoo_results = 1000
1452         IE_NAME = u'video.yahoo:search'
1453
1454         def __init__(self, downloader=None):
1455                 InfoExtractor.__init__(self, downloader)
1456
1457         def report_download_page(self, query, pagenum):
1458                 """Report attempt to download playlist page with given number."""
1459                 query = query.decode(preferredencoding())
1460                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1461
1462         def _real_extract(self, query):
1463                 mobj = re.match(self._VALID_URL, query)
1464                 if mobj is None:
1465                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1466                         return
1467
1468                 prefix, query = query.split(':')
1469                 prefix = prefix[8:]
1470                 query = query.encode('utf-8')
1471                 if prefix == '':
1472                         self._download_n_results(query, 1)
1473                         return
1474                 elif prefix == 'all':
1475                         self._download_n_results(query, self._max_yahoo_results)
1476                         return
1477                 else:
1478                         try:
1479                                 n = long(prefix)
1480                                 if n <= 0:
1481                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1482                                         return
1483                                 elif n > self._max_yahoo_results:
1484                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1485                                         n = self._max_yahoo_results
1486                                 self._download_n_results(query, n)
1487                                 return
1488                         except ValueError: # parsing prefix as integer fails
1489                                 self._download_n_results(query, 1)
1490                                 return
1491
1492         def _download_n_results(self, query, n):
1493                 """Downloads a specified number of results for a query"""
1494
1495                 video_ids = []
1496                 already_seen = set()
1497                 pagenum = 1
1498
1499                 while True:
1500                         self.report_download_page(query, pagenum)
1501                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1502                         request = urllib2.Request(result_url)
1503                         try:
1504                                 page = urllib2.urlopen(request).read()
1505                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1506                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1507                                 return
1508
1509                         # Extract video identifiers
1510                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1511                                 video_id = mobj.group(1)
1512                                 if video_id not in already_seen:
1513                                         video_ids.append(video_id)
1514                                         already_seen.add(video_id)
1515                                         if len(video_ids) == n:
1516                                                 # Specified n videos reached
1517                                                 for id in video_ids:
1518                                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1519                                                 return
1520
1521                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1522                                 for id in video_ids:
1523                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1524                                 return
1525
1526                         pagenum = pagenum + 1
1527
1528
class YoutubePlaylistIE(InfoExtractor):
	"""Information Extractor for YouTube playlists."""

	# group(1) is the link type ('p', 'a' or 'list'), group(2) the playlist
	# id, group(3) a single video id when the URL addresses one entry.
	_VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
	_TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
	_VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
	# Presence of this marker in a page means another page follows.
	_MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
	IE_NAME = u'youtube:playlist'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_page(self, playlist_id, pagenum):
		"""Report attempt to download playlist page with given number."""
		self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

	def _real_extract(self, url):
		"""Collect every video id of the playlist and queue it for download."""
		# Extract playlist id
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
			return

		# Single video case: re-dispatch the bare video id through the
		# downloader so the regular extractor chain handles it.
		if mobj.group(3) is not None:
			self._downloader.download([mobj.group(3)])
			return

		# Download playlist pages
		# prefix is 'p' as default for playlists but there are other types that need extra care
		playlist_prefix = mobj.group(1)
		if playlist_prefix == 'a':
			playlist_access = 'artist'
		else:
			# everything else is served through the plain playlist view
			playlist_prefix = 'p'
			playlist_access = 'view_play_list'
		playlist_id = mobj.group(2)
		video_ids = []
		pagenum = 1

		# Walk the paginated listing until the pager marker disappears.
		while True:
			self.report_download_page(playlist_id, pagenum)
			url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
			request = urllib2.Request(url)
			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
				return

			# Extract video identifiers (per-page dedup, first-seen order)
			ids_in_page = []
			for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(mobj.group(1))
			video_ids.extend(ids_in_page)

			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				break
			pagenum = pagenum + 1

		# Honour --playlist-start / --playlist-end (1-based; -1 == no end)
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)
		if playlistend == -1:
			video_ids = video_ids[playliststart:]
		else:
			video_ids = video_ids[playliststart:playlistend]

		for id in video_ids:
			self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
		return
1600
1601
1602 class YoutubeChannelIE(InfoExtractor):
1603         """Information Extractor for YouTube channels."""
1604
1605         _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1606         _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1607         _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
1608         IE_NAME = u'youtube:channel'
1609
1610         def report_download_page(self, channel_id, pagenum):
1611                 """Report attempt to download channel page with given number."""
1612                 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1613
1614         def _real_extract(self, url):
1615                 # Extract channel id
1616                 mobj = re.match(self._VALID_URL, url)
1617                 if mobj is None:
1618                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1619                         return
1620
1621                 # Download channel pages
1622                 channel_id = mobj.group(1)
1623                 video_ids = []
1624                 pagenum = 1
1625
1626                 while True:
1627                         self.report_download_page(channel_id, pagenum)
1628                         url = self._TEMPLATE_URL % (channel_id, pagenum)
1629                         request = urllib2.Request(url)
1630                         try:
1631                                 page = urllib2.urlopen(request).read()
1632                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1633                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1634                                 return
1635
1636                         # Extract video identifiers
1637                         ids_in_page = []
1638                         for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1639                                 if mobj.group(1) not in ids_in_page:
1640                                         ids_in_page.append(mobj.group(1))
1641                         video_ids.extend(ids_in_page)
1642
1643                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1644                                 break
1645                         pagenum = pagenum + 1
1646
1647                 for id in video_ids:
1648                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1649                 return
1650
1651
class YoutubeUserIE(InfoExtractor):
	"""Information Extractor for YouTube users."""

	_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
	_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
	# Result size per GData query; also used to detect the last page.
	_GDATA_PAGE_SIZE = 50
	_GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
	_VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
	IE_NAME = u'youtube:user'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_page(self, username, start_index):
		"""Report attempt to download user page."""
		self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
				(username, start_index, start_index + self._GDATA_PAGE_SIZE))

	def _real_extract(self, url):
		"""Collect all of the user's uploads and queue them for download."""
		# Extract username
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
			return

		username = mobj.group(1)

		# Download video ids using YouTube Data API. Result size per
		# query is limited (currently to 50 videos) so we need to query
		# page by page until there are no video ids - it means we got
		# all of them.

		video_ids = []
		pagenum = 0

		while True:
			# GData start-index is 1-based
			start_index = pagenum * self._GDATA_PAGE_SIZE + 1
			self.report_download_page(username, start_index)

			request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
				return

			# Extract video identifiers (per-page dedup, first-seen order)
			ids_in_page = []

			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(mobj.group(1))

			video_ids.extend(ids_in_page)

			# A little optimization - if current page is not
			# "full", ie. does not contain PAGE_SIZE video ids then
			# we can assume that this page is the last one - there
			# are no more ids on further pages - no need to query
			# again.

			if len(ids_in_page) < self._GDATA_PAGE_SIZE:
				break

			pagenum += 1

		all_ids_count = len(video_ids)
		# Honour --playlist-start / --playlist-end (1-based; -1 == no end)
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)

		if playlistend == -1:
			video_ids = video_ids[playliststart:]
		else:
			video_ids = video_ids[playliststart:playlistend]

		self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
				(username, all_ids_count, len(video_ids)))

		for video_id in video_ids:
			self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1733
1734
1735 class BlipTVUserIE(InfoExtractor):
1736         """Information Extractor for blip.tv users."""
1737
1738         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1739         _PAGE_SIZE = 12
1740         IE_NAME = u'blip.tv:user'
1741
1742         def __init__(self, downloader=None):
1743                 InfoExtractor.__init__(self, downloader)
1744
1745         def report_download_page(self, username, pagenum):
1746                 """Report attempt to download user page."""
1747                 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1748                                 (self.IE_NAME, username, pagenum))
1749
1750         def _real_extract(self, url):
1751                 # Extract username
1752                 mobj = re.match(self._VALID_URL, url)
1753                 if mobj is None:
1754                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1755                         return
1756
1757                 username = mobj.group(1)
1758
1759                 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1760
1761                 request = urllib2.Request(url)
1762
1763                 try:
1764                         page = urllib2.urlopen(request).read().decode('utf-8')
1765                         mobj = re.search(r'data-users-id="([^"]+)"', page)
1766                         page_base = page_base % mobj.group(1)
1767                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1768                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1769                         return
1770
1771
1772                 # Download video ids using BlipTV Ajax calls. Result size per
1773                 # query is limited (currently to 12 videos) so we need to query
1774                 # page by page until there are no video ids - it means we got
1775                 # all of them.
1776
1777                 video_ids = []
1778                 pagenum = 1
1779
1780                 while True:
1781                         self.report_download_page(username, pagenum)
1782
1783                         request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1784
1785                         try:
1786                                 page = urllib2.urlopen(request).read().decode('utf-8')
1787                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1788                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1789                                 return
1790
1791                         # Extract video identifiers
1792                         ids_in_page = []
1793
1794                         for mobj in re.finditer(r'href="/([^"]+)"', page):
1795                                 if mobj.group(1) not in ids_in_page:
1796                                         ids_in_page.append(unescapeHTML(mobj.group(1)))
1797
1798                         video_ids.extend(ids_in_page)
1799
1800                         # A little optimization - if current page is not
1801                         # "full", ie. does not contain PAGE_SIZE video ids then
1802                         # we can assume that this page is the last one - there
1803                         # are no more ids on further pages - no need to query
1804                         # again.
1805
1806                         if len(ids_in_page) < self._PAGE_SIZE:
1807                                 break
1808
1809                         pagenum += 1
1810
1811                 all_ids_count = len(video_ids)
1812                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1813                 playlistend = self._downloader.params.get('playlistend', -1)
1814
1815                 if playlistend == -1:
1816                         video_ids = video_ids[playliststart:]
1817                 else:
1818                         video_ids = video_ids[playliststart:playlistend]
1819
1820                 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1821                                 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1822
1823                 for video_id in video_ids:
1824                         self._downloader.download([u'http://blip.tv/'+video_id])
1825
1826
1827 class DepositFilesIE(InfoExtractor):
1828         """Information extractor for depositfiles.com"""
1829
1830         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1831         IE_NAME = u'DepositFiles'
1832
1833         def __init__(self, downloader=None):
1834                 InfoExtractor.__init__(self, downloader)
1835
1836         def report_download_webpage(self, file_id):
1837                 """Report webpage download."""
1838                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1839
1840         def report_extraction(self, file_id):
1841                 """Report information extraction."""
1842                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1843
1844         def _real_extract(self, url):
1845                 file_id = url.split('/')[-1]
1846                 # Rebuild url in english locale
1847                 url = 'http://depositfiles.com/en/files/' + file_id
1848
1849                 # Retrieve file webpage with 'Free download' button pressed
1850                 free_download_indication = { 'gateway_result' : '1' }
1851                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1852                 try:
1853                         self.report_download_webpage(file_id)
1854                         webpage = urllib2.urlopen(request).read()
1855                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1856                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1857                         return
1858
1859                 # Search for the real file URL
1860                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1861                 if (mobj is None) or (mobj.group(1) is None):
1862                         # Try to figure out reason of the error.
1863                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1864                         if (mobj is not None) and (mobj.group(1) is not None):
1865                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1866                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1867                         else:
1868                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1869                         return
1870
1871                 file_url = mobj.group(1)
1872                 file_extension = os.path.splitext(file_url)[1][1:]
1873
1874                 # Search for file title
1875                 mobj = re.search(r'<b title="(.*?)">', webpage)
1876                 if mobj is None:
1877                         self._downloader.trouble(u'ERROR: unable to extract title')
1878                         return
1879                 file_title = mobj.group(1).decode('utf-8')
1880
1881                 return [{
1882                         'id':           file_id.decode('utf-8'),
1883                         'url':          file_url.decode('utf-8'),
1884                         'uploader':     u'NA',
1885                         'upload_date':  u'NA',
1886                         'title':        file_title,
1887                         'ext':          file_extension.decode('utf-8'),
1888                 }]
1889
1890
1891 class FacebookIE(InfoExtractor):
1892         """Information Extractor for Facebook"""
1893
1894         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1895         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1896         _NETRC_MACHINE = 'facebook'
1897         _available_formats = ['video', 'highqual', 'lowqual']
1898         _video_extensions = {
1899                 'video': 'mp4',
1900                 'highqual': 'mp4',
1901                 'lowqual': 'mp4',
1902         }
1903         IE_NAME = u'facebook'
1904
1905         def __init__(self, downloader=None):
1906                 InfoExtractor.__init__(self, downloader)
1907
1908         def _reporter(self, message):
1909                 """Add header and report message."""
1910                 self._downloader.to_screen(u'[facebook] %s' % message)
1911
1912         def report_login(self):
1913                 """Report attempt to log in."""
1914                 self._reporter(u'Logging in')
1915
1916         def report_video_webpage_download(self, video_id):
1917                 """Report attempt to download video webpage."""
1918                 self._reporter(u'%s: Downloading video webpage' % video_id)
1919
1920         def report_information_extraction(self, video_id):
1921                 """Report attempt to extract video information."""
1922                 self._reporter(u'%s: Extracting video information' % video_id)
1923
1924         def _parse_page(self, video_webpage):
1925                 """Extract video information from page"""
1926                 # General data
1927                 data = {'title': r'\("video_title", "(.*?)"\)',
1928                         'description': r'<div class="datawrap">(.*?)</div>',
1929                         'owner': r'\("video_owner_name", "(.*?)"\)',
1930                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1931                         }
1932                 video_info = {}
1933                 for piece in data.keys():
1934                         mobj = re.search(data[piece], video_webpage)
1935                         if mobj is not None:
1936                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1937
1938                 # Video urls
1939                 video_urls = {}
1940                 for fmt in self._available_formats:
1941                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1942                         if mobj is not None:
1943                                 # URL is in a Javascript segment inside an escaped Unicode format within
1944                                 # the generally utf-8 page
1945                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1946                 video_info['video_urls'] = video_urls
1947
1948                 return video_info
1949
1950         def _real_initialize(self):
1951                 if self._downloader is None:
1952                         return
1953
1954                 useremail = None
1955                 password = None
1956                 downloader_params = self._downloader.params
1957
1958                 # Attempt to use provided username and password or .netrc data
1959                 if downloader_params.get('username', None) is not None:
1960                         useremail = downloader_params['username']
1961                         password = downloader_params['password']
1962                 elif downloader_params.get('usenetrc', False):
1963                         try:
1964                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1965                                 if info is not None:
1966                                         useremail = info[0]
1967                                         password = info[2]
1968                                 else:
1969                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1970                         except (IOError, netrc.NetrcParseError), err:
1971                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
1972                                 return
1973
1974                 if useremail is None:
1975                         return
1976
1977                 # Log in
1978                 login_form = {
1979                         'email': useremail,
1980                         'pass': password,
1981                         'login': 'Log+In'
1982                         }
1983                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1984                 try:
1985                         self.report_login()
1986                         login_results = urllib2.urlopen(request).read()
1987                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1988                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1989                                 return
1990                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1991                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
1992                         return
1993
1994         def _real_extract(self, url):
1995                 mobj = re.match(self._VALID_URL, url)
1996                 if mobj is None:
1997                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1998                         return
1999                 video_id = mobj.group('ID')
2000
2001                 # Get video webpage
2002                 self.report_video_webpage_download(video_id)
2003                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2004                 try:
2005                         page = urllib2.urlopen(request)
2006                         video_webpage = page.read()
2007                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2008                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2009                         return
2010
2011                 # Start extracting information
2012                 self.report_information_extraction(video_id)
2013
2014                 # Extract information
2015                 video_info = self._parse_page(video_webpage)
2016
2017                 # uploader
2018                 if 'owner' not in video_info:
2019                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2020                         return
2021                 video_uploader = video_info['owner']
2022
2023                 # title
2024                 if 'title' not in video_info:
2025                         self._downloader.trouble(u'ERROR: unable to extract video title')
2026                         return
2027                 video_title = video_info['title']
2028                 video_title = video_title.decode('utf-8')
2029
2030                 # thumbnail image
2031                 if 'thumbnail' not in video_info:
2032                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2033                         video_thumbnail = ''
2034                 else:
2035                         video_thumbnail = video_info['thumbnail']
2036
2037                 # upload date
2038                 upload_date = u'NA'
2039                 if 'upload_date' in video_info:
2040                         upload_time = video_info['upload_date']
2041                         timetuple = email.utils.parsedate_tz(upload_time)
2042                         if timetuple is not None:
2043                                 try:
2044                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2045                                 except:
2046                                         pass
2047
2048                 # description
2049                 video_description = video_info.get('description', 'No description available.')
2050
2051                 url_map = video_info['video_urls']
2052                 if len(url_map.keys()) > 0:
2053                         # Decide which formats to download
2054                         req_format = self._downloader.params.get('format', None)
2055                         format_limit = self._downloader.params.get('format_limit', None)
2056
2057                         if format_limit is not None and format_limit in self._available_formats:
2058                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2059                         else:
2060                                 format_list = self._available_formats
2061                         existing_formats = [x for x in format_list if x in url_map]
2062                         if len(existing_formats) == 0:
2063                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2064                                 return
2065                         if req_format is None:
2066                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2067                         elif req_format == 'worst':
2068                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2069                         elif req_format == '-1':
2070                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2071                         else:
2072                                 # Specific format
2073                                 if req_format not in url_map:
2074                                         self._downloader.trouble(u'ERROR: requested format not available')
2075                                         return
2076                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2077
2078                 results = []
2079                 for format_param, video_real_url in video_url_list:
2080                         # Extension
2081                         video_extension = self._video_extensions.get(format_param, 'mp4')
2082
2083                         results.append({
2084                                 'id':           video_id.decode('utf-8'),
2085                                 'url':          video_real_url.decode('utf-8'),
2086                                 'uploader':     video_uploader.decode('utf-8'),
2087                                 'upload_date':  upload_date,
2088                                 'title':        video_title,
2089                                 'ext':          video_extension.decode('utf-8'),
2090                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2091                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2092                                 'description':  video_description.decode('utf-8'),
2093                         })
2094                 return results
2095
class BlipTVIE(InfoExtractor):
	"""Information extractor for blip.tv"""

	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
	# Extracts the filename extension from a media URL
	_URL_EXT = r'^.*\.([a-z0-9]+)$'
	IE_NAME = u'blip.tv'

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

	def report_direct_download(self, title):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Ask blip.tv for a JSON description of the video by appending the
		# skin=json parameters, preserving any existing query string.
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url.encode('utf-8'))
		self.report_extraction(mobj.group(1))
		info = None
		try:
			urlh = urllib2.urlopen(request)
			if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
				# The server answered with the media itself instead of JSON;
				# derive id/title/ext from the URL's basename.
				basename = url.split('/')[-1]
				title,ext = os.path.splitext(basename)
				title = title.decode('UTF-8')
				ext = ext.replace('.', '')
				self.report_direct_download(title)
				info = {
					'id': title,
					'url': url,
					'uploader': u'NA',
					'upload_date': u'NA',
					'title': title,
					'ext': ext,
					# [internal] hand the already-open response to the
					# downloader so the body is not requested twice
					'urlhandle': urlh
				}
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
			return
		if info is None: # Regular URL
			try:
				# urlh was opened successfully above; otherwise we returned
				json_code = urlh.read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
				return

			try:
				# NOTE(review): json is not imported at the top of this file;
				# presumably it arrives via 'from utils import *' — confirm.
				json_data = json.loads(json_code)
				if 'Post' in json_data:
					data = json_data['Post']
				else:
					data = json_data

				upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
				video_url = data['media']['url']
				umobj = re.match(self._URL_EXT, video_url)
				if umobj is None:
					raise ValueError('Can not determine filename extension')
				ext = umobj.group(1)

				info = {
					'id': data['item_id'],
					'url': video_url,
					'uploader': data['display_name'],
					'upload_date': upload_date,
					'title': data['title'],
					'ext': ext,
					'format': data['media']['mimeType'],
					'thumbnail': data['thumbnailUrl'],
					'description': data['description'],
					'player_url': data['embedUrl']
				}
			except (ValueError,KeyError), err:
				self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
				return

		# NOTE(review): mutates the global std_headers for all later requests;
		# presumably blip.tv serves the media only to this user agent — confirm.
		std_headers['User-Agent'] = 'iTunes/10.6.1'
		return [info]
2184
2185
2186 class MyVideoIE(InfoExtractor):
2187         """Information Extractor for myvideo.de."""
2188
2189         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2190         IE_NAME = u'myvideo'
2191
2192         def __init__(self, downloader=None):
2193                 InfoExtractor.__init__(self, downloader)
2194         
2195         def report_download_webpage(self, video_id):
2196                 """Report webpage download."""
2197                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2198
2199         def report_extraction(self, video_id):
2200                 """Report information extraction."""
2201                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2202
2203         def _real_extract(self,url):
2204                 mobj = re.match(self._VALID_URL, url)
2205                 if mobj is None:
2206                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2207                         return
2208
2209                 video_id = mobj.group(1)
2210
2211                 # Get video webpage
2212                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2213                 try:
2214                         self.report_download_webpage(video_id)
2215                         webpage = urllib2.urlopen(request).read()
2216                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2217                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
2218                         return
2219
2220                 self.report_extraction(video_id)
2221                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2222                                  webpage)
2223                 if mobj is None:
2224                         self._downloader.trouble(u'ERROR: unable to extract media URL')
2225                         return
2226                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2227
2228                 mobj = re.search('<title>([^<]+)</title>', webpage)
2229                 if mobj is None:
2230                         self._downloader.trouble(u'ERROR: unable to extract title')
2231                         return
2232
2233                 video_title = mobj.group(1)
2234
2235                 return [{
2236                         'id':           video_id,
2237                         'url':          video_url,
2238                         'uploader':     u'NA',
2239                         'upload_date':  u'NA',
2240                         'title':        video_title,
2241                         'ext':          u'flv',
2242                 }]
2243
2244 class ComedyCentralIE(InfoExtractor):
2245         """Information extractor for The Daily Show and Colbert Report """
2246
2247         _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2248         IE_NAME = u'comedycentral'
2249
2250         _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2251
2252         _video_extensions = {
2253                 '3500': 'mp4',
2254                 '2200': 'mp4',
2255                 '1700': 'mp4',
2256                 '1200': 'mp4',
2257                 '750': 'mp4',
2258                 '400': 'mp4',
2259         }
2260         _video_dimensions = {
2261                 '3500': '1280x720',
2262                 '2200': '960x540',
2263                 '1700': '768x432',
2264                 '1200': '640x360',
2265                 '750': '512x288',
2266                 '400': '384x216',
2267         }
2268
2269         def report_extraction(self, episode_id):
2270                 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2271
2272         def report_config_download(self, episode_id):
2273                 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2274
2275         def report_index_download(self, episode_id):
2276                 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2277
2278         def report_player_url(self, episode_id):
2279                 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2280
2281
2282         def _print_formats(self, formats):
2283                 print('Available formats:')
2284                 for x in formats:
2285                         print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2286
2287
2288         def _real_extract(self, url):
2289                 mobj = re.match(self._VALID_URL, url)
2290                 if mobj is None:
2291                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2292                         return
2293
2294                 if mobj.group('shortname'):
2295                         if mobj.group('shortname') in ('tds', 'thedailyshow'):
2296                                 url = u'http://www.thedailyshow.com/full-episodes/'
2297                         else:
2298                                 url = u'http://www.colbertnation.com/full-episodes/'
2299                         mobj = re.match(self._VALID_URL, url)
2300                         assert mobj is not None
2301
2302                 dlNewest = not mobj.group('episode')
2303                 if dlNewest:
2304                         epTitle = mobj.group('showname')
2305                 else:
2306                         epTitle = mobj.group('episode')
2307
2308                 req = urllib2.Request(url)
2309                 self.report_extraction(epTitle)
2310                 try:
2311                         htmlHandle = urllib2.urlopen(req)
2312                         html = htmlHandle.read()
2313                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2314                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2315                         return
2316                 if dlNewest:
2317                         url = htmlHandle.geturl()
2318                         mobj = re.match(self._VALID_URL, url)
2319                         if mobj is None:
2320                                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2321                                 return
2322                         if mobj.group('episode') == '':
2323                                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2324                                 return
2325                         epTitle = mobj.group('episode')
2326
2327                 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2328
2329                 if len(mMovieParams) == 0:
2330                         # The Colbert Report embeds the information in a without
2331                         # a URL prefix; so extract the alternate reference
2332                         # and then add the URL prefix manually.
2333
2334                         altMovieParams = re.findall('data-mgid="([^"]*episode.*?:.*?)"', html)
2335                         if len(altMovieParams) == 0:
2336                                 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2337                                 return
2338                         else:
2339                                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2340                 
2341                 playerUrl_raw = mMovieParams[0][0]
2342                 self.report_player_url(epTitle)
2343                 try:
2344                         urlHandle = urllib2.urlopen(playerUrl_raw)
2345                         playerUrl = urlHandle.geturl()
2346                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2347                         self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
2348                         return
2349
2350                 uri = mMovieParams[0][1]
2351                 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2352                 self.report_index_download(epTitle)
2353                 try:
2354                         indexXml = urllib2.urlopen(indexUrl).read()
2355                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2356                         self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2357                         return
2358
2359                 results = []
2360
2361                 idoc = xml.etree.ElementTree.fromstring(indexXml)
2362                 itemEls = idoc.findall('.//item')
2363                 for itemEl in itemEls:
2364                         mediaId = itemEl.findall('./guid')[0].text
2365                         shortMediaId = mediaId.split(':')[-1]
2366                         showId = mediaId.split(':')[-2].replace('.com', '')
2367                         officialTitle = itemEl.findall('./title')[0].text
2368                         officialDate = itemEl.findall('./pubDate')[0].text
2369
2370                         configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2371                                                 urllib.urlencode({'uri': mediaId}))
2372                         configReq = urllib2.Request(configUrl)
2373                         self.report_config_download(epTitle)
2374                         try:
2375                                 configXml = urllib2.urlopen(configReq).read()
2376                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2377                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2378                                 return
2379
2380                         cdoc = xml.etree.ElementTree.fromstring(configXml)
2381                         turls = []
2382                         for rendition in cdoc.findall('.//rendition'):
2383                                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2384                                 turls.append(finfo)
2385
2386                         if len(turls) == 0:
2387                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2388                                 continue
2389                         
2390                         if self._downloader.params.get('listformats', None):
2391                                 self._print_formats([i[0] for i in turls])
2392                                 return
2393
2394                         # For now, just pick the highest bitrate
2395                         format,video_url = turls[-1]
2396
2397                         # Get the format arg from the arg stream
2398                         req_format = self._downloader.params.get('format', None)
2399
2400                         # Select format if we can find one
2401                         for f,v in turls:
2402                                 if f == req_format:
2403                                         format, video_url = f, v
2404                                         break
2405
2406                         # Patch to download from alternative CDN, which does not
2407                         # break on current RTMPDump builds
2408                         broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
2409                         better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
2410
2411                         if video_url.startswith(broken_cdn):
2412                                 video_url = video_url.replace(broken_cdn, better_cdn)
2413
2414                         effTitle = showId + u'-' + epTitle
2415                         info = {
2416                                 'id': shortMediaId,
2417                                 'url': video_url,
2418                                 'uploader': showId,
2419                                 'upload_date': officialDate,
2420                                 'title': effTitle,
2421                                 'ext': 'mp4',
2422                                 'format': format,
2423                                 'thumbnail': None,
2424                                 'description': officialTitle,
2425                                 'player_url': None #playerUrl
2426                         }
2427
2428                         results.append(info)
2429                         
2430                 return results
2431
2432
2433 class EscapistIE(InfoExtractor):
2434         """Information extractor for The Escapist """
2435
2436         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2437         IE_NAME = u'escapist'
2438
2439         def report_extraction(self, showName):
2440                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2441
2442         def report_config_download(self, showName):
2443                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2444
2445         def _real_extract(self, url):
2446                 mobj = re.match(self._VALID_URL, url)
2447                 if mobj is None:
2448                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2449                         return
2450                 showName = mobj.group('showname')
2451                 videoId = mobj.group('episode')
2452
2453                 self.report_extraction(showName)
2454                 try:
2455                         webPage = urllib2.urlopen(url)
2456                         webPageBytes = webPage.read()
2457                         m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2458                         webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2459                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2460                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2461                         return
2462
2463                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2464                 description = unescapeHTML(descMatch.group(1))
2465                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2466                 imgUrl = unescapeHTML(imgMatch.group(1))
2467                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2468                 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2469                 configUrlMatch = re.search('config=(.*)$', playerUrl)
2470                 configUrl = urllib2.unquote(configUrlMatch.group(1))
2471
2472                 self.report_config_download(showName)
2473                 try:
2474                         configJSON = urllib2.urlopen(configUrl).read()
2475                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2476                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2477                         return
2478
2479                 # Technically, it's JavaScript, not JSON
2480                 configJSON = configJSON.replace("'", '"')
2481
2482                 try:
2483                         config = json.loads(configJSON)
2484                 except (ValueError,), err:
2485                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2486                         return
2487
2488                 playlist = config['playlist']
2489                 videoUrl = playlist[1]['url']
2490
2491                 info = {
2492                         'id': videoId,
2493                         'url': videoUrl,
2494                         'uploader': showName,
2495                         'upload_date': u'NA',
2496                         'title': showName,
2497                         'ext': 'flv',
2498                         'thumbnail': imgUrl,
2499                         'description': description,
2500                         'player_url': playerUrl,
2501                 }
2502
2503                 return [info]
2504
2505
2506 class CollegeHumorIE(InfoExtractor):
2507         """Information extractor for collegehumor.com"""
2508
2509         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2510         IE_NAME = u'collegehumor'
2511
2512         def report_webpage(self, video_id):
2513                 """Report information extraction."""
2514                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2515
2516         def report_extraction(self, video_id):
2517                 """Report information extraction."""
2518                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2519
2520         def _real_extract(self, url):
2521                 mobj = re.match(self._VALID_URL, url)
2522                 if mobj is None:
2523                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2524                         return
2525                 video_id = mobj.group('videoid')
2526
2527                 self.report_webpage(video_id)
2528                 request = urllib2.Request(url)
2529                 try:
2530                         webpage = urllib2.urlopen(request).read()
2531                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2532                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2533                         return
2534
2535                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2536                 if m is None:
2537                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2538                         return
2539                 internal_video_id = m.group('internalvideoid')
2540
2541                 info = {
2542                         'id': video_id,
2543                         'internal_id': internal_video_id,
2544                         'uploader': u'NA',
2545                         'upload_date': u'NA',
2546                 }
2547
2548                 self.report_extraction(video_id)
2549                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2550                 try:
2551                         metaXml = urllib2.urlopen(xmlUrl).read()
2552                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2553                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2554                         return
2555
2556                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2557                 try:
2558                         videoNode = mdoc.findall('./video')[0]
2559                         info['description'] = videoNode.findall('./description')[0].text
2560                         info['title'] = videoNode.findall('./caption')[0].text
2561                         info['url'] = videoNode.findall('./file')[0].text
2562                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2563                         info['ext'] = info['url'].rpartition('.')[2]
2564                 except IndexError:
2565                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2566                         return
2567
2568                 return [info]
2569
2570
2571 class XVideosIE(InfoExtractor):
2572         """Information extractor for xvideos.com"""
2573
2574         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2575         IE_NAME = u'xvideos'
2576
2577         def report_webpage(self, video_id):
2578                 """Report information extraction."""
2579                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2580
2581         def report_extraction(self, video_id):
2582                 """Report information extraction."""
2583                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2584
2585         def _real_extract(self, url):
2586                 mobj = re.match(self._VALID_URL, url)
2587                 if mobj is None:
2588                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2589                         return
2590                 video_id = mobj.group(1).decode('utf-8')
2591
2592                 self.report_webpage(video_id)
2593
2594                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2595                 try:
2596                         webpage = urllib2.urlopen(request).read()
2597                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2598                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2599                         return
2600
2601                 self.report_extraction(video_id)
2602
2603
2604                 # Extract video URL
2605                 mobj = re.search(r'flv_url=(.+?)&', webpage)
2606                 if mobj is None:
2607                         self._downloader.trouble(u'ERROR: unable to extract video url')
2608                         return
2609                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2610
2611
2612                 # Extract title
2613                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2614                 if mobj is None:
2615                         self._downloader.trouble(u'ERROR: unable to extract video title')
2616                         return
2617                 video_title = mobj.group(1).decode('utf-8')
2618
2619
2620                 # Extract video thumbnail
2621                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2622                 if mobj is None:
2623                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2624                         return
2625                 video_thumbnail = mobj.group(0).decode('utf-8')
2626
2627                 info = {
2628                         'id': video_id,
2629                         'url': video_url,
2630                         'uploader': u'NA',
2631                         'upload_date': u'NA',
2632                         'title': video_title,
2633                         'ext': 'flv',
2634                         'thumbnail': video_thumbnail,
2635                         'description': None,
2636                 }
2637
2638                 return [info]
2639
2640
2641 class SoundcloudIE(InfoExtractor):
2642         """Information extractor for soundcloud.com
2643            To access the media, the uid of the song and a stream token
2644            must be extracted from the page source and the script must make
2645            a request to media.soundcloud.com/crossdomain.xml. Then
2646            the media can be grabbed by requesting from an url composed
2647            of the stream token and uid
2648          """
2649
2650         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2651         IE_NAME = u'soundcloud'
2652
2653         def __init__(self, downloader=None):
2654                 InfoExtractor.__init__(self, downloader)
2655
2656         def report_webpage(self, video_id):
2657                 """Report information extraction."""
2658                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2659
2660         def report_extraction(self, video_id):
2661                 """Report information extraction."""
2662                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2663
2664         def _real_extract(self, url):
2665                 mobj = re.match(self._VALID_URL, url)
2666                 if mobj is None:
2667                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2668                         return
2669
2670                 # extract uploader (which is in the url)
2671                 uploader = mobj.group(1).decode('utf-8')
2672                 # extract simple title (uploader + slug of song title)
2673                 slug_title =  mobj.group(2).decode('utf-8')
2674                 simple_title = uploader + u'-' + slug_title
2675
2676                 self.report_webpage('%s/%s' % (uploader, slug_title))
2677
2678                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2679                 try:
2680                         webpage = urllib2.urlopen(request).read()
2681                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2682                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2683                         return
2684
2685                 self.report_extraction('%s/%s' % (uploader, slug_title))
2686
2687                 # extract uid and stream token that soundcloud hands out for access
2688                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2689                 if mobj:
2690                         video_id = mobj.group(1)
2691                         stream_token = mobj.group(2)
2692
2693                 # extract unsimplified title
2694                 mobj = re.search('"title":"(.*?)",', webpage)
2695                 if mobj:
2696                         title = mobj.group(1).decode('utf-8')
2697                 else:
2698                         title = simple_title
2699
2700                 # construct media url (with uid/token)
2701                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2702                 mediaURL = mediaURL % (video_id, stream_token)
2703
2704                 # description
2705                 description = u'No description available'
2706                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2707                 if mobj:
2708                         description = mobj.group(1)
2709
2710                 # upload date
2711                 upload_date = u'NA'
2712                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2713                 if mobj:
2714                         try:
2715                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2716                         except Exception, e:
2717                                 self._downloader.to_stderr(compat_str(e))
2718
2719                 # for soundcloud, a request to a cross domain is required for cookies
2720                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2721
2722                 return [{
2723                         'id':           video_id.decode('utf-8'),
2724                         'url':          mediaURL,
2725                         'uploader':     uploader.decode('utf-8'),
2726                         'upload_date':  upload_date,
2727                         'title':        title,
2728                         'ext':          u'mp3',
2729                         'description': description.decode('utf-8')
2730                 }]
2731
2732
2733 class InfoQIE(InfoExtractor):
2734         """Information extractor for infoq.com"""
2735
2736         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2737         IE_NAME = u'infoq'
2738
2739         def report_webpage(self, video_id):
2740                 """Report information extraction."""
2741                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2742
2743         def report_extraction(self, video_id):
2744                 """Report information extraction."""
2745                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2746
2747         def _real_extract(self, url):
2748                 mobj = re.match(self._VALID_URL, url)
2749                 if mobj is None:
2750                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2751                         return
2752
2753                 self.report_webpage(url)
2754
2755                 request = urllib2.Request(url)
2756                 try:
2757                         webpage = urllib2.urlopen(request).read()
2758                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2759                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2760                         return
2761
2762                 self.report_extraction(url)
2763
2764
2765                 # Extract video URL
2766                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2767                 if mobj is None:
2768                         self._downloader.trouble(u'ERROR: unable to extract video url')
2769                         return
2770                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2771
2772
2773                 # Extract title
2774                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2775                 if mobj is None:
2776                         self._downloader.trouble(u'ERROR: unable to extract video title')
2777                         return
2778                 video_title = mobj.group(1).decode('utf-8')
2779
2780                 # Extract description
2781                 video_description = u'No description available.'
2782                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2783                 if mobj is not None:
2784                         video_description = mobj.group(1).decode('utf-8')
2785
2786                 video_filename = video_url.split('/')[-1]
2787                 video_id, extension = video_filename.split('.')
2788
2789                 info = {
2790                         'id': video_id,
2791                         'url': video_url,
2792                         'uploader': u'NA',
2793                         'upload_date': u'NA',
2794                         'title': video_title,
2795                         'ext': extension, # Extension is always(?) mp4, but seems to be flv
2796                         'thumbnail': None,
2797                         'description': video_description,
2798                 }
2799
2800                 return [info]
2801
2802 class MixcloudIE(InfoExtractor):
2803         """Information extractor for www.mixcloud.com"""
2804         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2805         IE_NAME = u'mixcloud'
2806
2807         def __init__(self, downloader=None):
2808                 InfoExtractor.__init__(self, downloader)
2809
2810         def report_download_json(self, file_id):
2811                 """Report JSON download."""
2812                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2813
2814         def report_extraction(self, file_id):
2815                 """Report information extraction."""
2816                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2817
2818         def get_urls(self, jsonData, fmt, bitrate='best'):
2819                 """Get urls from 'audio_formats' section in json"""
2820                 file_url = None
2821                 try:
2822                         bitrate_list = jsonData[fmt]
2823                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2824                                 bitrate = max(bitrate_list) # select highest
2825
2826                         url_list = jsonData[fmt][bitrate]
2827                 except TypeError: # we have no bitrate info.
2828                         url_list = jsonData[fmt]
2829                 return url_list
2830
2831         def check_urls(self, url_list):
2832                 """Returns 1st active url from list"""
2833                 for url in url_list:
2834                         try:
2835                                 urllib2.urlopen(url)
2836                                 return url
2837                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2838                                 url = None
2839
2840                 return None
2841
2842         def _print_formats(self, formats):
2843                 print('Available formats:')
2844                 for fmt in formats.keys():
2845                         for b in formats[fmt]:
2846                                 try:
2847                                         ext = formats[fmt][b][0]
2848                                         print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2849                                 except TypeError: # we have no bitrate info
2850                                         ext = formats[fmt][0]
2851                                         print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2852                                         break
2853
2854         def _real_extract(self, url):
2855                 mobj = re.match(self._VALID_URL, url)
2856                 if mobj is None:
2857                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2858                         return
2859                 # extract uploader & filename from url
2860                 uploader = mobj.group(1).decode('utf-8')
2861                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2862
2863                 # construct API request
2864                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2865                 # retrieve .json file with links to files
2866                 request = urllib2.Request(file_url)
2867                 try:
2868                         self.report_download_json(file_url)
2869                         jsonData = urllib2.urlopen(request).read()
2870                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2871                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2872                         return
2873
2874                 # parse JSON
2875                 json_data = json.loads(jsonData)
2876                 player_url = json_data['player_swf_url']
2877                 formats = dict(json_data['audio_formats'])
2878
2879                 req_format = self._downloader.params.get('format', None)
2880                 bitrate = None
2881
2882                 if self._downloader.params.get('listformats', None):
2883                         self._print_formats(formats)
2884                         return
2885
2886                 if req_format is None or req_format == 'best':
2887                         for format_param in formats.keys():
2888                                 url_list = self.get_urls(formats, format_param)
2889                                 # check urls
2890                                 file_url = self.check_urls(url_list)
2891                                 if file_url is not None:
2892                                         break # got it!
2893                 else:
2894                         if req_format not in formats.keys():
2895                                 self._downloader.trouble(u'ERROR: format is not available')
2896                                 return
2897
2898                         url_list = self.get_urls(formats, req_format)
2899                         file_url = self.check_urls(url_list)
2900                         format_param = req_format
2901
2902                 return [{
2903                         'id': file_id.decode('utf-8'),
2904                         'url': file_url.decode('utf-8'),
2905                         'uploader':     uploader.decode('utf-8'),
2906                         'upload_date': u'NA',
2907                         'title': json_data['name'],
2908                         'ext': file_url.split('.')[-1].decode('utf-8'),
2909                         'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2910                         'thumbnail': json_data['thumbnail_url'],
2911                         'description': json_data['description'],
2912                         'player_url': player_url.decode('utf-8'),
2913                 }]
2914
2915 class StanfordOpenClassroomIE(InfoExtractor):
2916         """Information extractor for Stanford's Open ClassRoom"""
2917
2918         _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2919         IE_NAME = u'stanfordoc'
2920
2921         def report_download_webpage(self, objid):
2922                 """Report information extraction."""
2923                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2924
2925         def report_extraction(self, video_id):
2926                 """Report information extraction."""
2927                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2928
	def _real_extract(self, url):
		"""Extract video information from a Stanford OpenClassroom URL.

		Three URL shapes are handled:
		  * course + video: return a single-entry list with the video's
		    info, read from the per-video metadata XML file.
		  * course only: scrape the course page for VideoPage links and
		    recursively extract each one.
		  * neither: scrape the OpenClassroom root page for CoursePage
		    links and recursively extract each course.

		Returns a list of info dictionaries, or None after reporting an
		error through self._downloader.trouble().
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('course') and mobj.group('video'): # A specific video
			course = mobj.group('course')
			video = mobj.group('video')
			info = {
				'id': course + '_' + video,
				'uploader': u'NA',
				'upload_date': u'NA',
			}

			self.report_extraction(info['id'])
			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
			xmlUrl = baseUrl + video + '.xml'
			try:
				metaXml = urllib2.urlopen(xmlUrl).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
				return
			mdoc = xml.etree.ElementTree.fromstring(metaXml)
			try:
				# Both <title> and <videoFile> are mandatory; a missing
				# element surfaces as IndexError from findall()[0].
				info['title'] = mdoc.findall('./title')[0].text
				info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
			except IndexError:
				self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
				return
			# Extension is whatever follows the last '.' of the media URL.
			info['ext'] = info['url'].rpartition('.')[2]
			return [info]
		elif mobj.group('course'): # A course page
			course = mobj.group('course')
			info = {
				'id': course,
				'type': 'playlist',
				'uploader': u'NA',
				'upload_date': u'NA',
			}

			self.report_download_webpage(info['id'])
			try:
				coursepage = urllib2.urlopen(url).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			# Page <h1> is the course title; fall back to the course id.
			m = re.search('<h1>([^<]+)</h1>', coursepage)
			if m:
				info['title'] = unescapeHTML(m.group(1))
			else:
				info['title'] = info['id']

			m = re.search('<description>([^<]+)</description>', coursepage)
			if m:
				info['description'] = unescapeHTML(m.group(1))

			# Collect unique VideoPage links in page order.
			links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
				}
					for vpage in links]
			# Recurse into each video page and flatten the results.
			results = []
			for entry in info['list']:
				assert entry['type'] == 'reference'
				results += self.extract(entry['url'])
			return results

		else: # Root page
			info = {
				'id': 'Stanford OpenClassroom',
				'type': 'playlist',
				'uploader': u'NA',
				'upload_date': u'NA',
			}

			self.report_download_webpage(info['id'])
			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
			try:
				rootpage = urllib2.urlopen(rootURL).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			info['title'] = info['id']

			# Collect unique CoursePage links in page order.
			links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
				}
					for cpage in links]

			# Recurse into each course page and flatten the results.
			results = []
			for entry in info['list']:
				assert entry['type'] == 'reference'
				results += self.extract(entry['url'])
			return results
3031
3032 class MTVIE(InfoExtractor):
3033         """Information extractor for MTV.com"""
3034
3035         _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3036         IE_NAME = u'mtv'
3037
3038         def report_webpage(self, video_id):
3039                 """Report information extraction."""
3040                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3041
3042         def report_extraction(self, video_id):
3043                 """Report information extraction."""
3044                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3045
3046         def _real_extract(self, url):
3047                 mobj = re.match(self._VALID_URL, url)
3048                 if mobj is None:
3049                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3050                         return
3051                 if not mobj.group('proto'):
3052                         url = 'http://' + url
3053                 video_id = mobj.group('videoid')
3054                 self.report_webpage(video_id)
3055
3056                 request = urllib2.Request(url)
3057                 try:
3058                         webpage = urllib2.urlopen(request).read()
3059                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3060                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
3061                         return
3062
3063                 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3064                 if mobj is None:
3065                         self._downloader.trouble(u'ERROR: unable to extract song name')
3066                         return
3067                 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3068                 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3069                 if mobj is None:
3070                         self._downloader.trouble(u'ERROR: unable to extract performer')
3071                         return
3072                 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3073                 video_title = performer + ' - ' + song_name 
3074
3075                 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3076                 if mobj is None:
3077                         self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3078                         return
3079                 mtvn_uri = mobj.group(1)
3080
3081                 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3082                 if mobj is None:
3083                         self._downloader.trouble(u'ERROR: unable to extract content id')
3084                         return
3085                 content_id = mobj.group(1)
3086
3087                 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3088                 self.report_extraction(video_id)
3089                 request = urllib2.Request(videogen_url)
3090                 try:
3091                         metadataXml = urllib2.urlopen(request).read()
3092                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3093                         self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3094                         return
3095
3096                 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3097                 renditions = mdoc.findall('.//rendition')
3098
3099                 # For now, always pick the highest quality.
3100                 rendition = renditions[-1]
3101
3102                 try:
3103                         _,_,ext = rendition.attrib['type'].partition('/')
3104                         format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3105                         video_url = rendition.find('./src').text
3106                 except KeyError:
3107                         self._downloader.trouble('Invalid rendition field.')
3108                         return
3109
3110                 info = {
3111                         'id': video_id,
3112                         'url': video_url,
3113                         'uploader': performer,
3114                         'upload_date': u'NA',
3115                         'title': video_title,
3116                         'ext': ext,
3117                         'format': format,
3118                 }
3119
3120                 return [info]
3121
3122
class YoukuIE(InfoExtractor):
	"""Information extractor for v.youku.com."""

	_VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
	IE_NAME = u'Youku'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, file_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

	def _gen_sid(self):
		"""Generate a session id: millisecond timestamp + two random ints."""
		nowTime = int(time.time() * 1000)
		random1 = random.randint(1000,1998)
		random2 = random.randint(1000,9999)

		return "%d%d%d" %(nowTime,random1,random2)

	def _get_file_ID_mix_string(self, seed):
		"""Return the seed-dependent permutation of Youku's id alphabet.

		Implements Youku's LCG-style shuffle: each step advances the seed
		and moves one character from the source alphabet into the result.
		"""
		mixed = []
		source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
		seed = float(seed)
		for i in range(len(source)):
			seed  =  (seed * 211 + 30031 ) % 65536
			index  =  math.floor(seed / 65536 * len(source) )
			mixed.append(source[int(index)])
			source.remove(source[int(index)])
		#return ''.join(mixed)
		return mixed

	def _get_file_id(self, fileId, seed):
		"""Decode a '*'-separated list of indices into the real file id."""
		mixed = self._get_file_ID_mix_string(seed)
		ids = fileId.split('*')
		realId = []
		for ch in ids:
			if ch:
				realId.append(mixed[int(ch)])
		return ''.join(realId)

	def _real_extract(self, url):
		"""Extract all video segments for a Youku video.

		Youku splits videos into segments; one info dict is returned per
		segment. Returns None after reporting an error on failure.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group('ID')

		info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

		request = urllib2.Request(info_url, None, std_headers)
		try:
			self.report_download_webpage(video_id)
			jsondata = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
			return

		self.report_extraction(video_id)
		try:
			config = json.loads(jsondata)

			video_title =  config['data'][0]['title']
			seed = config['data'][0]['seed']

			format = self._downloader.params.get('format', None)
			supported_format = config['data'][0]['streamfileids'].keys()

			if format is None or format == 'best':
				if 'hd2' in supported_format:
					format = 'hd2'
				else:
					format = 'flv'
				ext = u'flv'
			elif format == 'worst':
				format = 'mp4'
				ext = u'mp4'
			else:
				format = 'flv'
				ext = u'flv'


			fileid = config['data'][0]['streamfileids'][format]
			seg_number = len(config['data'][0]['segs'][format])

			keys=[]
			for i in xrange(seg_number):
				keys.append(config['data'][0]['segs'][format][i]['k'])

			#TODO check error
			#youku only could be viewed from mainland china
		# Narrowed from a bare 'except:' so KeyboardInterrupt/SystemExit
		# are not swallowed; these are the errors the JSON parsing and the
		# dict/list lookups above can raise (e.g. geo-blocked responses
		# that carry no usable 'data' entry).
		except (ValueError, KeyError, IndexError, TypeError):
			self._downloader.trouble(u'ERROR: unable to extract info section')
			return

		files_info=[]
		sid = self._gen_sid()
		fileid = self._get_file_id(fileid, seed)

		#column 8,9 of fileid represent the segment number
		#fileid[7:9] should be changed
		for index, key in enumerate(keys):

			temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
			download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

			info = {
				'id': '%s_part%02d' % (video_id, index),
				'url': download_url,
				'uploader': u'NA',
				'upload_date': u'NA',
				'title': video_title,
				'ext': ext,
			}
			files_info.append(info)

		return files_info
3243
3244
3245 class XNXXIE(InfoExtractor):
3246         """Information extractor for xnxx.com"""
3247
3248         _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
3249         IE_NAME = u'xnxx'
3250         VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3251         VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3252         VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3253
3254         def report_webpage(self, video_id):
3255                 """Report information extraction"""
3256                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3257
3258         def report_extraction(self, video_id):
3259                 """Report information extraction"""
3260                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3261
3262         def _real_extract(self, url):
3263                 mobj = re.match(self._VALID_URL, url)
3264                 if mobj is None:
3265                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3266                         return
3267                 video_id = mobj.group(1).decode('utf-8')
3268
3269                 self.report_webpage(video_id)
3270
3271                 # Get webpage content
3272                 try:
3273                         webpage = urllib2.urlopen(url).read()
3274                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3275                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3276                         return
3277
3278                 result = re.search(self.VIDEO_URL_RE, webpage)
3279                 if result is None:
3280                         self._downloader.trouble(u'ERROR: unable to extract video url')
3281                         return
3282                 video_url = urllib.unquote(result.group(1).decode('utf-8'))
3283
3284                 result = re.search(self.VIDEO_TITLE_RE, webpage)
3285                 if result is None:
3286                         self._downloader.trouble(u'ERROR: unable to extract video title')
3287                         return
3288                 video_title = result.group(1).decode('utf-8')
3289
3290                 result = re.search(self.VIDEO_THUMB_RE, webpage)
3291                 if result is None:
3292                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3293                         return
3294                 video_thumbnail = result.group(1).decode('utf-8')
3295
3296                 return [{
3297                         'id': video_id,
3298                         'url': video_url,
3299                         'uploader': u'NA',
3300                         'upload_date': u'NA',
3301                         'title': video_title,
3302                         'ext': 'flv',
3303                         'thumbnail': video_thumbnail,
3304                         'description': None,
3305                 }]
3306
3307
3308 class GooglePlusIE(InfoExtractor):
3309         """Information extractor for plus.google.com."""
3310
3311         _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
3312         IE_NAME = u'plus.google'
3313
3314         def __init__(self, downloader=None):
3315                 InfoExtractor.__init__(self, downloader)
3316
3317         def report_extract_entry(self, url):
3318                 """Report downloading extry"""
3319                 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))
3320
3321         def report_date(self, upload_date):
3322                 """Report downloading extry"""
3323                 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3324
3325         def report_uploader(self, uploader):
3326                 """Report downloading extry"""
3327                 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))
3328
3329         def report_title(self, video_title):
3330                 """Report downloading extry"""
3331                 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))
3332
3333         def report_extract_vid_page(self, video_page):
3334                 """Report information extraction."""
3335                 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
3336
3337         def _real_extract(self, url):
3338                 # Extract id from URL
3339                 mobj = re.match(self._VALID_URL, url)
3340                 if mobj is None:
3341                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3342                         return
3343
3344                 post_url = mobj.group(0)
3345                 video_id = mobj.group(2)
3346
3347                 video_extension = 'flv'
3348
3349                 # Step 1, Retrieve post webpage to extract further information
3350                 self.report_extract_entry(post_url)
3351                 request = urllib2.Request(post_url)
3352                 try:
3353                         webpage = urllib2.urlopen(request).read()
3354                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3355                         self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3356                         return
3357
3358                 # Extract update date
3359                 upload_date = u'NA'
3360                 pattern = 'title="Timestamp">(.*?)</a>'
3361                 mobj = re.search(pattern, webpage)
3362                 if mobj:
3363                         upload_date = mobj.group(1)
3364                         # Convert timestring to a format suitable for filename
3365                         upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3366                         upload_date = upload_date.strftime('%Y%m%d')
3367                 self.report_date(upload_date)
3368
3369                 # Extract uploader
3370                 uploader = u'NA'
3371                 pattern = r'rel\="author".*?>(.*?)</a>'
3372                 mobj = re.search(pattern, webpage)
3373                 if mobj:
3374                         uploader = mobj.group(1)
3375                 self.report_uploader(uploader)
3376
3377                 # Extract title
3378                 # Get the first line for title
3379                 video_title = u'NA'
3380                 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3381                 mobj = re.search(pattern, webpage)
3382                 if mobj:
3383                         video_title = mobj.group(1)
3384                 self.report_title(video_title)
3385
3386                 # Step 2, Stimulate clicking the image box to launch video
3387                 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3388                 mobj = re.search(pattern, webpage)
3389                 if mobj is None:
3390                         self._downloader.trouble(u'ERROR: unable to extract video page URL')
3391
3392                 video_page = mobj.group(1)
3393                 request = urllib2.Request(video_page)
3394                 try:
3395                         webpage = urllib2.urlopen(request).read()
3396                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3397                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3398                         return
3399                 self.report_extract_vid_page(video_page)
3400
3401
3402                 # Extract video links on video page
3403                 """Extract video links of all sizes"""
3404                 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3405                 mobj = re.findall(pattern, webpage)
3406                 if len(mobj) == 0:
3407                         self._downloader.trouble(u'ERROR: unable to extract video links')
3408
3409                 # Sort in resolution
3410                 links = sorted(mobj)
3411
3412                 # Choose the lowest of the sort, i.e. highest resolution
3413                 video_url = links[-1]
3414                 # Only get the url. The resolution part in the tuple has no use anymore
3415                 video_url = video_url[-1]
3416                 # Treat escaped \u0026 style hex
3417                 video_url = unicode(video_url, "unicode_escape")
3418
3419
3420                 return [{
3421                         'id':           video_id.decode('utf-8'),
3422                         'url':          video_url,
3423                         'uploader':     uploader.decode('utf-8'),
3424                         'upload_date':  upload_date.decode('utf-8'),
3425                         'title':        video_title.decode('utf-8'),
3426                         'ext':          video_extension.decode('utf-8'),
3427                 }]