info_dict['upload_date'] is documented in --output, so IEs MUST specify it
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import datetime
5 import HTMLParser
6 import httplib
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import urllib
13 import urllib2
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 from urlparse import parse_qs
19
20 try:
21         import cStringIO as StringIO
22 except ImportError:
23         import StringIO
24
25 from utils import *
26
27
28 class InfoExtractor(object):
29         """Information Extractor class.
30
31         Information extractors are the classes that, given a URL, extract
32         information about the video (or videos) the URL refers to. This
33         information includes the real video URL, the video title, author and
34         others. The information is stored in a dictionary which is then 
35         passed to the FileDownloader. The FileDownloader processes this
36         information possibly downloading the video to the file system, among
37         other possible outcomes.
38
39         The dictionaries must include the following fields:
40
41         id:             Video identifier.
42         url:            Final video URL.
43         uploader:       Nickname of the video uploader.
44         upload_date:    Video upload date (YYYYMMDD).
45         title:          Video title, unescaped.
46         ext:            Video filename extension.
47
48         The following fields are optional:
49
50         format:         The video format, defaults to ext (used for --get-format)
51         thumbnail:      Full URL to a video thumbnail image.
52         description:    One-line video description.
53         player_url:     SWF Player URL (used for rtmpdump).
54
55         Subclasses of this one should re-define the _real_initialize() and
56         _real_extract() methods and define a _VALID_URL regexp.
57         Probably, they should also be added to the list of extractors.
58
59         _real_extract() must return a *list* of information dictionaries as
60         described above.
61         """
62
63         _ready = False
64         _downloader = None
65
66         def __init__(self, downloader=None):
67                 """Constructor. Receives an optional downloader."""
68                 self._ready = False
69                 self.set_downloader(downloader)
70
71         def suitable(self, url):
72                 """Receives a URL and returns True if suitable for this IE."""
73                 return re.match(self._VALID_URL, url) is not None
74
75         def initialize(self):
76                 """Initializes an instance (authentication, etc)."""
77                 if not self._ready:
78                         self._real_initialize()
79                         self._ready = True
80
81         def extract(self, url):
82                 """Extracts URL information and returns it in list of dicts."""
83                 self.initialize()
84                 return self._real_extract(url)
85
86         def set_downloader(self, downloader):
87                 """Sets the downloader for this IE."""
88                 self._downloader = downloader
89
90         def _real_initialize(self):
91                 """Real initialization process. Redefine in subclasses."""
92                 pass
93
94         def _real_extract(self, url):
95                 """Real extraction process. Redefine in subclasses."""
96                 pass
97
98
99 class YoutubeIE(InfoExtractor):
100         """Information extractor for youtube.com."""
101
102         _VALID_URL = r"""^
103                          (
104                              (?:https?://)?                                       # http(s):// (optional)
105                              (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
106                                 tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
107                              (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
108                              (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
109                              (?:                                                  # the various things that can precede the ID:
110                                  (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
111                                  |(?:                                             # or the v= param in all its forms
112                                      (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
113                                      (?:\?|\#!?)                                  # the params delimiter ? or # or #!
114                                      (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
115                                      v=
116                                  )
117                              )?                                                   # optional -> youtube.com/xxxx is OK
118                          )?                                                       # all until now is optional -> you can pass the naked ID
119                          ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
120                          (?(1).+)?                                                # if we found the ID, everything can follow
121                          $"""
122         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
123         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
124         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
125         _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
126         _NETRC_MACHINE = 'youtube'
127         # Listed in order of quality
128         _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
129         _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
130         _video_extensions = {
131                 '13': '3gp',
132                 '17': 'mp4',
133                 '18': 'mp4',
134                 '22': 'mp4',
135                 '37': 'mp4',
136                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
137                 '43': 'webm',
138                 '44': 'webm',
139                 '45': 'webm',
140                 '46': 'webm',
141         }
142         _video_dimensions = {
143                 '5': '240x400',
144                 '6': '???',
145                 '13': '???',
146                 '17': '144x176',
147                 '18': '360x640',
148                 '22': '720x1280',
149                 '34': '360x640',
150                 '35': '480x854',
151                 '37': '1080x1920',
152                 '38': '3072x4096',
153                 '43': '360x640',
154                 '44': '480x854',
155                 '45': '720x1280',
156                 '46': '1080x1920',
157         }       
158         IE_NAME = u'youtube'
159
160         def suitable(self, url):
161                 """Receives a URL and returns True if suitable for this IE."""
162                 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
163
164         def report_lang(self):
165                 """Report attempt to set language."""
166                 self._downloader.to_screen(u'[youtube] Setting language')
167
168         def report_login(self):
169                 """Report attempt to log in."""
170                 self._downloader.to_screen(u'[youtube] Logging in')
171
172         def report_age_confirmation(self):
173                 """Report attempt to confirm age."""
174                 self._downloader.to_screen(u'[youtube] Confirming age')
175
176         def report_video_webpage_download(self, video_id):
177                 """Report attempt to download video webpage."""
178                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
179
180         def report_video_info_webpage_download(self, video_id):
181                 """Report attempt to download video info webpage."""
182                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
183
184         def report_video_subtitles_download(self, video_id):
185                 """Report attempt to download video info webpage."""
186                 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
187
188         def report_information_extraction(self, video_id):
189                 """Report attempt to extract video information."""
190                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
191
192         def report_unavailable_format(self, video_id, format):
193                 """Report extracted video URL."""
194                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
195
196         def report_rtmp_download(self):
197                 """Indicate the download will use the RTMP protocol."""
198                 self._downloader.to_screen(u'[youtube] RTMP download detected')
199
200         def _closed_captions_xml_to_srt(self, xml_string):
201                 srt = ''
202                 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
203                 # TODO parse xml instead of regex
204                 for n, (start, dur_tag, dur, caption) in enumerate(texts):
205                         if not dur: dur = '4'
206                         start = float(start)
207                         end = start + float(dur)
208                         start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
209                         end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
210                         caption = unescapeHTML(caption)
211                         caption = unescapeHTML(caption) # double cycle, intentional
212                         srt += str(n+1) + '\n'
213                         srt += start + ' --> ' + end + '\n'
214                         srt += caption + '\n\n'
215                 return srt
216
217         def _print_formats(self, formats):
218                 print('Available formats:')
219                 for x in formats:
220                         print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
221
222         def _real_initialize(self):
223                 if self._downloader is None:
224                         return
225
226                 username = None
227                 password = None
228                 downloader_params = self._downloader.params
229
230                 # Attempt to use provided username and password or .netrc data
231                 if downloader_params.get('username', None) is not None:
232                         username = downloader_params['username']
233                         password = downloader_params['password']
234                 elif downloader_params.get('usenetrc', False):
235                         try:
236                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
237                                 if info is not None:
238                                         username = info[0]
239                                         password = info[2]
240                                 else:
241                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
242                         except (IOError, netrc.NetrcParseError), err:
243                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
244                                 return
245
246                 # Set language
247                 request = urllib2.Request(self._LANG_URL)
248                 try:
249                         self.report_lang()
250                         urllib2.urlopen(request).read()
251                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
252                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
253                         return
254
255                 # No authentication to be performed
256                 if username is None:
257                         return
258
259                 # Log in
260                 login_form = {
261                                 'current_form': 'loginForm',
262                                 'next':         '/',
263                                 'action_login': 'Log In',
264                                 'username':     username,
265                                 'password':     password,
266                                 }
267                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
268                 try:
269                         self.report_login()
270                         login_results = urllib2.urlopen(request).read()
271                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
272                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
273                                 return
274                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
275                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
276                         return
277
278                 # Confirm age
279                 age_form = {
280                                 'next_url':             '/',
281                                 'action_confirm':       'Confirm',
282                                 }
283                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
284                 try:
285                         self.report_age_confirmation()
286                         age_results = urllib2.urlopen(request).read()
287                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
288                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
289                         return
290
291         def _real_extract(self, url):
292                 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
293                 mobj = re.search(self._NEXT_URL_RE, url)
294                 if mobj:
295                         url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')
296
297                 # Extract video id from URL
298                 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
299                 if mobj is None:
300                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
301                         return
302                 video_id = mobj.group(2)
303
304                 # Get video webpage
305                 self.report_video_webpage_download(video_id)
306                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
307                 try:
308                         video_webpage = urllib2.urlopen(request).read()
309                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
310                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
311                         return
312
313                 # Attempt to extract SWF player URL
314                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
315                 if mobj is not None:
316                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
317                 else:
318                         player_url = None
319
320                 # Get video info
321                 self.report_video_info_webpage_download(video_id)
322                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
323                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
324                                         % (video_id, el_type))
325                         request = urllib2.Request(video_info_url)
326                         try:
327                                 video_info_webpage = urllib2.urlopen(request).read()
328                                 video_info = parse_qs(video_info_webpage)
329                                 if 'token' in video_info:
330                                         break
331                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
332                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
333                                 return
334                 if 'token' not in video_info:
335                         if 'reason' in video_info:
336                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
337                         else:
338                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
339                         return
340
341                 # Check for "rental" videos
342                 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
343                         self._downloader.trouble(u'ERROR: "rental" videos not supported')
344                         return
345
346                 # Start extracting information
347                 self.report_information_extraction(video_id)
348
349                 # uploader
350                 if 'author' not in video_info:
351                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
352                         return
353                 video_uploader = urllib.unquote_plus(video_info['author'][0])
354
355                 # title
356                 if 'title' not in video_info:
357                         self._downloader.trouble(u'ERROR: unable to extract video title')
358                         return
359                 video_title = urllib.unquote_plus(video_info['title'][0])
360                 video_title = video_title.decode('utf-8')
361
362                 # thumbnail image
363                 if 'thumbnail_url' not in video_info:
364                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
365                         video_thumbnail = ''
366                 else:   # don't panic if we can't find it
367                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
368
369                 # upload date
370                 upload_date = u'NA'
371                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
372                 if mobj is not None:
373                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
374                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
375                         for expression in format_expressions:
376                                 try:
377                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
378                                 except:
379                                         pass
380
381                 # description
382                 video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
383                 if video_description: video_description = clean_html(video_description)
384                 else: video_description = ''
385                         
386                 # closed captions
387                 video_subtitles = None
388                 if self._downloader.params.get('writesubtitles', False):
389                         try:
390                                 self.report_video_subtitles_download(video_id)
391                                 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
392                                 try:
393                                         srt_list = urllib2.urlopen(request).read()
394                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
395                                         raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
396                                 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
397                                 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
398                                 if not srt_lang_list:
399                                         raise Trouble(u'WARNING: video has no closed captions')
400                                 if self._downloader.params.get('subtitleslang', False):
401                                         srt_lang = self._downloader.params.get('subtitleslang')
402                                 elif 'en' in srt_lang_list:
403                                         srt_lang = 'en'
404                                 else:
405                                         srt_lang = srt_lang_list.keys()[0]
406                                 if not srt_lang in srt_lang_list:
407                                         raise Trouble(u'WARNING: no closed captions found in the specified language')
408                                 request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
409                                 try:
410                                         srt_xml = urllib2.urlopen(request).read()
411                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
412                                         raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
413                                 if not srt_xml:
414                                         raise Trouble(u'WARNING: unable to download video subtitles')
415                                 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
416                         except Trouble as trouble:
417                                 self._downloader.trouble(trouble[0])
418
419                 if 'length_seconds' not in video_info:
420                         self._downloader.trouble(u'WARNING: unable to extract video duration')
421                         video_duration = ''
422                 else:
423                         video_duration = urllib.unquote_plus(video_info['length_seconds'][0])
424
425                 # token
426                 video_token = urllib.unquote_plus(video_info['token'][0])
427
428                 # Decide which formats to download
429                 req_format = self._downloader.params.get('format', None)
430
431                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
432                         self.report_rtmp_download()
433                         video_url_list = [(None, video_info['conn'][0])]
434                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
435                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
436                         url_data = [parse_qs(uds) for uds in url_data_strs]
437                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
438                         url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
439
440                         format_limit = self._downloader.params.get('format_limit', None)
441                         available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
442                         if format_limit is not None and format_limit in available_formats:
443                                 format_list = available_formats[available_formats.index(format_limit):]
444                         else:
445                                 format_list = available_formats
446                         existing_formats = [x for x in format_list if x in url_map]
447                         if len(existing_formats) == 0:
448                                 self._downloader.trouble(u'ERROR: no known formats available for video')
449                                 return
450                         if self._downloader.params.get('listformats', None):
451                                 self._print_formats(existing_formats)
452                                 return
453                         if req_format is None or req_format == 'best':
454                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
455                         elif req_format == 'worst':
456                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
457                         elif req_format in ('-1', 'all'):
458                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
459                         else:
460                                 # Specific formats. We pick the first in a slash-delimeted sequence.
461                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
462                                 req_formats = req_format.split('/')
463                                 video_url_list = None
464                                 for rf in req_formats:
465                                         if rf in url_map:
466                                                 video_url_list = [(rf, url_map[rf])]
467                                                 break
468                                 if video_url_list is None:
469                                         self._downloader.trouble(u'ERROR: requested format not available')
470                                         return
471                 else:
472                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
473                         return
474
475                 results = []
476                 for format_param, video_real_url in video_url_list:
477                         # Extension
478                         video_extension = self._video_extensions.get(format_param, 'flv')
479
480                         video_format = '{} - {}'.format(format_param.decode('utf-8') if format_param else video_extension.decode('utf-8'),
481                                                             self._video_dimensions.get(format_param, '???'))
482
483                         results.append({
484                                 'id':           video_id.decode('utf-8'),
485                                 'url':          video_real_url.decode('utf-8'),
486                                 'uploader':     video_uploader.decode('utf-8'),
487                                 'upload_date':  upload_date,
488                                 'title':        video_title,
489                                 'ext':          video_extension.decode('utf-8'),
490                                 'format':       video_format,
491                                 'thumbnail':    video_thumbnail.decode('utf-8'),
492                                 'description':  video_description,
493                                 'player_url':   player_url,
494                                 'subtitles':    video_subtitles,
495                                 'duration':             video_duration
496                         })
497                 return results
498
499
500 class MetacafeIE(InfoExtractor):
501         """Information Extractor for metacafe.com."""
502
503         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
504         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
505         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
506         IE_NAME = u'metacafe'
507
508         def __init__(self, downloader=None):
509                 InfoExtractor.__init__(self, downloader)
510
511         def report_disclaimer(self):
512                 """Report disclaimer retrieval."""
513                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
514
515         def report_age_confirmation(self):
516                 """Report attempt to confirm age."""
517                 self._downloader.to_screen(u'[metacafe] Confirming age')
518
519         def report_download_webpage(self, video_id):
520                 """Report webpage download."""
521                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
522
523         def report_extraction(self, video_id):
524                 """Report information extraction."""
525                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
526
527         def _real_initialize(self):
528                 # Retrieve disclaimer
529                 request = urllib2.Request(self._DISCLAIMER)
530                 try:
531                         self.report_disclaimer()
532                         disclaimer = urllib2.urlopen(request).read()
533                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
534                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
535                         return
536
537                 # Confirm age
538                 disclaimer_form = {
539                         'filters': '0',
540                         'submit': "Continue - I'm over 18",
541                         }
542                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
543                 try:
544                         self.report_age_confirmation()
545                         disclaimer = urllib2.urlopen(request).read()
546                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
547                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
548                         return
549
550         def _real_extract(self, url):
551                 # Extract id and simplified title from URL
552                 mobj = re.match(self._VALID_URL, url)
553                 if mobj is None:
554                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
555                         return
556
557                 video_id = mobj.group(1)
558
559                 # Check if video comes from YouTube
560                 mobj2 = re.match(r'^yt-(.*)$', video_id)
561                 if mobj2 is not None:
562                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
563                         return
564
565                 # Retrieve video webpage to extract further information
566                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
567                 try:
568                         self.report_download_webpage(video_id)
569                         webpage = urllib2.urlopen(request).read()
570                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
571                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
572                         return
573
574                 # Extract URL, uploader and title from webpage
575                 self.report_extraction(video_id)
576                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
577                 if mobj is not None:
578                         mediaURL = urllib.unquote(mobj.group(1))
579                         video_extension = mediaURL[-3:]
580
581                         # Extract gdaKey if available
582                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
583                         if mobj is None:
584                                 video_url = mediaURL
585                         else:
586                                 gdaKey = mobj.group(1)
587                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
588                 else:
589                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
590                         if mobj is None:
591                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
592                                 return
593                         vardict = parse_qs(mobj.group(1))
594                         if 'mediaData' not in vardict:
595                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
596                                 return
597                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
598                         if mobj is None:
599                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
600                                 return
601                         mediaURL = mobj.group(1).replace('\\/', '/')
602                         video_extension = mediaURL[-3:]
603                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
604
605                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
606                 if mobj is None:
607                         self._downloader.trouble(u'ERROR: unable to extract title')
608                         return
609                 video_title = mobj.group(1).decode('utf-8')
610
611                 mobj = re.search(r'submitter=(.*?);', webpage)
612                 if mobj is None:
613                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
614                         return
615                 video_uploader = mobj.group(1)
616
617                 return [{
618                         'id':           video_id.decode('utf-8'),
619                         'url':          video_url.decode('utf-8'),
620                         'uploader':     video_uploader.decode('utf-8'),
621                         'upload_date':  u'NA',
622                         'title':        video_title,
623                         'ext':          video_extension.decode('utf-8'),
624                 }]
625
626
627 class DailymotionIE(InfoExtractor):
628         """Information Extractor for Dailymotion"""
629
630         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
631         IE_NAME = u'dailymotion'
632
633         def __init__(self, downloader=None):
634                 InfoExtractor.__init__(self, downloader)
635
636         def report_download_webpage(self, video_id):
637                 """Report webpage download."""
638                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
639
640         def report_extraction(self, video_id):
641                 """Report information extraction."""
642                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
643
644         def _real_extract(self, url):
645                 # Extract id and simplified title from URL
646                 mobj = re.match(self._VALID_URL, url)
647                 if mobj is None:
648                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
649                         return
650
651                 video_id = mobj.group(1).split('_')[0].split('?')[0]
652
653                 video_extension = 'mp4'
654
655                 # Retrieve video webpage to extract further information
656                 request = urllib2.Request(url)
657                 request.add_header('Cookie', 'family_filter=off')
658                 try:
659                         self.report_download_webpage(video_id)
660                         webpage = urllib2.urlopen(request).read()
661                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
662                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
663                         return
664
665                 # Extract URL, uploader and title from webpage
666                 self.report_extraction(video_id)
667                 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
668                 if mobj is None:
669                         self._downloader.trouble(u'ERROR: unable to extract media URL')
670                         return
671                 flashvars = urllib.unquote(mobj.group(1))
672
673                 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
674                         if key in flashvars:
675                                 max_quality = key
676                                 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
677                                 break
678                 else:
679                         self._downloader.trouble(u'ERROR: unable to extract video URL')
680                         return
681
682                 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
683                 if mobj is None:
684                         self._downloader.trouble(u'ERROR: unable to extract video URL')
685                         return
686
687                 video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')
688
689                 # TODO: support choosing qualities
690
691                 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
692                 if mobj is None:
693                         self._downloader.trouble(u'ERROR: unable to extract title')
694                         return
695                 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
696
697                 video_uploader = u'NA'
698                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
699                 if mobj is None:
700                         # lookin for official user
701                         mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
702                         if mobj_official is None:
703                                 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
704                         else:
705                                 video_uploader = mobj_official.group(1)
706                 else:
707                         video_uploader = mobj.group(1)
708
709                 video_upload_date = u'NA'
710                 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
711                 if mobj is not None:
712                         video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
713
714                 return [{
715                         'id':           video_id.decode('utf-8'),
716                         'url':          video_url.decode('utf-8'),
717                         'uploader':     video_uploader.decode('utf-8'),
718                         'upload_date':  video_upload_date,
719                         'title':        video_title,
720                         'ext':          video_extension.decode('utf-8'),
721                 }]
722
723
724 class GoogleIE(InfoExtractor):
725         """Information extractor for video.google.com."""
726
727         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
728         IE_NAME = u'video.google'
729
730         def __init__(self, downloader=None):
731                 InfoExtractor.__init__(self, downloader)
732
733         def report_download_webpage(self, video_id):
734                 """Report webpage download."""
735                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
736
737         def report_extraction(self, video_id):
738                 """Report information extraction."""
739                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
740
741         def _real_extract(self, url):
742                 # Extract id from URL
743                 mobj = re.match(self._VALID_URL, url)
744                 if mobj is None:
745                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
746                         return
747
748                 video_id = mobj.group(1)
749
750                 video_extension = 'mp4'
751
752                 # Retrieve video webpage to extract further information
753                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
754                 try:
755                         self.report_download_webpage(video_id)
756                         webpage = urllib2.urlopen(request).read()
757                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
758                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
759                         return
760
761                 # Extract URL, uploader, and title from webpage
762                 self.report_extraction(video_id)
763                 mobj = re.search(r"download_url:'([^']+)'", webpage)
764                 if mobj is None:
765                         video_extension = 'flv'
766                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
767                 if mobj is None:
768                         self._downloader.trouble(u'ERROR: unable to extract media URL')
769                         return
770                 mediaURL = urllib.unquote(mobj.group(1))
771                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
772                 mediaURL = mediaURL.replace('\\x26', '\x26')
773
774                 video_url = mediaURL
775
776                 mobj = re.search(r'<title>(.*)</title>', webpage)
777                 if mobj is None:
778                         self._downloader.trouble(u'ERROR: unable to extract title')
779                         return
780                 video_title = mobj.group(1).decode('utf-8')
781
782                 # Extract video description
783                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
784                 if mobj is None:
785                         self._downloader.trouble(u'ERROR: unable to extract video description')
786                         return
787                 video_description = mobj.group(1).decode('utf-8')
788                 if not video_description:
789                         video_description = 'No description available.'
790
791                 # Extract video thumbnail
792                 if self._downloader.params.get('forcethumbnail', False):
793                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
794                         try:
795                                 webpage = urllib2.urlopen(request).read()
796                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
797                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
798                                 return
799                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
800                         if mobj is None:
801                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
802                                 return
803                         video_thumbnail = mobj.group(1)
804                 else:   # we need something to pass to process_info
805                         video_thumbnail = ''
806
807                 return [{
808                         'id':           video_id.decode('utf-8'),
809                         'url':          video_url.decode('utf-8'),
810                         'uploader':     u'NA',
811                         'upload_date':  u'NA',
812                         'title':        video_title,
813                         'ext':          video_extension.decode('utf-8'),
814                 }]
815
816
817 class PhotobucketIE(InfoExtractor):
818         """Information extractor for photobucket.com."""
819
820         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
821         IE_NAME = u'photobucket'
822
823         def __init__(self, downloader=None):
824                 InfoExtractor.__init__(self, downloader)
825
826         def report_download_webpage(self, video_id):
827                 """Report webpage download."""
828                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
829
830         def report_extraction(self, video_id):
831                 """Report information extraction."""
832                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
833
834         def _real_extract(self, url):
835                 # Extract id from URL
836                 mobj = re.match(self._VALID_URL, url)
837                 if mobj is None:
838                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
839                         return
840
841                 video_id = mobj.group(1)
842
843                 video_extension = 'flv'
844
845                 # Retrieve video webpage to extract further information
846                 request = urllib2.Request(url)
847                 try:
848                         self.report_download_webpage(video_id)
849                         webpage = urllib2.urlopen(request).read()
850                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
851                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
852                         return
853
854                 # Extract URL, uploader, and title from webpage
855                 self.report_extraction(video_id)
856                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
857                 if mobj is None:
858                         self._downloader.trouble(u'ERROR: unable to extract media URL')
859                         return
860                 mediaURL = urllib.unquote(mobj.group(1))
861
862                 video_url = mediaURL
863
864                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
865                 if mobj is None:
866                         self._downloader.trouble(u'ERROR: unable to extract title')
867                         return
868                 video_title = mobj.group(1).decode('utf-8')
869
870                 video_uploader = mobj.group(2).decode('utf-8')
871
872                 return [{
873                         'id':           video_id.decode('utf-8'),
874                         'url':          video_url.decode('utf-8'),
875                         'uploader':     video_uploader,
876                         'upload_date':  u'NA',
877                         'title':        video_title,
878                         'ext':          video_extension.decode('utf-8'),
879                 }]
880
881
882 class YahooIE(InfoExtractor):
883         """Information extractor for video.yahoo.com."""
884
885         # _VALID_URL matches all Yahoo! Video URLs
886         # _VPAGE_URL matches only the extractable '/watch/' URLs
887         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
888         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
889         IE_NAME = u'video.yahoo'
890
891         def __init__(self, downloader=None):
892                 InfoExtractor.__init__(self, downloader)
893
894         def report_download_webpage(self, video_id):
895                 """Report webpage download."""
896                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
897
898         def report_extraction(self, video_id):
899                 """Report information extraction."""
900                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
901
902         def _real_extract(self, url, new_video=True):
903                 # Extract ID from URL
904                 mobj = re.match(self._VALID_URL, url)
905                 if mobj is None:
906                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
907                         return
908
909                 video_id = mobj.group(2)
910                 video_extension = 'flv'
911
912                 # Rewrite valid but non-extractable URLs as
913                 # extractable English language /watch/ URLs
914                 if re.match(self._VPAGE_URL, url) is None:
915                         request = urllib2.Request(url)
916                         try:
917                                 webpage = urllib2.urlopen(request).read()
918                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
919                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
920                                 return
921
922                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
923                         if mobj is None:
924                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
925                                 return
926                         yahoo_id = mobj.group(1)
927
928                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
929                         if mobj is None:
930                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
931                                 return
932                         yahoo_vid = mobj.group(1)
933
934                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
935                         return self._real_extract(url, new_video=False)
936
937                 # Retrieve video webpage to extract further information
938                 request = urllib2.Request(url)
939                 try:
940                         self.report_download_webpage(video_id)
941                         webpage = urllib2.urlopen(request).read()
942                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
943                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
944                         return
945
946                 # Extract uploader and title from webpage
947                 self.report_extraction(video_id)
948                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
949                 if mobj is None:
950                         self._downloader.trouble(u'ERROR: unable to extract video title')
951                         return
952                 video_title = mobj.group(1).decode('utf-8')
953
954                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
955                 if mobj is None:
956                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
957                         return
958                 video_uploader = mobj.group(1).decode('utf-8')
959
960                 # Extract video thumbnail
961                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
962                 if mobj is None:
963                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
964                         return
965                 video_thumbnail = mobj.group(1).decode('utf-8')
966
967                 # Extract video description
968                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
969                 if mobj is None:
970                         self._downloader.trouble(u'ERROR: unable to extract video description')
971                         return
972                 video_description = mobj.group(1).decode('utf-8')
973                 if not video_description:
974                         video_description = 'No description available.'
975
976                 # Extract video height and width
977                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
978                 if mobj is None:
979                         self._downloader.trouble(u'ERROR: unable to extract video height')
980                         return
981                 yv_video_height = mobj.group(1)
982
983                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
984                 if mobj is None:
985                         self._downloader.trouble(u'ERROR: unable to extract video width')
986                         return
987                 yv_video_width = mobj.group(1)
988
989                 # Retrieve video playlist to extract media URL
990                 # I'm not completely sure what all these options are, but we
991                 # seem to need most of them, otherwise the server sends a 401.
992                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
993                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
994                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
995                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
996                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
997                 try:
998                         self.report_download_webpage(video_id)
999                         webpage = urllib2.urlopen(request).read()
1000                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1001                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1002                         return
1003
1004                 # Extract media URL from playlist XML
1005                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1006                 if mobj is None:
1007                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1008                         return
1009                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1010                 video_url = unescapeHTML(video_url)
1011
1012                 return [{
1013                         'id':           video_id.decode('utf-8'),
1014                         'url':          video_url,
1015                         'uploader':     video_uploader,
1016                         'upload_date':  u'NA',
1017                         'title':        video_title,
1018                         'ext':          video_extension.decode('utf-8'),
1019                         'thumbnail':    video_thumbnail.decode('utf-8'),
1020                         'description':  video_description,
1021                         'thumbnail':    video_thumbnail,
1022                 }]
1023
1024
1025 class VimeoIE(InfoExtractor):
1026         """Information extractor for vimeo.com."""
1027
1028         # _VALID_URL matches Vimeo URLs
1029         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
1030         IE_NAME = u'vimeo'
1031
1032         def __init__(self, downloader=None):
1033                 InfoExtractor.__init__(self, downloader)
1034
1035         def report_download_webpage(self, video_id):
1036                 """Report webpage download."""
1037                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1038
1039         def report_extraction(self, video_id):
1040                 """Report information extraction."""
1041                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1042
1043         def _real_extract(self, url, new_video=True):
1044                 # Extract ID from URL
1045                 mobj = re.match(self._VALID_URL, url)
1046                 if mobj is None:
1047                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1048                         return
1049
1050                 video_id = mobj.group(1)
1051
1052                 # Retrieve video webpage to extract further information
1053                 request = urllib2.Request(url, None, std_headers)
1054                 try:
1055                         self.report_download_webpage(video_id)
1056                         webpage = urllib2.urlopen(request).read()
1057                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1058                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1059                         return
1060
1061                 # Now we begin extracting as much information as we can from what we
1062                 # retrieved. First we extract the information common to all extractors,
1063                 # and latter we extract those that are Vimeo specific.
1064                 self.report_extraction(video_id)
1065
1066                 # Extract the config JSON
1067                 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1068                 try:
1069                         config = json.loads(config)
1070                 except:
1071                         self._downloader.trouble(u'ERROR: unable to extract info section')
1072                         return
1073                 
1074                 # Extract title
1075                 video_title = config["video"]["title"]
1076
1077                 # Extract uploader
1078                 video_uploader = config["video"]["owner"]["name"]
1079
1080                 # Extract video thumbnail
1081                 video_thumbnail = config["video"]["thumbnail"]
1082
1083                 # Extract video description
1084                 video_description = get_element_by_id("description", webpage.decode('utf8'))
1085                 if video_description: video_description = clean_html(video_description)
1086                 else: video_description = ''
1087
1088                 # Extract upload date
1089                 video_upload_date = u'NA'
1090                 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1091                 if mobj is not None:
1092                         video_upload_date = mobj.group(1)
1093
1094                 # Vimeo specific: extract request signature and timestamp
1095                 sig = config['request']['signature']
1096                 timestamp = config['request']['timestamp']
1097
1098                 # Vimeo specific: extract video codec and quality information
1099                 # First consider quality, then codecs, then take everything
1100                 # TODO bind to format param
1101                 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1102                 files = { 'hd': [], 'sd': [], 'other': []}
1103                 for codec_name, codec_extension in codecs:
1104                         if codec_name in config["video"]["files"]:
1105                                 if 'hd' in config["video"]["files"][codec_name]:
1106                                         files['hd'].append((codec_name, codec_extension, 'hd'))
1107                                 elif 'sd' in config["video"]["files"][codec_name]:
1108                                         files['sd'].append((codec_name, codec_extension, 'sd'))
1109                                 else:
1110                                         files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1111
1112                 for quality in ('hd', 'sd', 'other'):
1113                         if len(files[quality]) > 0:
1114                                 video_quality = files[quality][0][2]
1115                                 video_codec = files[quality][0][0]
1116                                 video_extension = files[quality][0][1]
1117                                 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1118                                 break
1119                 else:
1120                         self._downloader.trouble(u'ERROR: no known codec found')
1121                         return
1122
1123                 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1124                                         %(video_id, sig, timestamp, video_quality, video_codec.upper())
1125
1126                 return [{
1127                         'id':           video_id,
1128                         'url':          video_url,
1129                         'uploader':     video_uploader,
1130                         'upload_date':  video_upload_date,
1131                         'title':        video_title,
1132                         'ext':          video_extension,
1133                         'thumbnail':    video_thumbnail,
1134                         'description':  video_description,
1135                 }]
1136
1137
1138 class GenericIE(InfoExtractor):
1139         """Generic last-resort information extractor."""
1140
1141         _VALID_URL = r'.*'
1142         IE_NAME = u'generic'
1143
	def __init__(self, downloader=None):
		# Plain pass-through constructor; all state lives on InfoExtractor.
		InfoExtractor.__init__(self, downloader)
1146
	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		# The generic IE is a last resort, so warn loudly before proceeding.
		self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
		self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1151
	def report_extraction(self, video_id):
		"""Report the start of information extraction for the given video id."""
		self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1155
	def report_following_redirect(self, new_url):
		"""Report that a redirect to new_url is being followed."""
		self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1159                 
1160         def _test_redirect(self, url):
1161                 """Check if it is a redirect, like url shorteners, in case restart chain."""
1162                 class HeadRequest(urllib2.Request):
1163                         def get_method(self):
1164                                 return "HEAD"
1165
1166                 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1167                         """
1168                         Subclass the HTTPRedirectHandler to make it use our 
1169                         HeadRequest also on the redirected URL
1170                         """
1171                         def redirect_request(self, req, fp, code, msg, headers, newurl): 
1172                                 if code in (301, 302, 303, 307):
1173                                         newurl = newurl.replace(' ', '%20') 
1174                                         newheaders = dict((k,v) for k,v in req.headers.items()
1175                                                                           if k.lower() not in ("content-length", "content-type"))
1176                                         return HeadRequest(newurl, 
1177                                                                            headers=newheaders,
1178                                                                            origin_req_host=req.get_origin_req_host(), 
1179                                                                            unverifiable=True) 
1180                                 else: 
1181                                         raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp) 
1182
1183                 class HTTPMethodFallback(urllib2.BaseHandler):
1184                         """
1185                         Fallback to GET if HEAD is not allowed (405 HTTP error)
1186                         """
1187                         def http_error_405(self, req, fp, code, msg, headers): 
1188                                 fp.read()
1189                                 fp.close()
1190
1191                                 newheaders = dict((k,v) for k,v in req.headers.items()
1192                                                                   if k.lower() not in ("content-length", "content-type"))
1193                                 return self.parent.open(urllib2.Request(req.get_full_url(), 
1194                                                                                                  headers=newheaders, 
1195                                                                                                  origin_req_host=req.get_origin_req_host(), 
1196                                                                                                  unverifiable=True))
1197
1198                 # Build our opener
1199                 opener = urllib2.OpenerDirector() 
1200                 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1201                                                 HTTPMethodFallback, HEADRedirectHandler,
1202                                                 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1203                         opener.add_handler(handler())
1204
1205                 response = opener.open(HeadRequest(url))
1206                 new_url = response.geturl()
1207                 
1208                 if url == new_url: return False
1209                 
1210                 self.report_following_redirect(new_url)
1211                 self._downloader.download([new_url])
1212                 return True
1213
1214         def _real_extract(self, url):
1215                 if self._test_redirect(url): return
1216
1217                 video_id = url.split('/')[-1]
1218                 request = urllib2.Request(url)
1219                 try:
1220                         self.report_download_webpage(video_id)
1221                         webpage = urllib2.urlopen(request).read()
1222                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1223                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1224                         return
1225                 except ValueError, err:
1226                         # since this is the last-resort InfoExtractor, if
1227                         # this error is thrown, it'll be thrown here
1228                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1229                         return
1230
1231                 self.report_extraction(video_id)
1232                 # Start with something easy: JW Player in SWFObject
1233                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1234                 if mobj is None:
1235                         # Broaden the search a little bit
1236                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1237                 if mobj is None:
1238                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1239                         return
1240
1241                 # It's possible that one of the regexes
1242                 # matched, but returned an empty group:
1243                 if mobj.group(1) is None:
1244                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1245                         return
1246
1247                 video_url = urllib.unquote(mobj.group(1))
1248                 video_id = os.path.basename(video_url)
1249
1250                 # here's a fun little line of code for you:
1251                 video_extension = os.path.splitext(video_id)[1][1:]
1252                 video_id = os.path.splitext(video_id)[0]
1253
1254                 # it's tempting to parse this further, but you would
1255                 # have to take into account all the variations like
1256                 #   Video Title - Site Name
1257                 #   Site Name | Video Title
1258                 #   Video Title - Tagline | Site Name
1259                 # and so on and so forth; it's just not practical
1260                 mobj = re.search(r'<title>(.*)</title>', webpage)
1261                 if mobj is None:
1262                         self._downloader.trouble(u'ERROR: unable to extract title')
1263                         return
1264                 video_title = mobj.group(1).decode('utf-8')
1265
1266                 # video uploader is domain name
1267                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1268                 if mobj is None:
1269                         self._downloader.trouble(u'ERROR: unable to extract title')
1270                         return
1271                 video_uploader = mobj.group(1).decode('utf-8')
1272
1273                 return [{
1274                         'id':           video_id.decode('utf-8'),
1275                         'url':          video_url.decode('utf-8'),
1276                         'uploader':     video_uploader,
1277                         'upload_date':  u'NA',
1278                         'title':        video_title,
1279                         'ext':          video_extension.decode('utf-8'),
1280                 }]
1281
1282
1283 class YoutubeSearchIE(InfoExtractor):
1284         """Information Extractor for YouTube search queries."""
1285         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1286         _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1287         _max_youtube_results = 1000
1288         IE_NAME = u'youtube:search'
1289
1290         def __init__(self, downloader=None):
1291                 InfoExtractor.__init__(self, downloader)
1292
1293         def report_download_page(self, query, pagenum):
1294                 """Report attempt to download search page with given number."""
1295                 query = query.decode(preferredencoding())
1296                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1297
1298         def _real_extract(self, query):
1299                 mobj = re.match(self._VALID_URL, query)
1300                 if mobj is None:
1301                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1302                         return
1303
1304                 prefix, query = query.split(':')
1305                 prefix = prefix[8:]
1306                 query = query.encode('utf-8')
1307                 if prefix == '':
1308                         self._download_n_results(query, 1)
1309                         return
1310                 elif prefix == 'all':
1311                         self._download_n_results(query, self._max_youtube_results)
1312                         return
1313                 else:
1314                         try:
1315                                 n = long(prefix)
1316                                 if n <= 0:
1317                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1318                                         return
1319                                 elif n > self._max_youtube_results:
1320                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1321                                         n = self._max_youtube_results
1322                                 self._download_n_results(query, n)
1323                                 return
1324                         except ValueError: # parsing prefix as integer fails
1325                                 self._download_n_results(query, 1)
1326                                 return
1327
1328         def _download_n_results(self, query, n):
1329                 """Downloads a specified number of results for a query"""
1330
1331                 video_ids = []
1332                 pagenum = 0
1333                 limit = n
1334
1335                 while (50 * pagenum) < limit:
1336                         self.report_download_page(query, pagenum+1)
1337                         result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1338                         request = urllib2.Request(result_url)
1339                         try:
1340                                 data = urllib2.urlopen(request).read()
1341                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1342                                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1343                                 return
1344                         api_response = json.loads(data)['data']
1345
1346                         new_ids = list(video['id'] for video in api_response['items'])
1347                         video_ids += new_ids
1348
1349                         limit = min(n, api_response['totalItems'])
1350                         pagenum += 1
1351
1352                 if len(video_ids) > n:
1353                         video_ids = video_ids[:n]
1354                 for id in video_ids:
1355                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1356                 return
1357
1358
1359 class GoogleSearchIE(InfoExtractor):
1360         """Information Extractor for Google Video search queries."""
1361         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1362         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1363         _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1364         _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1365         _max_google_results = 1000
1366         IE_NAME = u'video.google:search'
1367
1368         def __init__(self, downloader=None):
1369                 InfoExtractor.__init__(self, downloader)
1370
1371         def report_download_page(self, query, pagenum):
1372                 """Report attempt to download playlist page with given number."""
1373                 query = query.decode(preferredencoding())
1374                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1375
1376         def _real_extract(self, query):
1377                 mobj = re.match(self._VALID_URL, query)
1378                 if mobj is None:
1379                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1380                         return
1381
1382                 prefix, query = query.split(':')
1383                 prefix = prefix[8:]
1384                 query = query.encode('utf-8')
1385                 if prefix == '':
1386                         self._download_n_results(query, 1)
1387                         return
1388                 elif prefix == 'all':
1389                         self._download_n_results(query, self._max_google_results)
1390                         return
1391                 else:
1392                         try:
1393                                 n = long(prefix)
1394                                 if n <= 0:
1395                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1396                                         return
1397                                 elif n > self._max_google_results:
1398                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1399                                         n = self._max_google_results
1400                                 self._download_n_results(query, n)
1401                                 return
1402                         except ValueError: # parsing prefix as integer fails
1403                                 self._download_n_results(query, 1)
1404                                 return
1405
1406         def _download_n_results(self, query, n):
1407                 """Downloads a specified number of results for a query"""
1408
1409                 video_ids = []
1410                 pagenum = 0
1411
1412                 while True:
1413                         self.report_download_page(query, pagenum)
1414                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1415                         request = urllib2.Request(result_url)
1416                         try:
1417                                 page = urllib2.urlopen(request).read()
1418                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1419                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1420                                 return
1421
1422                         # Extract video identifiers
1423                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1424                                 video_id = mobj.group(1)
1425                                 if video_id not in video_ids:
1426                                         video_ids.append(video_id)
1427                                         if len(video_ids) == n:
1428                                                 # Specified n videos reached
1429                                                 for id in video_ids:
1430                                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1431                                                 return
1432
1433                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1434                                 for id in video_ids:
1435                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1436                                 return
1437
1438                         pagenum = pagenum + 1
1439
1440
1441 class YahooSearchIE(InfoExtractor):
1442         """Information Extractor for Yahoo! Video search queries."""
1443         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1444         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1445         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1446         _MORE_PAGES_INDICATOR = r'\s*Next'
1447         _max_yahoo_results = 1000
1448         IE_NAME = u'video.yahoo:search'
1449
1450         def __init__(self, downloader=None):
1451                 InfoExtractor.__init__(self, downloader)
1452
1453         def report_download_page(self, query, pagenum):
1454                 """Report attempt to download playlist page with given number."""
1455                 query = query.decode(preferredencoding())
1456                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1457
1458         def _real_extract(self, query):
1459                 mobj = re.match(self._VALID_URL, query)
1460                 if mobj is None:
1461                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1462                         return
1463
1464                 prefix, query = query.split(':')
1465                 prefix = prefix[8:]
1466                 query = query.encode('utf-8')
1467                 if prefix == '':
1468                         self._download_n_results(query, 1)
1469                         return
1470                 elif prefix == 'all':
1471                         self._download_n_results(query, self._max_yahoo_results)
1472                         return
1473                 else:
1474                         try:
1475                                 n = long(prefix)
1476                                 if n <= 0:
1477                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1478                                         return
1479                                 elif n > self._max_yahoo_results:
1480                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1481                                         n = self._max_yahoo_results
1482                                 self._download_n_results(query, n)
1483                                 return
1484                         except ValueError: # parsing prefix as integer fails
1485                                 self._download_n_results(query, 1)
1486                                 return
1487
1488         def _download_n_results(self, query, n):
1489                 """Downloads a specified number of results for a query"""
1490
1491                 video_ids = []
1492                 already_seen = set()
1493                 pagenum = 1
1494
1495                 while True:
1496                         self.report_download_page(query, pagenum)
1497                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1498                         request = urllib2.Request(result_url)
1499                         try:
1500                                 page = urllib2.urlopen(request).read()
1501                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1502                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1503                                 return
1504
1505                         # Extract video identifiers
1506                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1507                                 video_id = mobj.group(1)
1508                                 if video_id not in already_seen:
1509                                         video_ids.append(video_id)
1510                                         already_seen.add(video_id)
1511                                         if len(video_ids) == n:
1512                                                 # Specified n videos reached
1513                                                 for id in video_ids:
1514                                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1515                                                 return
1516
1517                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1518                                 for id in video_ids:
1519                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1520                                 return
1521
1522                         pagenum = pagenum + 1
1523
1524
1525 class YoutubePlaylistIE(InfoExtractor):
1526         """Information Extractor for YouTube playlists."""
1527
1528         _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1529         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1530         _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
1531         _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1532         IE_NAME = u'youtube:playlist'
1533
1534         def __init__(self, downloader=None):
1535                 InfoExtractor.__init__(self, downloader)
1536
1537         def report_download_page(self, playlist_id, pagenum):
1538                 """Report attempt to download playlist page with given number."""
1539                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1540
1541         def _real_extract(self, url):
1542                 # Extract playlist id
1543                 mobj = re.match(self._VALID_URL, url)
1544                 if mobj is None:
1545                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1546                         return
1547
1548                 # Single video case
1549                 if mobj.group(3) is not None:
1550                         self._downloader.download([mobj.group(3)])
1551                         return
1552
1553                 # Download playlist pages
1554                 # prefix is 'p' as default for playlists but there are other types that need extra care
1555                 playlist_prefix = mobj.group(1)
1556                 if playlist_prefix == 'a':
1557                         playlist_access = 'artist'
1558                 else:
1559                         playlist_prefix = 'p'
1560                         playlist_access = 'view_play_list'
1561                 playlist_id = mobj.group(2)
1562                 video_ids = []
1563                 pagenum = 1
1564
1565                 while True:
1566                         self.report_download_page(playlist_id, pagenum)
1567                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1568                         request = urllib2.Request(url)
1569                         try:
1570                                 page = urllib2.urlopen(request).read()
1571                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1572                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1573                                 return
1574
1575                         # Extract video identifiers
1576                         ids_in_page = []
1577                         for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1578                                 if mobj.group(1) not in ids_in_page:
1579                                         ids_in_page.append(mobj.group(1))
1580                         video_ids.extend(ids_in_page)
1581
1582                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1583                                 break
1584                         pagenum = pagenum + 1
1585
1586                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1587                 playlistend = self._downloader.params.get('playlistend', -1)
1588                 if playlistend == -1:
1589                         video_ids = video_ids[playliststart:]
1590                 else:
1591                         video_ids = video_ids[playliststart:playlistend]
1592
1593                 for id in video_ids:
1594                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1595                 return
1596
1597
1598 class YoutubeChannelIE(InfoExtractor):
1599         """Information Extractor for YouTube channels."""
1600
1601         _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1602         _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1603         _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
1604         IE_NAME = u'youtube:channel'
1605
1606         def report_download_page(self, channel_id, pagenum):
1607                 """Report attempt to download channel page with given number."""
1608                 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1609
1610         def _real_extract(self, url):
1611                 # Extract channel id
1612                 mobj = re.match(self._VALID_URL, url)
1613                 if mobj is None:
1614                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1615                         return
1616
1617                 # Download channel pages
1618                 channel_id = mobj.group(1)
1619                 video_ids = []
1620                 pagenum = 1
1621
1622                 while True:
1623                         self.report_download_page(channel_id, pagenum)
1624                         url = self._TEMPLATE_URL % (channel_id, pagenum)
1625                         request = urllib2.Request(url)
1626                         try:
1627                                 page = urllib2.urlopen(request).read()
1628                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1629                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1630                                 return
1631
1632                         # Extract video identifiers
1633                         ids_in_page = []
1634                         for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1635                                 if mobj.group(1) not in ids_in_page:
1636                                         ids_in_page.append(mobj.group(1))
1637                         video_ids.extend(ids_in_page)
1638
1639                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1640                                 break
1641                         pagenum = pagenum + 1
1642
1643                 for id in video_ids:
1644                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1645                 return
1646
1647
1648 class YoutubeUserIE(InfoExtractor):
1649         """Information Extractor for YouTube users."""
1650
1651         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1652         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1653         _GDATA_PAGE_SIZE = 50
1654         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1655         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1656         IE_NAME = u'youtube:user'
1657
1658         def __init__(self, downloader=None):
1659                 InfoExtractor.__init__(self, downloader)
1660
1661         def report_download_page(self, username, start_index):
1662                 """Report attempt to download user page."""
1663                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1664                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1665
1666         def _real_extract(self, url):
1667                 # Extract username
1668                 mobj = re.match(self._VALID_URL, url)
1669                 if mobj is None:
1670                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1671                         return
1672
1673                 username = mobj.group(1)
1674
1675                 # Download video ids using YouTube Data API. Result size per
1676                 # query is limited (currently to 50 videos) so we need to query
1677                 # page by page until there are no video ids - it means we got
1678                 # all of them.
1679
1680                 video_ids = []
1681                 pagenum = 0
1682
1683                 while True:
1684                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1685                         self.report_download_page(username, start_index)
1686
1687                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1688
1689                         try:
1690                                 page = urllib2.urlopen(request).read()
1691                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1692                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1693                                 return
1694
1695                         # Extract video identifiers
1696                         ids_in_page = []
1697
1698                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1699                                 if mobj.group(1) not in ids_in_page:
1700                                         ids_in_page.append(mobj.group(1))
1701
1702                         video_ids.extend(ids_in_page)
1703
1704                         # A little optimization - if current page is not
1705                         # "full", ie. does not contain PAGE_SIZE video ids then
1706                         # we can assume that this page is the last one - there
1707                         # are no more ids on further pages - no need to query
1708                         # again.
1709
1710                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1711                                 break
1712
1713                         pagenum += 1
1714
1715                 all_ids_count = len(video_ids)
1716                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1717                 playlistend = self._downloader.params.get('playlistend', -1)
1718
1719                 if playlistend == -1:
1720                         video_ids = video_ids[playliststart:]
1721                 else:
1722                         video_ids = video_ids[playliststart:playlistend]
1723
1724                 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1725                                 (username, all_ids_count, len(video_ids)))
1726
1727                 for video_id in video_ids:
1728                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1729
1730
1731 class BlipTVUserIE(InfoExtractor):
1732         """Information Extractor for blip.tv users."""
1733
1734         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1735         _PAGE_SIZE = 12
1736         IE_NAME = u'blip.tv:user'
1737
1738         def __init__(self, downloader=None):
1739                 InfoExtractor.__init__(self, downloader)
1740
1741         def report_download_page(self, username, pagenum):
1742                 """Report attempt to download user page."""
1743                 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1744                                 (self.IE_NAME, username, pagenum))
1745
1746         def _real_extract(self, url):
1747                 # Extract username
1748                 mobj = re.match(self._VALID_URL, url)
1749                 if mobj is None:
1750                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1751                         return
1752
1753                 username = mobj.group(1)
1754
1755                 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1756
1757                 request = urllib2.Request(url)
1758
1759                 try:
1760                         page = urllib2.urlopen(request).read().decode('utf-8')
1761                         mobj = re.search(r'data-users-id="([^"]+)"', page)
1762                         page_base = page_base % mobj.group(1)
1763                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1764                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1765                         return
1766
1767
1768                 # Download video ids using BlipTV Ajax calls. Result size per
1769                 # query is limited (currently to 12 videos) so we need to query
1770                 # page by page until there are no video ids - it means we got
1771                 # all of them.
1772
1773                 video_ids = []
1774                 pagenum = 1
1775
1776                 while True:
1777                         self.report_download_page(username, pagenum)
1778
1779                         request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1780
1781                         try:
1782                                 page = urllib2.urlopen(request).read().decode('utf-8')
1783                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1784                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1785                                 return
1786
1787                         # Extract video identifiers
1788                         ids_in_page = []
1789
1790                         for mobj in re.finditer(r'href="/([^"]+)"', page):
1791                                 if mobj.group(1) not in ids_in_page:
1792                                         ids_in_page.append(unescapeHTML(mobj.group(1)))
1793
1794                         video_ids.extend(ids_in_page)
1795
1796                         # A little optimization - if current page is not
1797                         # "full", ie. does not contain PAGE_SIZE video ids then
1798                         # we can assume that this page is the last one - there
1799                         # are no more ids on further pages - no need to query
1800                         # again.
1801
1802                         if len(ids_in_page) < self._PAGE_SIZE:
1803                                 break
1804
1805                         pagenum += 1
1806
1807                 all_ids_count = len(video_ids)
1808                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1809                 playlistend = self._downloader.params.get('playlistend', -1)
1810
1811                 if playlistend == -1:
1812                         video_ids = video_ids[playliststart:]
1813                 else:
1814                         video_ids = video_ids[playliststart:playlistend]
1815
1816                 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1817                                 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1818
1819                 for video_id in video_ids:
1820                         self._downloader.download([u'http://blip.tv/'+video_id])
1821
1822
1823 class DepositFilesIE(InfoExtractor):
1824         """Information extractor for depositfiles.com"""
1825
1826         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1827         IE_NAME = u'DepositFiles'
1828
1829         def __init__(self, downloader=None):
1830                 InfoExtractor.__init__(self, downloader)
1831
1832         def report_download_webpage(self, file_id):
1833                 """Report webpage download."""
1834                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1835
1836         def report_extraction(self, file_id):
1837                 """Report information extraction."""
1838                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1839
1840         def _real_extract(self, url):
1841                 file_id = url.split('/')[-1]
1842                 # Rebuild url in english locale
1843                 url = 'http://depositfiles.com/en/files/' + file_id
1844
1845                 # Retrieve file webpage with 'Free download' button pressed
1846                 free_download_indication = { 'gateway_result' : '1' }
1847                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1848                 try:
1849                         self.report_download_webpage(file_id)
1850                         webpage = urllib2.urlopen(request).read()
1851                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1852                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1853                         return
1854
1855                 # Search for the real file URL
1856                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1857                 if (mobj is None) or (mobj.group(1) is None):
1858                         # Try to figure out reason of the error.
1859                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1860                         if (mobj is not None) and (mobj.group(1) is not None):
1861                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1862                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1863                         else:
1864                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1865                         return
1866
1867                 file_url = mobj.group(1)
1868                 file_extension = os.path.splitext(file_url)[1][1:]
1869
1870                 # Search for file title
1871                 mobj = re.search(r'<b title="(.*?)">', webpage)
1872                 if mobj is None:
1873                         self._downloader.trouble(u'ERROR: unable to extract title')
1874                         return
1875                 file_title = mobj.group(1).decode('utf-8')
1876
1877                 return [{
1878                         'id':           file_id.decode('utf-8'),
1879                         'url':          file_url.decode('utf-8'),
1880                         'uploader':     u'NA',
1881                         'upload_date':  u'NA',
1882                         'title':        file_title,
1883                         'ext':          file_extension.decode('utf-8'),
1884                 }]
1885
1886
1887 class FacebookIE(InfoExtractor):
1888         """Information Extractor for Facebook"""
1889
1890         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1891         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1892         _NETRC_MACHINE = 'facebook'
1893         _available_formats = ['video', 'highqual', 'lowqual']
1894         _video_extensions = {
1895                 'video': 'mp4',
1896                 'highqual': 'mp4',
1897                 'lowqual': 'mp4',
1898         }
1899         IE_NAME = u'facebook'
1900
1901         def __init__(self, downloader=None):
1902                 InfoExtractor.__init__(self, downloader)
1903
1904         def _reporter(self, message):
1905                 """Add header and report message."""
1906                 self._downloader.to_screen(u'[facebook] %s' % message)
1907
1908         def report_login(self):
1909                 """Report attempt to log in."""
1910                 self._reporter(u'Logging in')
1911
1912         def report_video_webpage_download(self, video_id):
1913                 """Report attempt to download video webpage."""
1914                 self._reporter(u'%s: Downloading video webpage' % video_id)
1915
1916         def report_information_extraction(self, video_id):
1917                 """Report attempt to extract video information."""
1918                 self._reporter(u'%s: Extracting video information' % video_id)
1919
1920         def _parse_page(self, video_webpage):
1921                 """Extract video information from page"""
1922                 # General data
1923                 data = {'title': r'\("video_title", "(.*?)"\)',
1924                         'description': r'<div class="datawrap">(.*?)</div>',
1925                         'owner': r'\("video_owner_name", "(.*?)"\)',
1926                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1927                         }
1928                 video_info = {}
1929                 for piece in data.keys():
1930                         mobj = re.search(data[piece], video_webpage)
1931                         if mobj is not None:
1932                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1933
1934                 # Video urls
1935                 video_urls = {}
1936                 for fmt in self._available_formats:
1937                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1938                         if mobj is not None:
1939                                 # URL is in a Javascript segment inside an escaped Unicode format within
1940                                 # the generally utf-8 page
1941                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1942                 video_info['video_urls'] = video_urls
1943
1944                 return video_info
1945
1946         def _real_initialize(self):
1947                 if self._downloader is None:
1948                         return
1949
1950                 useremail = None
1951                 password = None
1952                 downloader_params = self._downloader.params
1953
1954                 # Attempt to use provided username and password or .netrc data
1955                 if downloader_params.get('username', None) is not None:
1956                         useremail = downloader_params['username']
1957                         password = downloader_params['password']
1958                 elif downloader_params.get('usenetrc', False):
1959                         try:
1960                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1961                                 if info is not None:
1962                                         useremail = info[0]
1963                                         password = info[2]
1964                                 else:
1965                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1966                         except (IOError, netrc.NetrcParseError), err:
1967                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
1968                                 return
1969
1970                 if useremail is None:
1971                         return
1972
1973                 # Log in
1974                 login_form = {
1975                         'email': useremail,
1976                         'pass': password,
1977                         'login': 'Log+In'
1978                         }
1979                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1980                 try:
1981                         self.report_login()
1982                         login_results = urllib2.urlopen(request).read()
1983                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1984                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1985                                 return
1986                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1987                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
1988                         return
1989
1990         def _real_extract(self, url):
1991                 mobj = re.match(self._VALID_URL, url)
1992                 if mobj is None:
1993                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1994                         return
1995                 video_id = mobj.group('ID')
1996
1997                 # Get video webpage
1998                 self.report_video_webpage_download(video_id)
1999                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2000                 try:
2001                         page = urllib2.urlopen(request)
2002                         video_webpage = page.read()
2003                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2004                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2005                         return
2006
2007                 # Start extracting information
2008                 self.report_information_extraction(video_id)
2009
2010                 # Extract information
2011                 video_info = self._parse_page(video_webpage)
2012
2013                 # uploader
2014                 if 'owner' not in video_info:
2015                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2016                         return
2017                 video_uploader = video_info['owner']
2018
2019                 # title
2020                 if 'title' not in video_info:
2021                         self._downloader.trouble(u'ERROR: unable to extract video title')
2022                         return
2023                 video_title = video_info['title']
2024                 video_title = video_title.decode('utf-8')
2025
2026                 # thumbnail image
2027                 if 'thumbnail' not in video_info:
2028                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2029                         video_thumbnail = ''
2030                 else:
2031                         video_thumbnail = video_info['thumbnail']
2032
2033                 # upload date
2034                 upload_date = u'NA'
2035                 if 'upload_date' in video_info:
2036                         upload_time = video_info['upload_date']
2037                         timetuple = email.utils.parsedate_tz(upload_time)
2038                         if timetuple is not None:
2039                                 try:
2040                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2041                                 except:
2042                                         pass
2043
2044                 # description
2045                 video_description = video_info.get('description', 'No description available.')
2046
2047                 url_map = video_info['video_urls']
2048                 if len(url_map.keys()) > 0:
2049                         # Decide which formats to download
2050                         req_format = self._downloader.params.get('format', None)
2051                         format_limit = self._downloader.params.get('format_limit', None)
2052
2053                         if format_limit is not None and format_limit in self._available_formats:
2054                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2055                         else:
2056                                 format_list = self._available_formats
2057                         existing_formats = [x for x in format_list if x in url_map]
2058                         if len(existing_formats) == 0:
2059                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2060                                 return
2061                         if req_format is None:
2062                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2063                         elif req_format == 'worst':
2064                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2065                         elif req_format == '-1':
2066                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2067                         else:
2068                                 # Specific format
2069                                 if req_format not in url_map:
2070                                         self._downloader.trouble(u'ERROR: requested format not available')
2071                                         return
2072                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2073
2074                 results = []
2075                 for format_param, video_real_url in video_url_list:
2076                         # Extension
2077                         video_extension = self._video_extensions.get(format_param, 'mp4')
2078
2079                         results.append({
2080                                 'id':           video_id.decode('utf-8'),
2081                                 'url':          video_real_url.decode('utf-8'),
2082                                 'uploader':     video_uploader.decode('utf-8'),
2083                                 'upload_date':  upload_date,
2084                                 'title':        video_title,
2085                                 'ext':          video_extension.decode('utf-8'),
2086                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2087                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2088                                 'description':  video_description.decode('utf-8'),
2089                         })
2090                 return results
2091
2092 class BlipTVIE(InfoExtractor):
2093         """Information extractor for blip.tv"""
2094
2095         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2096         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2097         IE_NAME = u'blip.tv'
2098
2099         def report_extraction(self, file_id):
2100                 """Report information extraction."""
2101                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2102
2103         def report_direct_download(self, title):
2104                 """Report information extraction."""
2105                 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2106
2107         def _real_extract(self, url):
2108                 mobj = re.match(self._VALID_URL, url)
2109                 if mobj is None:
2110                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2111                         return
2112
2113                 if '?' in url:
2114                         cchar = '&'
2115                 else:
2116                         cchar = '?'
2117                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2118                 request = urllib2.Request(json_url.encode('utf-8'))
2119                 self.report_extraction(mobj.group(1))
2120                 info = None
2121                 try:
2122                         urlh = urllib2.urlopen(request)
2123                         if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2124                                 basename = url.split('/')[-1]
2125                                 title,ext = os.path.splitext(basename)
2126                                 title = title.decode('UTF-8')
2127                                 ext = ext.replace('.', '')
2128                                 self.report_direct_download(title)
2129                                 info = {
2130                                         'id': title,
2131                                         'url': url,
2132                                         'title': title,
2133                                         'ext': ext,
2134                                         'urlhandle': urlh
2135                                 }
2136                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2137                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2138                         return
2139                 if info is None: # Regular URL
2140                         try:
2141                                 json_code = urlh.read()
2142                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2143                                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2144                                 return
2145
2146                         try:
2147                                 json_data = json.loads(json_code)
2148                                 if 'Post' in json_data:
2149                                         data = json_data['Post']
2150                                 else:
2151                                         data = json_data
2152
2153                                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2154                                 video_url = data['media']['url']
2155                                 umobj = re.match(self._URL_EXT, video_url)
2156                                 if umobj is None:
2157                                         raise ValueError('Can not determine filename extension')
2158                                 ext = umobj.group(1)
2159
2160                                 info = {
2161                                         'id': data['item_id'],
2162                                         'url': video_url,
2163                                         'uploader': data['display_name'],
2164                                         'upload_date': upload_date,
2165                                         'title': data['title'],
2166                                         'ext': ext,
2167                                         'format': data['media']['mimeType'],
2168                                         'thumbnail': data['thumbnailUrl'],
2169                                         'description': data['description'],
2170                                         'player_url': data['embedUrl']
2171                                 }
2172                         except (ValueError,KeyError), err:
2173                                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2174                                 return
2175
2176                 std_headers['User-Agent'] = 'iTunes/10.6.1'
2177                 return [info]
2178
2179
2180 class MyVideoIE(InfoExtractor):
2181         """Information Extractor for myvideo.de."""
2182
2183         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2184         IE_NAME = u'myvideo'
2185
2186         def __init__(self, downloader=None):
2187                 InfoExtractor.__init__(self, downloader)
2188         
2189         def report_download_webpage(self, video_id):
2190                 """Report webpage download."""
2191                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2192
2193         def report_extraction(self, video_id):
2194                 """Report information extraction."""
2195                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2196
2197         def _real_extract(self,url):
2198                 mobj = re.match(self._VALID_URL, url)
2199                 if mobj is None:
2200                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2201                         return
2202
2203                 video_id = mobj.group(1)
2204
2205                 # Get video webpage
2206                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2207                 try:
2208                         self.report_download_webpage(video_id)
2209                         webpage = urllib2.urlopen(request).read()
2210                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2211                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
2212                         return
2213
2214                 self.report_extraction(video_id)
2215                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2216                                  webpage)
2217                 if mobj is None:
2218                         self._downloader.trouble(u'ERROR: unable to extract media URL')
2219                         return
2220                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2221
2222                 mobj = re.search('<title>([^<]+)</title>', webpage)
2223                 if mobj is None:
2224                         self._downloader.trouble(u'ERROR: unable to extract title')
2225                         return
2226
2227                 video_title = mobj.group(1)
2228
2229                 return [{
2230                         'id':           video_id,
2231                         'url':          video_url,
2232                         'uploader':     u'NA',
2233                         'upload_date':  u'NA',
2234                         'title':        video_title,
2235                         'ext':          u'flv',
2236                 }]
2237
2238 class ComedyCentralIE(InfoExtractor):
2239         """Information extractor for The Daily Show and Colbert Report """
2240
2241         _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2242         IE_NAME = u'comedycentral'
2243
2244         _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2245
2246         _video_extensions = {
2247                 '3500': 'mp4',
2248                 '2200': 'mp4',
2249                 '1700': 'mp4',
2250                 '1200': 'mp4',
2251                 '750': 'mp4',
2252                 '400': 'mp4',
2253         }
2254         _video_dimensions = {
2255                 '3500': '1280x720',
2256                 '2200': '960x540',
2257                 '1700': '768x432',
2258                 '1200': '640x360',
2259                 '750': '512x288',
2260                 '400': '384x216',
2261         }
2262
2263         def report_extraction(self, episode_id):
2264                 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2265
2266         def report_config_download(self, episode_id):
2267                 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2268
2269         def report_index_download(self, episode_id):
2270                 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2271
2272         def report_player_url(self, episode_id):
2273                 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2274
2275
2276         def _print_formats(self, formats):
2277                 print('Available formats:')
2278                 for x in formats:
2279                         print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2280
2281
2282         def _real_extract(self, url):
2283                 mobj = re.match(self._VALID_URL, url)
2284                 if mobj is None:
2285                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2286                         return
2287
2288                 if mobj.group('shortname'):
2289                         if mobj.group('shortname') in ('tds', 'thedailyshow'):
2290                                 url = u'http://www.thedailyshow.com/full-episodes/'
2291                         else:
2292                                 url = u'http://www.colbertnation.com/full-episodes/'
2293                         mobj = re.match(self._VALID_URL, url)
2294                         assert mobj is not None
2295
2296                 dlNewest = not mobj.group('episode')
2297                 if dlNewest:
2298                         epTitle = mobj.group('showname')
2299                 else:
2300                         epTitle = mobj.group('episode')
2301
2302                 req = urllib2.Request(url)
2303                 self.report_extraction(epTitle)
2304                 try:
2305                         htmlHandle = urllib2.urlopen(req)
2306                         html = htmlHandle.read()
2307                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2308                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2309                         return
2310                 if dlNewest:
2311                         url = htmlHandle.geturl()
2312                         mobj = re.match(self._VALID_URL, url)
2313                         if mobj is None:
2314                                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2315                                 return
2316                         if mobj.group('episode') == '':
2317                                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2318                                 return
2319                         epTitle = mobj.group('episode')
2320
2321                 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2322
2323                 if len(mMovieParams) == 0:
2324                         # The Colbert Report embeds the information in a without
2325                         # a URL prefix; so extract the alternate reference
2326                         # and then add the URL prefix manually.
2327
2328                         altMovieParams = re.findall('data-mgid="([^"]*episode.*?:.*?)"', html)
2329                         if len(altMovieParams) == 0:
2330                                 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2331                                 return
2332                         else:
2333                                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2334                 
2335                 playerUrl_raw = mMovieParams[0][0]
2336                 self.report_player_url(epTitle)
2337                 try:
2338                         urlHandle = urllib2.urlopen(playerUrl_raw)
2339                         playerUrl = urlHandle.geturl()
2340                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2341                         self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
2342                         return
2343
2344                 uri = mMovieParams[0][1]
2345                 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2346                 self.report_index_download(epTitle)
2347                 try:
2348                         indexXml = urllib2.urlopen(indexUrl).read()
2349                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2350                         self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2351                         return
2352
2353                 results = []
2354
2355                 idoc = xml.etree.ElementTree.fromstring(indexXml)
2356                 itemEls = idoc.findall('.//item')
2357                 for itemEl in itemEls:
2358                         mediaId = itemEl.findall('./guid')[0].text
2359                         shortMediaId = mediaId.split(':')[-1]
2360                         showId = mediaId.split(':')[-2].replace('.com', '')
2361                         officialTitle = itemEl.findall('./title')[0].text
2362                         officialDate = itemEl.findall('./pubDate')[0].text
2363
2364                         configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2365                                                 urllib.urlencode({'uri': mediaId}))
2366                         configReq = urllib2.Request(configUrl)
2367                         self.report_config_download(epTitle)
2368                         try:
2369                                 configXml = urllib2.urlopen(configReq).read()
2370                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2371                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2372                                 return
2373
2374                         cdoc = xml.etree.ElementTree.fromstring(configXml)
2375                         turls = []
2376                         for rendition in cdoc.findall('.//rendition'):
2377                                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2378                                 turls.append(finfo)
2379
2380                         if len(turls) == 0:
2381                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2382                                 continue
2383                         
2384                         if self._downloader.params.get('listformats', None):
2385                                 self._print_formats([i[0] for i in turls])
2386                                 return
2387
2388                         # For now, just pick the highest bitrate
2389                         format,video_url = turls[-1]
2390
2391                         # Get the format arg from the arg stream
2392                         req_format = self._downloader.params.get('format', None)
2393
2394                         # Select format if we can find one
2395                         for f,v in turls:
2396                                 if f == req_format:
2397                                         format, video_url = f, v
2398                                         break
2399
2400                         # Patch to download from alternative CDN, which does not
2401                         # break on current RTMPDump builds
2402                         broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
2403                         better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
2404
2405                         if video_url.startswith(broken_cdn):
2406                                 video_url = video_url.replace(broken_cdn, better_cdn)
2407
2408                         effTitle = showId + u'-' + epTitle
2409                         info = {
2410                                 'id': shortMediaId,
2411                                 'url': video_url,
2412                                 'uploader': showId,
2413                                 'upload_date': officialDate,
2414                                 'title': effTitle,
2415                                 'ext': 'mp4',
2416                                 'format': format,
2417                                 'thumbnail': None,
2418                                 'description': officialTitle,
2419                                 'player_url': None #playerUrl
2420                         }
2421
2422                         results.append(info)
2423                         
2424                 return results
2425
2426
2427 class EscapistIE(InfoExtractor):
2428         """Information extractor for The Escapist """
2429
2430         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2431         IE_NAME = u'escapist'
2432
2433         def report_extraction(self, showName):
2434                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2435
2436         def report_config_download(self, showName):
2437                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2438
2439         def _real_extract(self, url):
2440                 mobj = re.match(self._VALID_URL, url)
2441                 if mobj is None:
2442                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2443                         return
2444                 showName = mobj.group('showname')
2445                 videoId = mobj.group('episode')
2446
2447                 self.report_extraction(showName)
2448                 try:
2449                         webPage = urllib2.urlopen(url)
2450                         webPageBytes = webPage.read()
2451                         m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2452                         webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2453                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2454                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2455                         return
2456
2457                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2458                 description = unescapeHTML(descMatch.group(1))
2459                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2460                 imgUrl = unescapeHTML(imgMatch.group(1))
2461                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2462                 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2463                 configUrlMatch = re.search('config=(.*)$', playerUrl)
2464                 configUrl = urllib2.unquote(configUrlMatch.group(1))
2465
2466                 self.report_config_download(showName)
2467                 try:
2468                         configJSON = urllib2.urlopen(configUrl).read()
2469                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2470                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2471                         return
2472
2473                 # Technically, it's JavaScript, not JSON
2474                 configJSON = configJSON.replace("'", '"')
2475
2476                 try:
2477                         config = json.loads(configJSON)
2478                 except (ValueError,), err:
2479                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2480                         return
2481
2482                 playlist = config['playlist']
2483                 videoUrl = playlist[1]['url']
2484
2485                 info = {
2486                         'id': videoId,
2487                         'url': videoUrl,
2488                         'uploader': showName,
2489                         'upload_date': None,
2490                         'title': showName,
2491                         'ext': 'flv',
2492                         'thumbnail': imgUrl,
2493                         'description': description,
2494                         'player_url': playerUrl,
2495                 }
2496
2497                 return [info]
2498
2499
2500 class CollegeHumorIE(InfoExtractor):
2501         """Information extractor for collegehumor.com"""
2502
2503         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2504         IE_NAME = u'collegehumor'
2505
2506         def report_webpage(self, video_id):
2507                 """Report information extraction."""
2508                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2509
2510         def report_extraction(self, video_id):
2511                 """Report information extraction."""
2512                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2513
2514         def _real_extract(self, url):
2515                 mobj = re.match(self._VALID_URL, url)
2516                 if mobj is None:
2517                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2518                         return
2519                 video_id = mobj.group('videoid')
2520
2521                 self.report_webpage(video_id)
2522                 request = urllib2.Request(url)
2523                 try:
2524                         webpage = urllib2.urlopen(request).read()
2525                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2526                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2527                         return
2528
2529                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2530                 if m is None:
2531                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2532                         return
2533                 internal_video_id = m.group('internalvideoid')
2534
2535                 info = {
2536                         'id': video_id,
2537                         'internal_id': internal_video_id,
2538                 }
2539
2540                 self.report_extraction(video_id)
2541                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2542                 try:
2543                         metaXml = urllib2.urlopen(xmlUrl).read()
2544                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2545                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2546                         return
2547
2548                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2549                 try:
2550                         videoNode = mdoc.findall('./video')[0]
2551                         info['description'] = videoNode.findall('./description')[0].text
2552                         info['title'] = videoNode.findall('./caption')[0].text
2553                         info['url'] = videoNode.findall('./file')[0].text
2554                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2555                         info['ext'] = info['url'].rpartition('.')[2]
2556                 except IndexError:
2557                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2558                         return
2559
2560                 return [info]
2561
2562
2563 class XVideosIE(InfoExtractor):
2564         """Information extractor for xvideos.com"""
2565
2566         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2567         IE_NAME = u'xvideos'
2568
2569         def report_webpage(self, video_id):
2570                 """Report information extraction."""
2571                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2572
2573         def report_extraction(self, video_id):
2574                 """Report information extraction."""
2575                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2576
2577         def _real_extract(self, url):
2578                 mobj = re.match(self._VALID_URL, url)
2579                 if mobj is None:
2580                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2581                         return
2582                 video_id = mobj.group(1).decode('utf-8')
2583
2584                 self.report_webpage(video_id)
2585
2586                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2587                 try:
2588                         webpage = urllib2.urlopen(request).read()
2589                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2590                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2591                         return
2592
2593                 self.report_extraction(video_id)
2594
2595
2596                 # Extract video URL
2597                 mobj = re.search(r'flv_url=(.+?)&', webpage)
2598                 if mobj is None:
2599                         self._downloader.trouble(u'ERROR: unable to extract video url')
2600                         return
2601                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2602
2603
2604                 # Extract title
2605                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2606                 if mobj is None:
2607                         self._downloader.trouble(u'ERROR: unable to extract video title')
2608                         return
2609                 video_title = mobj.group(1).decode('utf-8')
2610
2611
2612                 # Extract video thumbnail
2613                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2614                 if mobj is None:
2615                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2616                         return
2617                 video_thumbnail = mobj.group(0).decode('utf-8')
2618
2619                 info = {
2620                         'id': video_id,
2621                         'url': video_url,
2622                         'uploader': None,
2623                         'upload_date': None,
2624                         'title': video_title,
2625                         'ext': 'flv',
2626                         'thumbnail': video_thumbnail,
2627                         'description': None,
2628                 }
2629
2630                 return [info]
2631
2632
2633 class SoundcloudIE(InfoExtractor):
2634         """Information extractor for soundcloud.com
2635            To access the media, the uid of the song and a stream token
2636            must be extracted from the page source and the script must make
2637            a request to media.soundcloud.com/crossdomain.xml. Then
2638            the media can be grabbed by requesting from an url composed
2639            of the stream token and uid
2640          """
2641
2642         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2643         IE_NAME = u'soundcloud'
2644
2645         def __init__(self, downloader=None):
2646                 InfoExtractor.__init__(self, downloader)
2647
2648         def report_webpage(self, video_id):
2649                 """Report information extraction."""
2650                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2651
2652         def report_extraction(self, video_id):
2653                 """Report information extraction."""
2654                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2655
2656         def _real_extract(self, url):
2657                 mobj = re.match(self._VALID_URL, url)
2658                 if mobj is None:
2659                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2660                         return
2661
2662                 # extract uploader (which is in the url)
2663                 uploader = mobj.group(1).decode('utf-8')
2664                 # extract simple title (uploader + slug of song title)
2665                 slug_title =  mobj.group(2).decode('utf-8')
2666                 simple_title = uploader + u'-' + slug_title
2667
2668                 self.report_webpage('%s/%s' % (uploader, slug_title))
2669
2670                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2671                 try:
2672                         webpage = urllib2.urlopen(request).read()
2673                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2674                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2675                         return
2676
2677                 self.report_extraction('%s/%s' % (uploader, slug_title))
2678
2679                 # extract uid and stream token that soundcloud hands out for access
2680                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2681                 if mobj:
2682                         video_id = mobj.group(1)
2683                         stream_token = mobj.group(2)
2684
2685                 # extract unsimplified title
2686                 mobj = re.search('"title":"(.*?)",', webpage)
2687                 if mobj:
2688                         title = mobj.group(1).decode('utf-8')
2689                 else:
2690                         title = simple_title
2691
2692                 # construct media url (with uid/token)
2693                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2694                 mediaURL = mediaURL % (video_id, stream_token)
2695
2696                 # description
2697                 description = u'No description available'
2698                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2699                 if mobj:
2700                         description = mobj.group(1)
2701
2702                 # upload date
2703                 upload_date = None
2704                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2705                 if mobj:
2706                         try:
2707                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2708                         except Exception, e:
2709                                 self._downloader.to_stderr(compat_str(e))
2710
2711                 # for soundcloud, a request to a cross domain is required for cookies
2712                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2713
2714                 return [{
2715                         'id':           video_id.decode('utf-8'),
2716                         'url':          mediaURL,
2717                         'uploader':     uploader.decode('utf-8'),
2718                         'upload_date':  upload_date,
2719                         'title':        title,
2720                         'ext':          u'mp3',
2721                         'description': description.decode('utf-8')
2722                 }]
2723
2724
2725 class InfoQIE(InfoExtractor):
2726         """Information extractor for infoq.com"""
2727
2728         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2729         IE_NAME = u'infoq'
2730
2731         def report_webpage(self, video_id):
2732                 """Report information extraction."""
2733                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2734
2735         def report_extraction(self, video_id):
2736                 """Report information extraction."""
2737                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2738
2739         def _real_extract(self, url):
2740                 mobj = re.match(self._VALID_URL, url)
2741                 if mobj is None:
2742                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2743                         return
2744
2745                 self.report_webpage(url)
2746
2747                 request = urllib2.Request(url)
2748                 try:
2749                         webpage = urllib2.urlopen(request).read()
2750                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2751                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2752                         return
2753
2754                 self.report_extraction(url)
2755
2756
2757                 # Extract video URL
2758                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2759                 if mobj is None:
2760                         self._downloader.trouble(u'ERROR: unable to extract video url')
2761                         return
2762                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2763
2764
2765                 # Extract title
2766                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2767                 if mobj is None:
2768                         self._downloader.trouble(u'ERROR: unable to extract video title')
2769                         return
2770                 video_title = mobj.group(1).decode('utf-8')
2771
2772                 # Extract description
2773                 video_description = u'No description available.'
2774                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2775                 if mobj is not None:
2776                         video_description = mobj.group(1).decode('utf-8')
2777
2778                 video_filename = video_url.split('/')[-1]
2779                 video_id, extension = video_filename.split('.')
2780
2781                 info = {
2782                         'id': video_id,
2783                         'url': video_url,
2784                         'uploader': None,
2785                         'upload_date': None,
2786                         'title': video_title,
2787                         'ext': extension, # Extension is always(?) mp4, but seems to be flv
2788                         'thumbnail': None,
2789                         'description': video_description,
2790                 }
2791
2792                 return [info]
2793
2794 class MixcloudIE(InfoExtractor):
2795         """Information extractor for www.mixcloud.com"""
2796         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2797         IE_NAME = u'mixcloud'
2798
2799         def __init__(self, downloader=None):
2800                 InfoExtractor.__init__(self, downloader)
2801
2802         def report_download_json(self, file_id):
2803                 """Report JSON download."""
2804                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2805
2806         def report_extraction(self, file_id):
2807                 """Report information extraction."""
2808                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2809
2810         def get_urls(self, jsonData, fmt, bitrate='best'):
2811                 """Get urls from 'audio_formats' section in json"""
2812                 file_url = None
2813                 try:
2814                         bitrate_list = jsonData[fmt]
2815                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2816                                 bitrate = max(bitrate_list) # select highest
2817
2818                         url_list = jsonData[fmt][bitrate]
2819                 except TypeError: # we have no bitrate info.
2820                         url_list = jsonData[fmt]
2821                 return url_list
2822
2823         def check_urls(self, url_list):
2824                 """Returns 1st active url from list"""
2825                 for url in url_list:
2826                         try:
2827                                 urllib2.urlopen(url)
2828                                 return url
2829                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2830                                 url = None
2831
2832                 return None
2833
2834         def _print_formats(self, formats):
2835                 print('Available formats:')
2836                 for fmt in formats.keys():
2837                         for b in formats[fmt]:
2838                                 try:
2839                                         ext = formats[fmt][b][0]
2840                                         print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2841                                 except TypeError: # we have no bitrate info
2842                                         ext = formats[fmt][0]
2843                                         print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2844                                         break
2845
2846         def _real_extract(self, url):
2847                 mobj = re.match(self._VALID_URL, url)
2848                 if mobj is None:
2849                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2850                         return
2851                 # extract uploader & filename from url
2852                 uploader = mobj.group(1).decode('utf-8')
2853                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2854
2855                 # construct API request
2856                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2857                 # retrieve .json file with links to files
2858                 request = urllib2.Request(file_url)
2859                 try:
2860                         self.report_download_json(file_url)
2861                         jsonData = urllib2.urlopen(request).read()
2862                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2863                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2864                         return
2865
2866                 # parse JSON
2867                 json_data = json.loads(jsonData)
2868                 player_url = json_data['player_swf_url']
2869                 formats = dict(json_data['audio_formats'])
2870
2871                 req_format = self._downloader.params.get('format', None)
2872                 bitrate = None
2873
2874                 if self._downloader.params.get('listformats', None):
2875                         self._print_formats(formats)
2876                         return
2877
2878                 if req_format is None or req_format == 'best':
2879                         for format_param in formats.keys():
2880                                 url_list = self.get_urls(formats, format_param)
2881                                 # check urls
2882                                 file_url = self.check_urls(url_list)
2883                                 if file_url is not None:
2884                                         break # got it!
2885                 else:
2886                         if req_format not in formats.keys():
2887                                 self._downloader.trouble(u'ERROR: format is not available')
2888                                 return
2889
2890                         url_list = self.get_urls(formats, req_format)
2891                         file_url = self.check_urls(url_list)
2892                         format_param = req_format
2893
2894                 return [{
2895                         'id': file_id.decode('utf-8'),
2896                         'url': file_url.decode('utf-8'),
2897                         'uploader':     uploader.decode('utf-8'),
2898                         'upload_date': u'NA',
2899                         'title': json_data['name'],
2900                         'ext': file_url.split('.')[-1].decode('utf-8'),
2901                         'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2902                         'thumbnail': json_data['thumbnail_url'],
2903                         'description': json_data['description'],
2904                         'player_url': player_url.decode('utf-8'),
2905                 }]
2906
2907 class StanfordOpenClassroomIE(InfoExtractor):
2908         """Information extractor for Stanford's Open ClassRoom"""
2909
2910         _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2911         IE_NAME = u'stanfordoc'
2912
2913         def report_download_webpage(self, objid):
2914                 """Report information extraction."""
2915                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2916
2917         def report_extraction(self, video_id):
2918                 """Report information extraction."""
2919                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2920
2921         def _real_extract(self, url):
2922                 mobj = re.match(self._VALID_URL, url)
2923                 if mobj is None:
2924                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2925                         return
2926
2927                 if mobj.group('course') and mobj.group('video'): # A specific video
2928                         course = mobj.group('course')
2929                         video = mobj.group('video')
2930                         info = {
2931                                 'id': course + '_' + video,
2932                         }
2933
2934                         self.report_extraction(info['id'])
2935                         baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2936                         xmlUrl = baseUrl + video + '.xml'
2937                         try:
2938                                 metaXml = urllib2.urlopen(xmlUrl).read()
2939                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2940                                 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2941                                 return
2942                         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2943                         try:
2944                                 info['title'] = mdoc.findall('./title')[0].text
2945                                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2946                         except IndexError:
2947                                 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2948                                 return
2949                         info['ext'] = info['url'].rpartition('.')[2]
2950                         return [info]
2951                 elif mobj.group('course'): # A course page
2952                         course = mobj.group('course')
2953                         info = {
2954                                 'id': course,
2955                                 'type': 'playlist',
2956                         }
2957
2958                         self.report_download_webpage(info['id'])
2959                         try:
2960                                 coursepage = urllib2.urlopen(url).read()
2961                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2962                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2963                                 return
2964
2965                         m = re.search('<h1>([^<]+)</h1>', coursepage)
2966                         if m:
2967                                 info['title'] = unescapeHTML(m.group(1))
2968                         else:
2969                                 info['title'] = info['id']
2970
2971                         m = re.search('<description>([^<]+)</description>', coursepage)
2972                         if m:
2973                                 info['description'] = unescapeHTML(m.group(1))
2974
2975                         links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2976                         info['list'] = [
2977                                 {
2978                                         'type': 'reference',
2979                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2980                                 }
2981                                         for vpage in links]
2982                         results = []
2983                         for entry in info['list']:
2984                                 assert entry['type'] == 'reference'
2985                                 results += self.extract(entry['url'])
2986                         return results
2987                         
2988                 else: # Root page
2989                         info = {
2990                                 'id': 'Stanford OpenClassroom',
2991                                 'type': 'playlist',
2992                         }
2993
2994                         self.report_download_webpage(info['id'])
2995                         rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2996                         try:
2997                                 rootpage = urllib2.urlopen(rootURL).read()
2998                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2999                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3000                                 return
3001
3002                         info['title'] = info['id']
3003
3004                         links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3005                         info['list'] = [
3006                                 {
3007                                         'type': 'reference',
3008                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3009                                 }
3010                                         for cpage in links]
3011
3012                         results = []
3013                         for entry in info['list']:
3014                                 assert entry['type'] == 'reference'
3015                                 results += self.extract(entry['url'])
3016                         return results
3017
3018 class MTVIE(InfoExtractor):
3019         """Information extractor for MTV.com"""
3020
3021         _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3022         IE_NAME = u'mtv'
3023
3024         def report_webpage(self, video_id):
3025                 """Report information extraction."""
3026                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3027
3028         def report_extraction(self, video_id):
3029                 """Report information extraction."""
3030                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3031
3032         def _real_extract(self, url):
3033                 mobj = re.match(self._VALID_URL, url)
3034                 if mobj is None:
3035                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3036                         return
3037                 if not mobj.group('proto'):
3038                         url = 'http://' + url
3039                 video_id = mobj.group('videoid')
3040                 self.report_webpage(video_id)
3041
3042                 request = urllib2.Request(url)
3043                 try:
3044                         webpage = urllib2.urlopen(request).read()
3045                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3046                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
3047                         return
3048
3049                 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3050                 if mobj is None:
3051                         self._downloader.trouble(u'ERROR: unable to extract song name')
3052                         return
3053                 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3054                 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3055                 if mobj is None:
3056                         self._downloader.trouble(u'ERROR: unable to extract performer')
3057                         return
3058                 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3059                 video_title = performer + ' - ' + song_name 
3060
3061                 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3062                 if mobj is None:
3063                         self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3064                         return
3065                 mtvn_uri = mobj.group(1)
3066
3067                 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3068                 if mobj is None:
3069                         self._downloader.trouble(u'ERROR: unable to extract content id')
3070                         return
3071                 content_id = mobj.group(1)
3072
3073                 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3074                 self.report_extraction(video_id)
3075                 request = urllib2.Request(videogen_url)
3076                 try:
3077                         metadataXml = urllib2.urlopen(request).read()
3078                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3079                         self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3080                         return
3081
3082                 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3083                 renditions = mdoc.findall('.//rendition')
3084
3085                 # For now, always pick the highest quality.
3086                 rendition = renditions[-1]
3087
3088                 try:
3089                         _,_,ext = rendition.attrib['type'].partition('/')
3090                         format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3091                         video_url = rendition.find('./src').text
3092                 except KeyError:
3093                         self._downloader.trouble('Invalid rendition field.')
3094                         return
3095
3096                 info = {
3097                         'id': video_id,
3098                         'url': video_url,
3099                         'uploader': performer,
3100                         'title': video_title,
3101                         'ext': ext,
3102                         'format': format,
3103                 }
3104
3105                 return [info]
3106
3107
3108 class YoukuIE(InfoExtractor):
3109
3110         _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3111         IE_NAME = u'Youku'
3112
3113         def __init__(self, downloader=None):
3114                 InfoExtractor.__init__(self, downloader)
3115
3116         def report_download_webpage(self, file_id):
3117                 """Report webpage download."""
3118                 self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)
3119
3120         def report_extraction(self, file_id):
3121                 """Report information extraction."""
3122                 self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)
3123
3124         def _gen_sid(self):
3125                 nowTime = int(time.time() * 1000)
3126                 random1 = random.randint(1000,1998)
3127                 random2 = random.randint(1000,9999)
3128
3129                 return "%d%d%d" %(nowTime,random1,random2)
3130
3131         def _get_file_ID_mix_string(self, seed):
3132                 mixed = []
3133                 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3134                 seed = float(seed)
3135                 for i in range(len(source)):
3136                         seed  =  (seed * 211 + 30031 ) % 65536
3137                         index  =  math.floor(seed / 65536 * len(source) )
3138                         mixed.append(source[int(index)])
3139                         source.remove(source[int(index)])
3140                 #return ''.join(mixed)
3141                 return mixed
3142
3143         def _get_file_id(self, fileId, seed):
3144                 mixed = self._get_file_ID_mix_string(seed)
3145                 ids = fileId.split('*')
3146                 realId = []
3147                 for ch in ids:
3148                         if ch:
3149                                 realId.append(mixed[int(ch)])
3150                 return ''.join(realId)
3151
3152         def _real_extract(self, url):
3153                 mobj = re.match(self._VALID_URL, url)
3154                 if mobj is None:
3155                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3156                         return
3157                 video_id = mobj.group('ID')
3158
3159                 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3160
3161                 request = urllib2.Request(info_url, None, std_headers)
3162                 try:
3163                         self.report_download_webpage(video_id)
3164                         jsondata = urllib2.urlopen(request).read()
3165                 except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
3166                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3167                         return
3168
3169                 self.report_extraction(video_id)
3170                 try:
3171                         config = json.loads(jsondata)
3172
3173                         video_title =  config['data'][0]['title']
3174                         seed = config['data'][0]['seed']
3175
3176                         format = self._downloader.params.get('format', None)
3177                         supported_format = config['data'][0]['streamfileids'].keys()
3178
3179                         if format is None or format == 'best':
3180                                 if 'hd2' in supported_format:
3181                                         format = 'hd2'
3182                                 else:
3183                                         format = 'flv'
3184                                 ext = u'flv'
3185                         elif format == 'worst':
3186                                 format = 'mp4'
3187                                 ext = u'mp4'
3188                         else:
3189                                 format = 'flv'
3190                                 ext = u'flv'
3191
3192
3193                         fileid = config['data'][0]['streamfileids'][format]
3194                         seg_number = len(config['data'][0]['segs'][format])
3195
3196                         keys=[]
3197                         for i in xrange(seg_number):
3198                                 keys.append(config['data'][0]['segs'][format][i]['k'])
3199
3200                         #TODO check error
3201                         #youku only could be viewed from mainland china
3202                 except:
3203                         self._downloader.trouble(u'ERROR: unable to extract info section')
3204                         return
3205
3206                 files_info=[]
3207                 sid = self._gen_sid()
3208                 fileid = self._get_file_id(fileid, seed)
3209
3210                 #column 8,9 of fileid represent the segment number
3211                 #fileid[7:9] should be changed
3212                 for index, key in enumerate(keys):
3213
3214                         temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3215                         download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3216
3217                         info = {
3218                                 'id': '%s_part%02d' % (video_id, index),
3219                                 'url': download_url,
3220                                 'uploader': None,
3221                                 'title': video_title,
3222                                 'ext': ext,
3223                         }
3224                         files_info.append(info)
3225
3226                 return files_info
3227
3228
3229 class XNXXIE(InfoExtractor):
3230         """Information extractor for xnxx.com"""
3231
3232         _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
3233         IE_NAME = u'xnxx'
3234         VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3235         VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3236         VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3237
3238         def report_webpage(self, video_id):
3239                 """Report information extraction"""
3240                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3241
3242         def report_extraction(self, video_id):
3243                 """Report information extraction"""
3244                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3245
3246         def _real_extract(self, url):
3247                 mobj = re.match(self._VALID_URL, url)
3248                 if mobj is None:
3249                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3250                         return
3251                 video_id = mobj.group(1).decode('utf-8')
3252
3253                 self.report_webpage(video_id)
3254
3255                 # Get webpage content
3256                 try:
3257                         webpage = urllib2.urlopen(url).read()
3258                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3259                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3260                         return
3261
3262                 result = re.search(self.VIDEO_URL_RE, webpage)
3263                 if result is None:
3264                         self._downloader.trouble(u'ERROR: unable to extract video url')
3265                         return
3266                 video_url = urllib.unquote(result.group(1).decode('utf-8'))
3267
3268                 result = re.search(self.VIDEO_TITLE_RE, webpage)
3269                 if result is None:
3270                         self._downloader.trouble(u'ERROR: unable to extract video title')
3271                         return
3272                 video_title = result.group(1).decode('utf-8')
3273
3274                 result = re.search(self.VIDEO_THUMB_RE, webpage)
3275                 if result is None:
3276                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3277                         return
3278                 video_thumbnail = result.group(1).decode('utf-8')
3279
3280                 return [{
3281                         'id': video_id,
3282                         'url': video_url,
3283                         'uploader': None,
3284                         'upload_date': None,
3285                         'title': video_title,
3286                         'ext': 'flv',
3287                         'thumbnail': video_thumbnail,
3288                         'description': None,
3289                 }]
3290
3291
3292 class GooglePlusIE(InfoExtractor):
3293         """Information extractor for plus.google.com."""
3294
3295         _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
3296         IE_NAME = u'plus.google'
3297
3298         def __init__(self, downloader=None):
3299                 InfoExtractor.__init__(self, downloader)
3300
3301         def report_extract_entry(self, url):
3302                 """Report downloading extry"""
3303                 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))
3304
3305         def report_date(self, upload_date):
3306                 """Report downloading extry"""
3307                 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3308
3309         def report_uploader(self, uploader):
3310                 """Report downloading extry"""
3311                 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))
3312
3313         def report_title(self, video_title):
3314                 """Report downloading extry"""
3315                 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))
3316
3317         def report_extract_vid_page(self, video_page):
3318                 """Report information extraction."""
3319                 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
3320
3321         def _real_extract(self, url):
3322                 # Extract id from URL
3323                 mobj = re.match(self._VALID_URL, url)
3324                 if mobj is None:
3325                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3326                         return
3327
3328                 post_url = mobj.group(0)
3329                 video_id = mobj.group(2)
3330
3331                 video_extension = 'flv'
3332
3333                 # Step 1, Retrieve post webpage to extract further information
3334                 self.report_extract_entry(post_url)
3335                 request = urllib2.Request(post_url)
3336                 try:
3337                         webpage = urllib2.urlopen(request).read()
3338                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3339                         self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3340                         return
3341
3342                 # Extract update date
3343                 upload_date = u'NA'
3344                 pattern = 'title="Timestamp">(.*?)</a>'
3345                 mobj = re.search(pattern, webpage)
3346                 if mobj:
3347                         upload_date = mobj.group(1)
3348                         # Convert timestring to a format suitable for filename
3349                         upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3350                         upload_date = upload_date.strftime('%Y%m%d')
3351                 self.report_date(upload_date)
3352
3353                 # Extract uploader
3354                 uploader = u'NA'
3355                 pattern = r'rel\="author".*?>(.*?)</a>'
3356                 mobj = re.search(pattern, webpage)
3357                 if mobj:
3358                         uploader = mobj.group(1)
3359                 self.report_uploader(uploader)
3360
3361                 # Extract title
3362                 # Get the first line for title
3363                 video_title = u'NA'
3364                 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3365                 mobj = re.search(pattern, webpage)
3366                 if mobj:
3367                         video_title = mobj.group(1)
3368                 self.report_title(video_title)
3369
3370                 # Step 2, Stimulate clicking the image box to launch video
3371                 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3372                 mobj = re.search(pattern, webpage)
3373                 if mobj is None:
3374                         self._downloader.trouble(u'ERROR: unable to extract video page URL')
3375
3376                 video_page = mobj.group(1)
3377                 request = urllib2.Request(video_page)
3378                 try:
3379                         webpage = urllib2.urlopen(request).read()
3380                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3381                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3382                         return
3383                 self.report_extract_vid_page(video_page)
3384
3385
3386                 # Extract video links on video page
3387                 """Extract video links of all sizes"""
3388                 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3389                 mobj = re.findall(pattern, webpage)
3390                 if len(mobj) == 0:
3391                         self._downloader.trouble(u'ERROR: unable to extract video links')
3392
3393                 # Sort in resolution
3394                 links = sorted(mobj)
3395
3396                 # Choose the lowest of the sort, i.e. highest resolution
3397                 video_url = links[-1]
3398                 # Only get the url. The resolution part in the tuple has no use anymore
3399                 video_url = video_url[-1]
3400                 # Treat escaped \u0026 style hex
3401                 video_url = unicode(video_url, "unicode_escape")
3402
3403
3404                 return [{
3405                         'id':           video_id.decode('utf-8'),
3406                         'url':          video_url,
3407                         'uploader':     uploader.decode('utf-8'),
3408                         'upload_date':  upload_date.decode('utf-8'),
3409                         'title':        video_title.decode('utf-8'),
3410                         'ext':          video_extension.decode('utf-8'),
3411                 }]