Merge pull request #403 from FiloSottile/re_VERBOSE
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import datetime
5 import HTMLParser
6 import httplib
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import urllib
13 import urllib2
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 from urlparse import parse_qs
19
20 try:
21         import cStringIO as StringIO
22 except ImportError:
23         import StringIO
24
25 from utils import *
26
27
class InfoExtractor(object):
        """Base class for site-specific information extractors.

        An information extractor turns a URL into a list of dictionaries
        describing the video(s) it refers to; the FileDownloader then acts
        on that information, typically by downloading the video to the file
        system.  Each dictionary must carry the following fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        ext:            Video filename extension.
        format:         Video format.
        player_url:     SWF Player URL (may be None).

        The following fields are optional; their primary purpose is to let
        youtube-dl serve as the backend for a video search function, such
        as the one in youtube2mp3, and they are only used when their
        respective forced printing functions are called:

        thumbnail:      Full URL to a video thumbnail image.
        description:    One-line video description.

        Subclasses should define a _VALID_URL regexp and override
        _real_initialize() and _real_extract(); they should normally also
        be added to the list of extractors.
        """

        # Flipped to True by initialize() once _real_initialize() has run.
        _ready = False
        # Downloader this extractor reports to; may legitimately stay None.
        _downloader = None

        def __init__(self, downloader=None):
                """Constructor. Receives an optional downloader."""
                self.set_downloader(downloader)
                self._ready = False

        def suitable(self, url):
                """Return True if this extractor can handle the given URL."""
                return re.match(self._VALID_URL, url) is not None

        def initialize(self):
                """Run one-time initialization (authentication, etc) if needed."""
                if self._ready:
                        return
                self._real_initialize()
                self._ready = True

        def extract(self, url):
                """Initialize if necessary, then extract URL info as a list of dicts."""
                self.initialize()
                return self._real_extract(url)

        def set_downloader(self, downloader):
                """Attach the downloader this extractor should report to."""
                self._downloader = downloader

        def _real_initialize(self):
                """Real initialization process. Redefine in subclasses."""
                pass

        def _real_extract(self, url):
                """Real extraction process. Redefine in subclasses."""
                pass
95
96
class YoutubeIE(InfoExtractor):
        """Information extractor for youtube.com."""

        # Verbose regexp; must be matched with re.VERBOSE, which the suitable()
        # override below supplies.  Accepts watch/embed URLs, youtu.be short
        # links and naked video IDs.
        _VALID_URL = r"""^
                         (
                             (?:https?://)?                                       # http(s):// (optional)
                             (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                                tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                             (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                             (?:                                                  # the various things that can precede the ID:
                                 (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                                 |(?:                                             # or the v= param in all its forms
                                     (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                     (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                     (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
                                     v=
                                 )
                             )?                                                   # optional -> youtube.com/xxxx is OK
                         )?                                                       # all until now is optional -> you can pass the naked ID
                         ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                         (?(1).+)?                                                # if we found the ID, everything can follow
                         $"""
        # Forces the English interface so later page scraping sees known markup.
        _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
        _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
        _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
        # Captures the next_url parameter of redirection URLs (age verification).
        _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
        _NETRC_MACHINE = 'youtube'
        # Listed in order of quality
        _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
        _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
        # itag code -> file extension; call sites fall back to 'flv' for unknown codes.
        _video_extensions = {
                '13': '3gp',
                '17': 'mp4',
                '18': 'mp4',
                '22': 'mp4',
                '37': 'mp4',
                '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
                '43': 'webm',
                '44': 'webm',
                '45': 'webm',
                '46': 'webm',
        }
        # itag code -> display size string, used only by _print_formats.
        # NOTE(review): values read as height x width (e.g. '22': '720x1280') — confirm.
        _video_dimensions = {
                '5': '240x400',
                '6': '???',
                '13': '???',
                '17': '144x176',
                '18': '360x640',
                '22': '720x1280',
                '34': '360x640',
                '35': '480x854',
                '37': '1080x1920',
                '38': '3072x4096',
                '43': '360x640',
                '44': '480x854',
                '45': '720x1280',
                '46': '1080x1920',
        }
        IE_NAME = u'youtube'

        def suitable(self, url):
                """Receives a URL and returns True if suitable for this IE."""
                # Overrides the base class so the multi-line _VALID_URL above can
                # be written in re.VERBOSE formatting.
                return re.match(self._VALID_URL, url, re.VERBOSE) is not None

        def report_lang(self):
                """Report attempt to set language."""
                self._downloader.to_screen(u'[youtube] Setting language')

        def report_login(self):
                """Report attempt to log in."""
                self._downloader.to_screen(u'[youtube] Logging in')

        def report_age_confirmation(self):
                """Report attempt to confirm age."""
                self._downloader.to_screen(u'[youtube] Confirming age')

        def report_video_webpage_download(self, video_id):
                """Report attempt to download video webpage."""
                self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

        def report_video_info_webpage_download(self, video_id):
                """Report attempt to download video info webpage."""
                self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

        def report_video_subtitles_download(self, video_id):
                """Report attempt to download video subtitles."""
                self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

        def report_information_extraction(self, video_id):
                """Report attempt to extract video information."""
                self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

        def report_unavailable_format(self, video_id, format):
                """Report that the given format is not available for a video."""
                self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

        def report_rtmp_download(self):
                """Indicate the download will use the RTMP protocol."""
                self._downloader.to_screen(u'[youtube] RTMP download detected')

        def _closed_captions_xml_to_srt(self, xml_string):
                """Convert YouTube's timedtext XML into an SRT-formatted string."""
                srt = ''
                texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
                # TODO parse xml instead of regex
                for n, (start, dur_tag, dur, caption) in enumerate(texts):
                        if not dur: dur = '4'  # entries without a duration get a 4-second default
                        start = float(start)
                        end = start + float(dur)
                        # SRT timestamps use the form HH:MM:SS,mmm
                        start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
                        end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
                        caption = unescapeHTML(caption)
                        caption = unescapeHTML(caption) # double cycle, intentional
                        srt += str(n+1) + '\n'
                        srt += start + ' --> ' + end + '\n'
                        srt += caption + '\n\n'
                return srt

        def _print_formats(self, formats):
                """Print a table of the given itag codes with extension and size."""
                print 'Available formats:'
                for x in formats:
                        print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

        def _real_initialize(self):
                """Best-effort session setup: force English, log in, confirm age.

                Each step warns (errors for age confirmation) and bails out on
                failure instead of raising.
                """
                if self._downloader is None:
                        return

                username = None
                password = None
                downloader_params = self._downloader.params

                # Attempt to use provided username and password or .netrc data
                if downloader_params.get('username', None) is not None:
                        username = downloader_params['username']
                        password = downloader_params['password']
                elif downloader_params.get('usenetrc', False):
                        try:
                                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                                if info is not None:
                                        username = info[0]
                                        password = info[2]
                                else:
                                        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
                        except (IOError, netrc.NetrcParseError), err:
                                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                                return

                # Set language
                request = urllib2.Request(self._LANG_URL)
                try:
                        self.report_lang()
                        urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
                        return

                # No authentication to be performed
                if username is None:
                        return

                # Log in
                login_form = {
                                'current_form': 'loginForm',
                                'next':         '/',
                                'action_login': 'Log In',
                                'username':     username,
                                'password':     password,
                                }
                request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
                try:
                        self.report_login()
                        login_results = urllib2.urlopen(request).read()
                        # If the login form is still present, the credentials were rejected.
                        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                                return
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
                        return

                # Confirm age
                age_form = {
                                'next_url':             '/',
                                'action_confirm':       'Confirm',
                                }
                request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
                try:
                        self.report_age_confirmation()
                        age_results = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
                        return

        def _real_extract(self, url):
                """Return a list of info dictionaries for the video at *url*."""
                # Extract original video URL from URL with redirection, like age verification, using next_url parameter
                mobj = re.search(self._NEXT_URL_RE, url)
                if mobj:
                        url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

                # Extract video id from URL
                mobj = re.match(self._VALID_URL, url, re.VERBOSE)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
                        return
                video_id = mobj.group(2)  # group(1) is the optional URL prefix, group(2) the ID

                # Get video webpage
                self.report_video_webpage_download(video_id)
                request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
                try:
                        video_webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
                        return

                # Attempt to extract SWF player URL
                mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
                if mobj is not None:
                        # Undo the JavaScript escaping (\/ -> /) in the matched URL.
                        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
                else:
                        player_url = None

                # Get video info
                # Try several 'el' parameter variants in turn and stop at the first
                # response that contains a 'token' entry.
                self.report_video_info_webpage_download(video_id)
                for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                        video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                                        % (video_id, el_type))
                        request = urllib2.Request(video_info_url)
                        try:
                                video_info_webpage = urllib2.urlopen(request).read()
                                video_info = parse_qs(video_info_webpage)
                                if 'token' in video_info:
                                        break
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                                return
                if 'token' not in video_info:
                        if 'reason' in video_info:
                                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
                        else:
                                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
                        return

                # Check for "rental" videos
                if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
                        self._downloader.trouble(u'ERROR: "rental" videos not supported')
                        return

                # Start extracting information
                self.report_information_extraction(video_id)

                # uploader
                if 'author' not in video_info:
                        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
                        return
                video_uploader = urllib.unquote_plus(video_info['author'][0])

                # title
                if 'title' not in video_info:
                        self._downloader.trouble(u'ERROR: unable to extract video title')
                        return
                video_title = urllib.unquote_plus(video_info['title'][0])
                video_title = video_title.decode('utf-8')

                # thumbnail image
                if 'thumbnail_url' not in video_info:
                        self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
                        video_thumbnail = ''
                else:   # don't panic if we can't find it
                        video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

                # upload date
                upload_date = u'NA'
                mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
                if mobj is not None:
                        # Normalize date separators to spaces, then try the known formats.
                        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
                        format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
                        for expression in format_expressions:
                                try:
                                        upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                                except:
                                        # NOTE(review): bare except swallows all parse errors; after a
                                        # successful parse the remaining formats fail here harmlessly,
                                        # since the loop has no break.
                                        pass

                # description
                video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
                if video_description: video_description = clean_html(video_description)
                else: video_description = ''

                # closed captions
                # Best effort: any failure is reported via Trouble and leaves
                # video_subtitles as None.
                video_subtitles = None
                if self._downloader.params.get('writesubtitles', False):
                        try:
                                self.report_video_subtitles_download(video_id)
                                request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                                try:
                                        srt_list = urllib2.urlopen(request).read()
                                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                        raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)  # lang_code -> track name
                                if not srt_lang_list:
                                        raise Trouble(u'WARNING: video has no closed captions')
                                # Pick the requested language, else English, else any available one.
                                if self._downloader.params.get('subtitleslang', False):
                                        srt_lang = self._downloader.params.get('subtitleslang')
                                elif 'en' in srt_lang_list:
                                        srt_lang = 'en'
                                else:
                                        srt_lang = srt_lang_list.keys()[0]
                                if not srt_lang in srt_lang_list:
                                        raise Trouble(u'WARNING: no closed captions found in the specified language')
                                request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                                try:
                                        srt_xml = urllib2.urlopen(request).read()
                                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                        raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                                if not srt_xml:
                                        raise Trouble(u'WARNING: unable to download video subtitles')
                                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
                        except Trouble as trouble:
                                # Trouble is presumably defined in utils; its first arg is the
                                # message passed to the downloader — TODO confirm.
                                self._downloader.trouble(trouble[0])

                # token
                # NOTE(review): video_token is never used below — confirm before removing.
                video_token = urllib.unquote_plus(video_info['token'][0])

                # Decide which formats to download
                req_format = self._downloader.params.get('format', None)

                if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
                        self.report_rtmp_download()
                        video_url_list = [(None, video_info['conn'][0])]
                elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
                        url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
                        url_data = [parse_qs(uds) for uds in url_data_strs]
                        url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
                        # NOTE(review): assumes every stream entry carries a 'sig' field; an
                        # entry without one would raise KeyError here — confirm.
                        url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

                        format_limit = self._downloader.params.get('format_limit', None)
                        available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
                        if format_limit is not None and format_limit in available_formats:
                                format_list = available_formats[available_formats.index(format_limit):]
                        else:
                                format_list = available_formats
                        existing_formats = [x for x in format_list if x in url_map]
                        if len(existing_formats) == 0:
                                self._downloader.trouble(u'ERROR: no known formats available for video')
                                return
                        if self._downloader.params.get('listformats', None):
                                self._print_formats(existing_formats)
                                return
                        if req_format is None or req_format == 'best':
                                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
                        elif req_format == 'worst':
                                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
                        elif req_format in ('-1', 'all'):
                                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                        else:
                                # Specific formats. We pick the first in a slash-delimeted sequence.
                                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                                req_formats = req_format.split('/')
                                video_url_list = None
                                for rf in req_formats:
                                        if rf in url_map:
                                                video_url_list = [(rf, url_map[rf])]
                                                break
                                if video_url_list is None:
                                        self._downloader.trouble(u'ERROR: requested format not available')
                                        return
                else:
                        self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
                        return

                # Build one result dictionary per selected format (see the base
                # class docstring for the required fields).
                results = []
                for format_param, video_real_url in video_url_list:
                        # Extension
                        video_extension = self._video_extensions.get(format_param, 'flv')

                        results.append({
                                'id':           video_id.decode('utf-8'),
                                'url':          video_real_url.decode('utf-8'),
                                'uploader':     video_uploader.decode('utf-8'),
                                'upload_date':  upload_date,
                                'title':        video_title,
                                'ext':          video_extension.decode('utf-8'),
                                'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
                                'thumbnail':    video_thumbnail.decode('utf-8'),
                                'description':  video_description,
                                'player_url':   player_url,
                                'subtitles':    video_subtitles
                        })
                return results
485
486
487 class MetacafeIE(InfoExtractor):
488         """Information Extractor for metacafe.com."""
489
490         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
491         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
492         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
493         IE_NAME = u'metacafe'
494
495         def __init__(self, downloader=None):
496                 InfoExtractor.__init__(self, downloader)
497
498         def report_disclaimer(self):
499                 """Report disclaimer retrieval."""
500                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
501
502         def report_age_confirmation(self):
503                 """Report attempt to confirm age."""
504                 self._downloader.to_screen(u'[metacafe] Confirming age')
505
506         def report_download_webpage(self, video_id):
507                 """Report webpage download."""
508                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
509
510         def report_extraction(self, video_id):
511                 """Report information extraction."""
512                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
513
        def _real_initialize(self):
                """Accept metacafe's family filter disclaimer so filtered videos are reachable."""
                # Retrieve disclaimer
                # The fetch presumably establishes session cookies; the response
                # body itself is discarded — TODO confirm.
                request = urllib2.Request(self._DISCLAIMER)
                try:
                        self.report_disclaimer()
                        disclaimer = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
                        return

                # Confirm age
                # POST the "over 18" form to disable the family filter.
                disclaimer_form = {
                        'filters': '0',
                        'submit': "Continue - I'm over 18",
                        }
                request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
                try:
                        self.report_age_confirmation()
                        disclaimer = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
                        return
536
        def _real_extract(self, url):
                """Extract video info from a Metacafe watch page.

                Returns a single-entry list of info dictionaries, or None
                (after signalling self._downloader.trouble) on any failure.
                Metacafe mirrors of YouTube videos ('yt-' prefixed ids) are
                delegated back to the downloader so the YouTube extractor
                handles them.
                """
                # Extract id and simplified title from URL
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
                        return

                video_id = mobj.group(1)

                # Check if video comes from YouTube
                mobj2 = re.match(r'^yt-(.*)$', video_id)
                if mobj2 is not None:
                        # Re-enqueue the canonical YouTube URL and stop here.
                        self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
                        return

                # Retrieve video webpage to extract further information
                request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
                try:
                        self.report_download_webpage(video_id)
                        webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
                        return

                # Extract URL, uploader and title from webpage
                self.report_extraction(video_id)
                # Older pages expose the media URL directly in the page body.
                mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
                if mobj is not None:
                        mediaURL = urllib.unquote(mobj.group(1))
                        # Extension is guessed from the last three characters
                        # of the media URL (e.g. 'flv', 'mp4').
                        video_extension = mediaURL[-3:]

                        # Extract gdaKey if available
                        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                        if mobj is None:
                                video_url = mediaURL
                        else:
                                # The gdaKey is appended as an access token.
                                gdaKey = mobj.group(1)
                                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
                else:
                        # Newer pages keep the media data in the player's
                        # 'flashvars' parameter, an urlencoded query string
                        # whose 'mediaData' entry contains a JSON-like blob.
                        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                        if mobj is None:
                                self._downloader.trouble(u'ERROR: unable to extract media URL')
                                return
                        vardict = parse_qs(mobj.group(1))
                        if 'mediaData' not in vardict:
                                self._downloader.trouble(u'ERROR: unable to extract media URL')
                                return
                        mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                        if mobj is None:
                                self._downloader.trouble(u'ERROR: unable to extract media URL')
                                return
                        # The blob escapes forward slashes; undo that.
                        mediaURL = mobj.group(1).replace('\\/', '/')
                        video_extension = mediaURL[-3:]
                        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

                mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract title')
                        return
                video_title = mobj.group(1).decode('utf-8')

                mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
                        return
                video_uploader = mobj.group(1)

                return [{
                        'id':           video_id.decode('utf-8'),
                        'url':          video_url.decode('utf-8'),
                        'uploader':     video_uploader.decode('utf-8'),
                        'upload_date':  u'NA',
                        'title':        video_title,
                        'ext':          video_extension.decode('utf-8'),
                        'format':       u'NA',
                        'player_url':   None,
                }]
614
615
616 class DailymotionIE(InfoExtractor):
617         """Information Extractor for Dailymotion"""
618
619         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
620         IE_NAME = u'dailymotion'
621
622         def __init__(self, downloader=None):
623                 InfoExtractor.__init__(self, downloader)
624
625         def report_download_webpage(self, video_id):
626                 """Report webpage download."""
627                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
628
629         def report_extraction(self, video_id):
630                 """Report information extraction."""
631                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
632
633         def _real_extract(self, url):
634                 # Extract id and simplified title from URL
635                 mobj = re.match(self._VALID_URL, url)
636                 if mobj is None:
637                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
638                         return
639
640                 video_id = mobj.group(1)
641
642                 video_extension = 'mp4'
643
644                 # Retrieve video webpage to extract further information
645                 request = urllib2.Request(url)
646                 request.add_header('Cookie', 'family_filter=off')
647                 try:
648                         self.report_download_webpage(video_id)
649                         webpage = urllib2.urlopen(request).read()
650                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
651                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
652                         return
653
654                 # Extract URL, uploader and title from webpage
655                 self.report_extraction(video_id)
656                 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
657                 if mobj is None:
658                         self._downloader.trouble(u'ERROR: unable to extract media URL')
659                         return
660                 flashvars = urllib.unquote(mobj.group(1))
661                 if 'hqURL' in flashvars: max_quality = 'hqURL'
662                 elif 'sdURL' in flashvars: max_quality = 'sdURL'
663                 else: max_quality = 'ldURL'
664                 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
665                 if mobj is None:
666                         self._downloader.trouble(u'ERROR: unable to extract media URL')
667                         return
668                 video_url = mobj.group(1).replace('\\/', '/')
669
670                 # TODO: support choosing qualities
671
672                 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
673                 if mobj is None:
674                         self._downloader.trouble(u'ERROR: unable to extract title')
675                         return
676                 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
677
678                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
679                 if mobj is None:
680                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
681                         return
682                 video_uploader = mobj.group(1)
683
684                 return [{
685                         'id':           video_id.decode('utf-8'),
686                         'url':          video_url.decode('utf-8'),
687                         'uploader':     video_uploader.decode('utf-8'),
688                         'upload_date':  u'NA',
689                         'title':        video_title,
690                         'ext':          video_extension.decode('utf-8'),
691                         'format':       u'NA',
692                         'player_url':   None,
693                 }]
694
695
696 class GoogleIE(InfoExtractor):
697         """Information extractor for video.google.com."""
698
699         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
700         IE_NAME = u'video.google'
701
702         def __init__(self, downloader=None):
703                 InfoExtractor.__init__(self, downloader)
704
705         def report_download_webpage(self, video_id):
706                 """Report webpage download."""
707                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
708
709         def report_extraction(self, video_id):
710                 """Report information extraction."""
711                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
712
713         def _real_extract(self, url):
714                 # Extract id from URL
715                 mobj = re.match(self._VALID_URL, url)
716                 if mobj is None:
717                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
718                         return
719
720                 video_id = mobj.group(1)
721
722                 video_extension = 'mp4'
723
724                 # Retrieve video webpage to extract further information
725                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
726                 try:
727                         self.report_download_webpage(video_id)
728                         webpage = urllib2.urlopen(request).read()
729                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
730                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
731                         return
732
733                 # Extract URL, uploader, and title from webpage
734                 self.report_extraction(video_id)
735                 mobj = re.search(r"download_url:'([^']+)'", webpage)
736                 if mobj is None:
737                         video_extension = 'flv'
738                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
739                 if mobj is None:
740                         self._downloader.trouble(u'ERROR: unable to extract media URL')
741                         return
742                 mediaURL = urllib.unquote(mobj.group(1))
743                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
744                 mediaURL = mediaURL.replace('\\x26', '\x26')
745
746                 video_url = mediaURL
747
748                 mobj = re.search(r'<title>(.*)</title>', webpage)
749                 if mobj is None:
750                         self._downloader.trouble(u'ERROR: unable to extract title')
751                         return
752                 video_title = mobj.group(1).decode('utf-8')
753
754                 # Extract video description
755                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
756                 if mobj is None:
757                         self._downloader.trouble(u'ERROR: unable to extract video description')
758                         return
759                 video_description = mobj.group(1).decode('utf-8')
760                 if not video_description:
761                         video_description = 'No description available.'
762
763                 # Extract video thumbnail
764                 if self._downloader.params.get('forcethumbnail', False):
765                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
766                         try:
767                                 webpage = urllib2.urlopen(request).read()
768                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
769                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
770                                 return
771                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
772                         if mobj is None:
773                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
774                                 return
775                         video_thumbnail = mobj.group(1)
776                 else:   # we need something to pass to process_info
777                         video_thumbnail = ''
778
779                 return [{
780                         'id':           video_id.decode('utf-8'),
781                         'url':          video_url.decode('utf-8'),
782                         'uploader':     u'NA',
783                         'upload_date':  u'NA',
784                         'title':        video_title,
785                         'ext':          video_extension.decode('utf-8'),
786                         'format':       u'NA',
787                         'player_url':   None,
788                 }]
789
790
791 class PhotobucketIE(InfoExtractor):
792         """Information extractor for photobucket.com."""
793
794         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
795         IE_NAME = u'photobucket'
796
797         def __init__(self, downloader=None):
798                 InfoExtractor.__init__(self, downloader)
799
800         def report_download_webpage(self, video_id):
801                 """Report webpage download."""
802                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
803
804         def report_extraction(self, video_id):
805                 """Report information extraction."""
806                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
807
808         def _real_extract(self, url):
809                 # Extract id from URL
810                 mobj = re.match(self._VALID_URL, url)
811                 if mobj is None:
812                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
813                         return
814
815                 video_id = mobj.group(1)
816
817                 video_extension = 'flv'
818
819                 # Retrieve video webpage to extract further information
820                 request = urllib2.Request(url)
821                 try:
822                         self.report_download_webpage(video_id)
823                         webpage = urllib2.urlopen(request).read()
824                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
825                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
826                         return
827
828                 # Extract URL, uploader, and title from webpage
829                 self.report_extraction(video_id)
830                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
831                 if mobj is None:
832                         self._downloader.trouble(u'ERROR: unable to extract media URL')
833                         return
834                 mediaURL = urllib.unquote(mobj.group(1))
835
836                 video_url = mediaURL
837
838                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
839                 if mobj is None:
840                         self._downloader.trouble(u'ERROR: unable to extract title')
841                         return
842                 video_title = mobj.group(1).decode('utf-8')
843
844                 video_uploader = mobj.group(2).decode('utf-8')
845
846                 return [{
847                         'id':           video_id.decode('utf-8'),
848                         'url':          video_url.decode('utf-8'),
849                         'uploader':     video_uploader,
850                         'upload_date':  u'NA',
851                         'title':        video_title,
852                         'ext':          video_extension.decode('utf-8'),
853                         'format':       u'NA',
854                         'player_url':   None,
855                 }]
856
857
858 class YahooIE(InfoExtractor):
859         """Information extractor for video.yahoo.com."""
860
861         # _VALID_URL matches all Yahoo! Video URLs
862         # _VPAGE_URL matches only the extractable '/watch/' URLs
863         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
864         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
865         IE_NAME = u'video.yahoo'
866
867         def __init__(self, downloader=None):
868                 InfoExtractor.__init__(self, downloader)
869
870         def report_download_webpage(self, video_id):
871                 """Report webpage download."""
872                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
873
874         def report_extraction(self, video_id):
875                 """Report information extraction."""
876                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
877
878         def _real_extract(self, url, new_video=True):
879                 # Extract ID from URL
880                 mobj = re.match(self._VALID_URL, url)
881                 if mobj is None:
882                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
883                         return
884
885                 video_id = mobj.group(2)
886                 video_extension = 'flv'
887
888                 # Rewrite valid but non-extractable URLs as
889                 # extractable English language /watch/ URLs
890                 if re.match(self._VPAGE_URL, url) is None:
891                         request = urllib2.Request(url)
892                         try:
893                                 webpage = urllib2.urlopen(request).read()
894                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
895                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
896                                 return
897
898                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
899                         if mobj is None:
900                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
901                                 return
902                         yahoo_id = mobj.group(1)
903
904                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
905                         if mobj is None:
906                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
907                                 return
908                         yahoo_vid = mobj.group(1)
909
910                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
911                         return self._real_extract(url, new_video=False)
912
913                 # Retrieve video webpage to extract further information
914                 request = urllib2.Request(url)
915                 try:
916                         self.report_download_webpage(video_id)
917                         webpage = urllib2.urlopen(request).read()
918                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
919                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
920                         return
921
922                 # Extract uploader and title from webpage
923                 self.report_extraction(video_id)
924                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
925                 if mobj is None:
926                         self._downloader.trouble(u'ERROR: unable to extract video title')
927                         return
928                 video_title = mobj.group(1).decode('utf-8')
929
930                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
931                 if mobj is None:
932                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
933                         return
934                 video_uploader = mobj.group(1).decode('utf-8')
935
936                 # Extract video thumbnail
937                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
938                 if mobj is None:
939                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
940                         return
941                 video_thumbnail = mobj.group(1).decode('utf-8')
942
943                 # Extract video description
944                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
945                 if mobj is None:
946                         self._downloader.trouble(u'ERROR: unable to extract video description')
947                         return
948                 video_description = mobj.group(1).decode('utf-8')
949                 if not video_description:
950                         video_description = 'No description available.'
951
952                 # Extract video height and width
953                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
954                 if mobj is None:
955                         self._downloader.trouble(u'ERROR: unable to extract video height')
956                         return
957                 yv_video_height = mobj.group(1)
958
959                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
960                 if mobj is None:
961                         self._downloader.trouble(u'ERROR: unable to extract video width')
962                         return
963                 yv_video_width = mobj.group(1)
964
965                 # Retrieve video playlist to extract media URL
966                 # I'm not completely sure what all these options are, but we
967                 # seem to need most of them, otherwise the server sends a 401.
968                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
969                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
970                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
971                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
972                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
973                 try:
974                         self.report_download_webpage(video_id)
975                         webpage = urllib2.urlopen(request).read()
976                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
977                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
978                         return
979
980                 # Extract media URL from playlist XML
981                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
982                 if mobj is None:
983                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
984                         return
985                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
986                 video_url = unescapeHTML(video_url)
987
988                 return [{
989                         'id':           video_id.decode('utf-8'),
990                         'url':          video_url,
991                         'uploader':     video_uploader,
992                         'upload_date':  u'NA',
993                         'title':        video_title,
994                         'ext':          video_extension.decode('utf-8'),
995                         'thumbnail':    video_thumbnail.decode('utf-8'),
996                         'description':  video_description,
997                         'thumbnail':    video_thumbnail,
998                         'player_url':   None,
999                 }]
1000
1001
1002 class VimeoIE(InfoExtractor):
1003         """Information extractor for vimeo.com."""
1004
1005         # _VALID_URL matches Vimeo URLs
1006         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1007         IE_NAME = u'vimeo'
1008
1009         def __init__(self, downloader=None):
1010                 InfoExtractor.__init__(self, downloader)
1011
1012         def report_download_webpage(self, video_id):
1013                 """Report webpage download."""
1014                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1015
1016         def report_extraction(self, video_id):
1017                 """Report information extraction."""
1018                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1019
1020         def _real_extract(self, url, new_video=True):
1021                 # Extract ID from URL
1022                 mobj = re.match(self._VALID_URL, url)
1023                 if mobj is None:
1024                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1025                         return
1026
1027                 video_id = mobj.group(1)
1028
1029                 # Retrieve video webpage to extract further information
1030                 request = urllib2.Request(url, None, std_headers)
1031                 try:
1032                         self.report_download_webpage(video_id)
1033                         webpage = urllib2.urlopen(request).read()
1034                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1035                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1036                         return
1037
1038                 # Now we begin extracting as much information as we can from what we
1039                 # retrieved. First we extract the information common to all extractors,
1040                 # and latter we extract those that are Vimeo specific.
1041                 self.report_extraction(video_id)
1042
1043                 # Extract the config JSON
1044                 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1045                 try:
1046                         config = json.loads(config)
1047                 except:
1048                         self._downloader.trouble(u'ERROR: unable to extract info section')
1049                         return
1050                 
1051                 # Extract title
1052                 video_title = config["video"]["title"]
1053
1054                 # Extract uploader
1055                 video_uploader = config["video"]["owner"]["name"]
1056
1057                 # Extract video thumbnail
1058                 video_thumbnail = config["video"]["thumbnail"]
1059
1060                 # Extract video description
1061                 video_description = get_element_by_id("description", webpage.decode('utf8'))
1062                 if video_description: video_description = clean_html(video_description)
1063                 else: video_description = ''
1064
1065                 # Extract upload date
1066                 video_upload_date = u'NA'
1067                 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1068                 if mobj is not None:
1069                         video_upload_date = mobj.group(1)
1070
1071                 # Vimeo specific: extract request signature and timestamp
1072                 sig = config['request']['signature']
1073                 timestamp = config['request']['timestamp']
1074
1075                 # Vimeo specific: extract video codec and quality information
1076                 # TODO bind to format param
1077                 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1078                 for codec in codecs:
1079                         if codec[0] in config["video"]["files"]:
1080                                 video_codec = codec[0]
1081                                 video_extension = codec[1]
1082                                 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1083                                 else: quality = 'sd'
1084                                 break
1085                 else:
1086                         self._downloader.trouble(u'ERROR: no known codec found')
1087                         return
1088
1089                 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1090                                         %(video_id, sig, timestamp, quality, video_codec.upper())
1091
1092                 return [{
1093                         'id':           video_id,
1094                         'url':          video_url,
1095                         'uploader':     video_uploader,
1096                         'upload_date':  video_upload_date,
1097                         'title':        video_title,
1098                         'ext':          video_extension,
1099                         'thumbnail':    video_thumbnail,
1100                         'description':  video_description,
1101                         'player_url':   None,
1102                 }]
1103
1104
1105 class GenericIE(InfoExtractor):
1106         """Generic last-resort information extractor."""
1107
1108         _VALID_URL = r'.*'
1109         IE_NAME = u'generic'
1110
        def __init__(self, downloader=None):
                # Nothing generic-specific to set up; delegate to the base class.
                InfoExtractor.__init__(self, downloader)
1113
        def report_download_webpage(self, video_id):
                """Report webpage download (with a warning: this is the last-resort extractor)."""
                self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
                self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1118
        def report_extraction(self, video_id):
                """Report information extraction."""
                self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1122
        def report_following_redirect(self, new_url):
                """Report that a redirect to new_url is being followed."""
                self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1126                 
1127         def _test_redirect(self, url):
1128                 """Check if it is a redirect, like url shorteners, in case restart chain."""
1129                 class HeadRequest(urllib2.Request):
1130                         def get_method(self):
1131                                 return "HEAD"
1132
1133                 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1134                         """
1135                         Subclass the HTTPRedirectHandler to make it use our 
1136                         HeadRequest also on the redirected URL
1137                         """
1138                         def redirect_request(self, req, fp, code, msg, headers, newurl): 
1139                                 if code in (301, 302, 303, 307):
1140                                         newurl = newurl.replace(' ', '%20') 
1141                                         newheaders = dict((k,v) for k,v in req.headers.items()
1142                                                                           if k.lower() not in ("content-length", "content-type"))
1143                                         return HeadRequest(newurl, 
1144                                                                            headers=newheaders,
1145                                                                            origin_req_host=req.get_origin_req_host(), 
1146                                                                            unverifiable=True) 
1147                                 else: 
1148                                         raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp) 
1149
1150                 class HTTPMethodFallback(urllib2.BaseHandler):
1151                         """
1152                         Fallback to GET if HEAD is not allowed (405 HTTP error)
1153                         """
1154                         def http_error_405(self, req, fp, code, msg, headers): 
1155                                 fp.read()
1156                                 fp.close()
1157
1158                                 newheaders = dict((k,v) for k,v in req.headers.items()
1159                                                                   if k.lower() not in ("content-length", "content-type"))
1160                                 return self.parent.open(urllib2.Request(req.get_full_url(), 
1161                                                                                                  headers=newheaders, 
1162                                                                                                  origin_req_host=req.get_origin_req_host(), 
1163                                                                                                  unverifiable=True))
1164
		# Build our opener: a bare OpenerDirector (no cookie/proxy/auth
		# machinery) with only the handlers needed to issue a HEAD request,
		# follow redirects, and fall back to GET on a 405 response.
		opener = urllib2.OpenerDirector()
		for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
						HTTPMethodFallback, HEADRedirectHandler,
						urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
			opener.add_handler(handler())

		# Issue the HEAD request; geturl() reflects any redirects followed.
		response = opener.open(HeadRequest(url))
		new_url = response.geturl()

		# No redirect occurred - let the normal extraction path handle url.
		if url == new_url: return False

		# Hand the redirect target back to the downloader for a fresh pass.
		self.report_following_redirect(new_url)
		self._downloader.download([new_url])
		return True
1180
1181         def _real_extract(self, url):
1182                 if self._test_redirect(url): return
1183
1184                 video_id = url.split('/')[-1]
1185                 request = urllib2.Request(url)
1186                 try:
1187                         self.report_download_webpage(video_id)
1188                         webpage = urllib2.urlopen(request).read()
1189                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1190                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1191                         return
1192                 except ValueError, err:
1193                         # since this is the last-resort InfoExtractor, if
1194                         # this error is thrown, it'll be thrown here
1195                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1196                         return
1197
1198                 self.report_extraction(video_id)
1199                 # Start with something easy: JW Player in SWFObject
1200                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1201                 if mobj is None:
1202                         # Broaden the search a little bit
1203                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1204                 if mobj is None:
1205                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1206                         return
1207
1208                 # It's possible that one of the regexes
1209                 # matched, but returned an empty group:
1210                 if mobj.group(1) is None:
1211                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1212                         return
1213
1214                 video_url = urllib.unquote(mobj.group(1))
1215                 video_id = os.path.basename(video_url)
1216
1217                 # here's a fun little line of code for you:
1218                 video_extension = os.path.splitext(video_id)[1][1:]
1219                 video_id = os.path.splitext(video_id)[0]
1220
1221                 # it's tempting to parse this further, but you would
1222                 # have to take into account all the variations like
1223                 #   Video Title - Site Name
1224                 #   Site Name | Video Title
1225                 #   Video Title - Tagline | Site Name
1226                 # and so on and so forth; it's just not practical
1227                 mobj = re.search(r'<title>(.*)</title>', webpage)
1228                 if mobj is None:
1229                         self._downloader.trouble(u'ERROR: unable to extract title')
1230                         return
1231                 video_title = mobj.group(1).decode('utf-8')
1232
1233                 # video uploader is domain name
1234                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1235                 if mobj is None:
1236                         self._downloader.trouble(u'ERROR: unable to extract title')
1237                         return
1238                 video_uploader = mobj.group(1).decode('utf-8')
1239
1240                 return [{
1241                         'id':           video_id.decode('utf-8'),
1242                         'url':          video_url.decode('utf-8'),
1243                         'uploader':     video_uploader,
1244                         'upload_date':  u'NA',
1245                         'title':        video_title,
1246                         'ext':          video_extension.decode('utf-8'),
1247                         'format':       u'NA',
1248                         'player_url':   None,
1249                 }]
1250
1251
1252 class YoutubeSearchIE(InfoExtractor):
1253         """Information Extractor for YouTube search queries."""
1254         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1255         _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1256         _max_youtube_results = 1000
1257         IE_NAME = u'youtube:search'
1258
1259         def __init__(self, downloader=None):
1260                 InfoExtractor.__init__(self, downloader)
1261
1262         def report_download_page(self, query, pagenum):
1263                 """Report attempt to download search page with given number."""
1264                 query = query.decode(preferredencoding())
1265                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1266
1267         def _real_extract(self, query):
1268                 mobj = re.match(self._VALID_URL, query)
1269                 if mobj is None:
1270                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1271                         return
1272
1273                 prefix, query = query.split(':')
1274                 prefix = prefix[8:]
1275                 query = query.encode('utf-8')
1276                 if prefix == '':
1277                         self._download_n_results(query, 1)
1278                         return
1279                 elif prefix == 'all':
1280                         self._download_n_results(query, self._max_youtube_results)
1281                         return
1282                 else:
1283                         try:
1284                                 n = long(prefix)
1285                                 if n <= 0:
1286                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1287                                         return
1288                                 elif n > self._max_youtube_results:
1289                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1290                                         n = self._max_youtube_results
1291                                 self._download_n_results(query, n)
1292                                 return
1293                         except ValueError: # parsing prefix as integer fails
1294                                 self._download_n_results(query, 1)
1295                                 return
1296
1297         def _download_n_results(self, query, n):
1298                 """Downloads a specified number of results for a query"""
1299
1300                 video_ids = []
1301                 pagenum = 0
1302                 limit = n
1303
1304                 while (50 * pagenum) < limit:
1305                         self.report_download_page(query, pagenum+1)
1306                         result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1307                         request = urllib2.Request(result_url)
1308                         try:
1309                                 data = urllib2.urlopen(request).read()
1310                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1311                                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1312                                 return
1313                         api_response = json.loads(data)['data']
1314
1315                         new_ids = list(video['id'] for video in api_response['items'])
1316                         video_ids += new_ids
1317
1318                         limit = min(n, api_response['totalItems'])
1319                         pagenum += 1
1320
1321                 if len(video_ids) > n:
1322                         video_ids = video_ids[:n]
1323                 for id in video_ids:
1324                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1325                 return
1326
1327
1328 class GoogleSearchIE(InfoExtractor):
1329         """Information Extractor for Google Video search queries."""
1330         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1331         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1332         _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1333         _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1334         _max_google_results = 1000
1335         IE_NAME = u'video.google:search'
1336
1337         def __init__(self, downloader=None):
1338                 InfoExtractor.__init__(self, downloader)
1339
1340         def report_download_page(self, query, pagenum):
1341                 """Report attempt to download playlist page with given number."""
1342                 query = query.decode(preferredencoding())
1343                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1344
1345         def _real_extract(self, query):
1346                 mobj = re.match(self._VALID_URL, query)
1347                 if mobj is None:
1348                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1349                         return
1350
1351                 prefix, query = query.split(':')
1352                 prefix = prefix[8:]
1353                 query = query.encode('utf-8')
1354                 if prefix == '':
1355                         self._download_n_results(query, 1)
1356                         return
1357                 elif prefix == 'all':
1358                         self._download_n_results(query, self._max_google_results)
1359                         return
1360                 else:
1361                         try:
1362                                 n = long(prefix)
1363                                 if n <= 0:
1364                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1365                                         return
1366                                 elif n > self._max_google_results:
1367                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1368                                         n = self._max_google_results
1369                                 self._download_n_results(query, n)
1370                                 return
1371                         except ValueError: # parsing prefix as integer fails
1372                                 self._download_n_results(query, 1)
1373                                 return
1374
1375         def _download_n_results(self, query, n):
1376                 """Downloads a specified number of results for a query"""
1377
1378                 video_ids = []
1379                 pagenum = 0
1380
1381                 while True:
1382                         self.report_download_page(query, pagenum)
1383                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1384                         request = urllib2.Request(result_url)
1385                         try:
1386                                 page = urllib2.urlopen(request).read()
1387                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1388                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1389                                 return
1390
1391                         # Extract video identifiers
1392                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1393                                 video_id = mobj.group(1)
1394                                 if video_id not in video_ids:
1395                                         video_ids.append(video_id)
1396                                         if len(video_ids) == n:
1397                                                 # Specified n videos reached
1398                                                 for id in video_ids:
1399                                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1400                                                 return
1401
1402                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1403                                 for id in video_ids:
1404                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1405                                 return
1406
1407                         pagenum = pagenum + 1
1408
1409
1410 class YahooSearchIE(InfoExtractor):
1411         """Information Extractor for Yahoo! Video search queries."""
1412         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1413         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1414         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1415         _MORE_PAGES_INDICATOR = r'\s*Next'
1416         _max_yahoo_results = 1000
1417         IE_NAME = u'video.yahoo:search'
1418
1419         def __init__(self, downloader=None):
1420                 InfoExtractor.__init__(self, downloader)
1421
1422         def report_download_page(self, query, pagenum):
1423                 """Report attempt to download playlist page with given number."""
1424                 query = query.decode(preferredencoding())
1425                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1426
1427         def _real_extract(self, query):
1428                 mobj = re.match(self._VALID_URL, query)
1429                 if mobj is None:
1430                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1431                         return
1432
1433                 prefix, query = query.split(':')
1434                 prefix = prefix[8:]
1435                 query = query.encode('utf-8')
1436                 if prefix == '':
1437                         self._download_n_results(query, 1)
1438                         return
1439                 elif prefix == 'all':
1440                         self._download_n_results(query, self._max_yahoo_results)
1441                         return
1442                 else:
1443                         try:
1444                                 n = long(prefix)
1445                                 if n <= 0:
1446                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1447                                         return
1448                                 elif n > self._max_yahoo_results:
1449                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1450                                         n = self._max_yahoo_results
1451                                 self._download_n_results(query, n)
1452                                 return
1453                         except ValueError: # parsing prefix as integer fails
1454                                 self._download_n_results(query, 1)
1455                                 return
1456
1457         def _download_n_results(self, query, n):
1458                 """Downloads a specified number of results for a query"""
1459
1460                 video_ids = []
1461                 already_seen = set()
1462                 pagenum = 1
1463
1464                 while True:
1465                         self.report_download_page(query, pagenum)
1466                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1467                         request = urllib2.Request(result_url)
1468                         try:
1469                                 page = urllib2.urlopen(request).read()
1470                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1471                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1472                                 return
1473
1474                         # Extract video identifiers
1475                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1476                                 video_id = mobj.group(1)
1477                                 if video_id not in already_seen:
1478                                         video_ids.append(video_id)
1479                                         already_seen.add(video_id)
1480                                         if len(video_ids) == n:
1481                                                 # Specified n videos reached
1482                                                 for id in video_ids:
1483                                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1484                                                 return
1485
1486                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1487                                 for id in video_ids:
1488                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1489                                 return
1490
1491                         pagenum = pagenum + 1
1492
1493
class YoutubePlaylistIE(InfoExtractor):
	"""Information Extractor for YouTube playlists."""

	# Group 1: playlist type prefix (p/a/list), group 2: playlist id,
	# group 3: single video id for .../user/.../<id> style URLs.
	_VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
	_TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
	_VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;list=.*?%s'
	# Present only while a "next page" pager button exists.
	_MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
	IE_NAME = u'youtube:playlist'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_page(self, playlist_id, pagenum):
		"""Report attempt to download playlist page with given number."""
		self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

	def _real_extract(self, url):
		"""Collect all video ids of a playlist and queue them for download."""
		# Extract playlist id
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
			return

		# Single video case
		# NOTE(review): this hands the bare video id (not a full URL) to the
		# downloader - presumably the YouTube IE accepts plain ids; confirm.
		if mobj.group(3) is not None:
			self._downloader.download([mobj.group(3)])
			return

		# Download playlist pages
		# prefix is 'p' as default for playlists but there are other types that need extra care
		playlist_prefix = mobj.group(1)
		if playlist_prefix == 'a':
			playlist_access = 'artist'
		else:
			playlist_prefix = 'p'
			playlist_access = 'view_play_list'
		playlist_id = mobj.group(2)
		video_ids = []
		pagenum = 1

		# Page through the playlist until the pager stops offering a next page.
		while True:
			self.report_download_page(playlist_id, pagenum)
			url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
			request = urllib2.Request(url)
			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers
			ids_in_page = []
			for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(mobj.group(1))
			video_ids.extend(ids_in_page)

			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				break
			pagenum = pagenum + 1

		# playliststart is 1-based on the command line; convert to 0-based.
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)
		if playlistend == -1:
			video_ids = video_ids[playliststart:]
		else:
			video_ids = video_ids[playliststart:playlistend]

		for id in video_ids:
			self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
		return
1565
1566
class YoutubeUserIE(InfoExtractor):
	"""Information Extractor for YouTube users."""

	_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
	_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
	# The GData API caps each response, so uploads are fetched page by page.
	_GDATA_PAGE_SIZE = 50
	_GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
	_VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
	IE_NAME = u'youtube:user'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_page(self, username, start_index):
		"""Report attempt to download user page."""
		self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
				(username, start_index, start_index + self._GDATA_PAGE_SIZE))

	def _real_extract(self, url):
		"""Collect all upload ids of a user and queue them for download."""
		# Extract username
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
			return

		username = mobj.group(1)

		# Download video ids using YouTube Data API. Result size per
		# query is limited (currently to 50 videos) so we need to query
		# page by page until there are no video ids - it means we got
		# all of them.

		video_ids = []
		pagenum = 0

		while True:
			# GData start-index is 1-based.
			start_index = pagenum * self._GDATA_PAGE_SIZE + 1
			self.report_download_page(username, start_index)

			request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers
			ids_in_page = []

			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(mobj.group(1))

			video_ids.extend(ids_in_page)

			# A little optimization - if current page is not
			# "full", ie. does not contain PAGE_SIZE video ids then
			# we can assume that this page is the last one - there
			# are no more ids on further pages - no need to query
			# again.

			if len(ids_in_page) < self._GDATA_PAGE_SIZE:
				break

			pagenum += 1

		all_ids_count = len(video_ids)
		# playliststart is 1-based on the command line; convert to 0-based.
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)

		if playlistend == -1:
			video_ids = video_ids[playliststart:]
		else:
			video_ids = video_ids[playliststart:playlistend]

		self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
				(username, all_ids_count, len(video_ids)))

		for video_id in video_ids:
			self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1648
1649
1650 class BlipTVUserIE(InfoExtractor):
1651         """Information Extractor for blip.tv users."""
1652
1653         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1654         _PAGE_SIZE = 12
1655         IE_NAME = u'blip.tv:user'
1656
1657         def __init__(self, downloader=None):
1658                 InfoExtractor.__init__(self, downloader)
1659
1660         def report_download_page(self, username, pagenum):
1661                 """Report attempt to download user page."""
1662                 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1663                                 (self.IE_NAME, username, pagenum))
1664
1665         def _real_extract(self, url):
1666                 # Extract username
1667                 mobj = re.match(self._VALID_URL, url)
1668                 if mobj is None:
1669                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1670                         return
1671
1672                 username = mobj.group(1)
1673
1674                 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1675
1676                 request = urllib2.Request(url)
1677
1678                 try:
1679                         page = urllib2.urlopen(request).read().decode('utf-8')
1680                         mobj = re.search(r'data-users-id="([^"]+)"', page)
1681                         page_base = page_base % mobj.group(1)
1682                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1683                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1684                         return
1685
1686
1687                 # Download video ids using BlipTV Ajax calls. Result size per
1688                 # query is limited (currently to 12 videos) so we need to query
1689                 # page by page until there are no video ids - it means we got
1690                 # all of them.
1691
1692                 video_ids = []
1693                 pagenum = 1
1694
1695                 while True:
1696                         self.report_download_page(username, pagenum)
1697
1698                         request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1699
1700                         try:
1701                                 page = urllib2.urlopen(request).read().decode('utf-8')
1702                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1703                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1704                                 return
1705
1706                         # Extract video identifiers
1707                         ids_in_page = []
1708
1709                         for mobj in re.finditer(r'href="/([^"]+)"', page):
1710                                 if mobj.group(1) not in ids_in_page:
1711                                         ids_in_page.append(unescapeHTML(mobj.group(1)))
1712
1713                         video_ids.extend(ids_in_page)
1714
1715                         # A little optimization - if current page is not
1716                         # "full", ie. does not contain PAGE_SIZE video ids then
1717                         # we can assume that this page is the last one - there
1718                         # are no more ids on further pages - no need to query
1719                         # again.
1720
1721                         if len(ids_in_page) < self._PAGE_SIZE:
1722                                 break
1723
1724                         pagenum += 1
1725
1726                 all_ids_count = len(video_ids)
1727                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1728                 playlistend = self._downloader.params.get('playlistend', -1)
1729
1730                 if playlistend == -1:
1731                         video_ids = video_ids[playliststart:]
1732                 else:
1733                         video_ids = video_ids[playliststart:playlistend]
1734
1735                 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1736                                 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1737
1738                 for video_id in video_ids:
1739                         self._downloader.download([u'http://blip.tv/'+video_id])
1740
1741
1742 class DepositFilesIE(InfoExtractor):
1743         """Information extractor for depositfiles.com"""
1744
1745         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1746         IE_NAME = u'DepositFiles'
1747
1748         def __init__(self, downloader=None):
1749                 InfoExtractor.__init__(self, downloader)
1750
1751         def report_download_webpage(self, file_id):
1752                 """Report webpage download."""
1753                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1754
1755         def report_extraction(self, file_id):
1756                 """Report information extraction."""
1757                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1758
1759         def _real_extract(self, url):
1760                 file_id = url.split('/')[-1]
1761                 # Rebuild url in english locale
1762                 url = 'http://depositfiles.com/en/files/' + file_id
1763
1764                 # Retrieve file webpage with 'Free download' button pressed
1765                 free_download_indication = { 'gateway_result' : '1' }
1766                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1767                 try:
1768                         self.report_download_webpage(file_id)
1769                         webpage = urllib2.urlopen(request).read()
1770                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1771                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1772                         return
1773
1774                 # Search for the real file URL
1775                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1776                 if (mobj is None) or (mobj.group(1) is None):
1777                         # Try to figure out reason of the error.
1778                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1779                         if (mobj is not None) and (mobj.group(1) is not None):
1780                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1781                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1782                         else:
1783                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1784                         return
1785
1786                 file_url = mobj.group(1)
1787                 file_extension = os.path.splitext(file_url)[1][1:]
1788
1789                 # Search for file title
1790                 mobj = re.search(r'<b title="(.*?)">', webpage)
1791                 if mobj is None:
1792                         self._downloader.trouble(u'ERROR: unable to extract title')
1793                         return
1794                 file_title = mobj.group(1).decode('utf-8')
1795
1796                 return [{
1797                         'id':           file_id.decode('utf-8'),
1798                         'url':          file_url.decode('utf-8'),
1799                         'uploader':     u'NA',
1800                         'upload_date':  u'NA',
1801                         'title':        file_title,
1802                         'ext':          file_extension.decode('utf-8'),
1803                         'format':       u'NA',
1804                         'player_url':   None,
1805                 }]
1806
1807
1808 class FacebookIE(InfoExtractor):
1809         """Information Extractor for Facebook"""
1810
1811         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1812         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1813         _NETRC_MACHINE = 'facebook'
1814         _available_formats = ['video', 'highqual', 'lowqual']
1815         _video_extensions = {
1816                 'video': 'mp4',
1817                 'highqual': 'mp4',
1818                 'lowqual': 'mp4',
1819         }
1820         IE_NAME = u'facebook'
1821
1822         def __init__(self, downloader=None):
1823                 InfoExtractor.__init__(self, downloader)
1824
1825         def _reporter(self, message):
1826                 """Add header and report message."""
1827                 self._downloader.to_screen(u'[facebook] %s' % message)
1828
1829         def report_login(self):
1830                 """Report attempt to log in."""
1831                 self._reporter(u'Logging in')
1832
1833         def report_video_webpage_download(self, video_id):
1834                 """Report attempt to download video webpage."""
1835                 self._reporter(u'%s: Downloading video webpage' % video_id)
1836
1837         def report_information_extraction(self, video_id):
1838                 """Report attempt to extract video information."""
1839                 self._reporter(u'%s: Extracting video information' % video_id)
1840
1841         def _parse_page(self, video_webpage):
1842                 """Extract video information from page"""
1843                 # General data
1844                 data = {'title': r'\("video_title", "(.*?)"\)',
1845                         'description': r'<div class="datawrap">(.*?)</div>',
1846                         'owner': r'\("video_owner_name", "(.*?)"\)',
1847                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1848                         }
1849                 video_info = {}
1850                 for piece in data.keys():
1851                         mobj = re.search(data[piece], video_webpage)
1852                         if mobj is not None:
1853                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1854
1855                 # Video urls
1856                 video_urls = {}
1857                 for fmt in self._available_formats:
1858                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1859                         if mobj is not None:
1860                                 # URL is in a Javascript segment inside an escaped Unicode format within
1861                                 # the generally utf-8 page
1862                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1863                 video_info['video_urls'] = video_urls
1864
1865                 return video_info
1866
1867         def _real_initialize(self):
1868                 if self._downloader is None:
1869                         return
1870
1871                 useremail = None
1872                 password = None
1873                 downloader_params = self._downloader.params
1874
1875                 # Attempt to use provided username and password or .netrc data
1876                 if downloader_params.get('username', None) is not None:
1877                         useremail = downloader_params['username']
1878                         password = downloader_params['password']
1879                 elif downloader_params.get('usenetrc', False):
1880                         try:
1881                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1882                                 if info is not None:
1883                                         useremail = info[0]
1884                                         password = info[2]
1885                                 else:
1886                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1887                         except (IOError, netrc.NetrcParseError), err:
1888                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1889                                 return
1890
1891                 if useremail is None:
1892                         return
1893
1894                 # Log in
1895                 login_form = {
1896                         'email': useremail,
1897                         'pass': password,
1898                         'login': 'Log+In'
1899                         }
1900                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1901                 try:
1902                         self.report_login()
1903                         login_results = urllib2.urlopen(request).read()
1904                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1905                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1906                                 return
1907                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1908                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1909                         return
1910
1911         def _real_extract(self, url):
1912                 mobj = re.match(self._VALID_URL, url)
1913                 if mobj is None:
1914                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1915                         return
1916                 video_id = mobj.group('ID')
1917
1918                 # Get video webpage
1919                 self.report_video_webpage_download(video_id)
1920                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
1921                 try:
1922                         page = urllib2.urlopen(request)
1923                         video_webpage = page.read()
1924                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1925                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1926                         return
1927
1928                 # Start extracting information
1929                 self.report_information_extraction(video_id)
1930
1931                 # Extract information
1932                 video_info = self._parse_page(video_webpage)
1933
1934                 # uploader
1935                 if 'owner' not in video_info:
1936                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1937                         return
1938                 video_uploader = video_info['owner']
1939
1940                 # title
1941                 if 'title' not in video_info:
1942                         self._downloader.trouble(u'ERROR: unable to extract video title')
1943                         return
1944                 video_title = video_info['title']
1945                 video_title = video_title.decode('utf-8')
1946
1947                 # thumbnail image
1948                 if 'thumbnail' not in video_info:
1949                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1950                         video_thumbnail = ''
1951                 else:
1952                         video_thumbnail = video_info['thumbnail']
1953
1954                 # upload date
1955                 upload_date = u'NA'
1956                 if 'upload_date' in video_info:
1957                         upload_time = video_info['upload_date']
1958                         timetuple = email.utils.parsedate_tz(upload_time)
1959                         if timetuple is not None:
1960                                 try:
1961                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
1962                                 except:
1963                                         pass
1964
1965                 # description
1966                 video_description = video_info.get('description', 'No description available.')
1967
1968                 url_map = video_info['video_urls']
1969                 if len(url_map.keys()) > 0:
1970                         # Decide which formats to download
1971                         req_format = self._downloader.params.get('format', None)
1972                         format_limit = self._downloader.params.get('format_limit', None)
1973
1974                         if format_limit is not None and format_limit in self._available_formats:
1975                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1976                         else:
1977                                 format_list = self._available_formats
1978                         existing_formats = [x for x in format_list if x in url_map]
1979                         if len(existing_formats) == 0:
1980                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1981                                 return
1982                         if req_format is None:
1983                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1984                         elif req_format == 'worst':
1985                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1986                         elif req_format == '-1':
1987                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1988                         else:
1989                                 # Specific format
1990                                 if req_format not in url_map:
1991                                         self._downloader.trouble(u'ERROR: requested format not available')
1992                                         return
1993                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1994
1995                 results = []
1996                 for format_param, video_real_url in video_url_list:
1997                         # Extension
1998                         video_extension = self._video_extensions.get(format_param, 'mp4')
1999
2000                         results.append({
2001                                 'id':           video_id.decode('utf-8'),
2002                                 'url':          video_real_url.decode('utf-8'),
2003                                 'uploader':     video_uploader.decode('utf-8'),
2004                                 'upload_date':  upload_date,
2005                                 'title':        video_title,
2006                                 'ext':          video_extension.decode('utf-8'),
2007                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2008                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2009                                 'description':  video_description.decode('utf-8'),
2010                                 'player_url':   None,
2011                         })
2012                 return results
2013
class BlipTVIE(InfoExtractor):
	"""Information extractor for blip.tv"""

	# Any blip.tv path; group(1) (the path) is used only for progress reporting.
	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
	# Pulls the filename extension off the end of a media URL.
	_URL_EXT = r'^.*\.([a-z0-9]+)$'
	IE_NAME = u'blip.tv'

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

	def report_direct_download(self, title):
		"""Report that the URL points directly at a media file."""
		self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

	def _real_extract(self, url):
		"""Extract video info via blip.tv's JSON API, or directly when the
		URL already serves a video (Content-Type: video/*)."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Request the JSON representation of the page, appending the query
		# with '&' or '?' depending on whether a query string already exists.
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url.encode('utf-8'))
		self.report_extraction(mobj.group(1))
		info = None
		try:
			urlh = urllib2.urlopen(request)
			if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
				# The URL is the media itself; derive id/title/ext from it.
				basename = url.split('/')[-1]
				title,ext = os.path.splitext(basename)
				title = title.decode('UTF-8')
				ext = ext.replace('.', '')
				self.report_direct_download(title)
				info = {
					'id': title,
					'url': url,
					'title': title,
					'ext': ext,
					# Hand the already-open handle to the downloader so the
					# response body is not fetched twice.
					'urlhandle': urlh
				}
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
			return
		if info is None: # Regular URL
			# NOTE: urlh was opened in the try above; read the JSON body now.
			try:
				json_code = urlh.read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
				return

			try:
				json_data = json.loads(json_code)
				# Single posts are wrapped in a 'Post' key; playlists are not.
				if 'Post' in json_data:
					data = json_data['Post']
				else:
					data = json_data

				# Normalize blip.tv's '%m-%d-%y %H:%M%p' datestamp to YYYYMMDD.
				upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
				video_url = data['media']['url']
				umobj = re.match(self._URL_EXT, video_url)
				if umobj is None:
					raise ValueError('Can not determine filename extension')
				ext = umobj.group(1)

				info = {
					'id': data['item_id'],
					'url': video_url,
					'uploader': data['display_name'],
					'upload_date': upload_date,
					'title': data['title'],
					'ext': ext,
					'format': data['media']['mimeType'],
					'thumbnail': data['thumbnailUrl'],
					'description': data['description'],
					'player_url': data['embedUrl']
				}
			except (ValueError,KeyError), err:
				self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
				return

		# NOTE(review): overrides the global User-Agent before the download —
		# presumably blip.tv serves some formats only to iTunes; confirm.
		std_headers['User-Agent'] = 'iTunes/10.6.1'
		return [info]
2100
2101
2102 class MyVideoIE(InfoExtractor):
2103         """Information Extractor for myvideo.de."""
2104
2105         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2106         IE_NAME = u'myvideo'
2107
2108         def __init__(self, downloader=None):
2109                 InfoExtractor.__init__(self, downloader)
2110         
2111         def report_download_webpage(self, video_id):
2112                 """Report webpage download."""
2113                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2114
2115         def report_extraction(self, video_id):
2116                 """Report information extraction."""
2117                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2118
2119         def _real_extract(self,url):
2120                 mobj = re.match(self._VALID_URL, url)
2121                 if mobj is None:
2122                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2123                         return
2124
2125                 video_id = mobj.group(1)
2126
2127                 # Get video webpage
2128                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2129                 try:
2130                         self.report_download_webpage(video_id)
2131                         webpage = urllib2.urlopen(request).read()
2132                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2133                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2134                         return
2135
2136                 self.report_extraction(video_id)
2137                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2138                                  webpage)
2139                 if mobj is None:
2140                         self._downloader.trouble(u'ERROR: unable to extract media URL')
2141                         return
2142                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2143
2144                 mobj = re.search('<title>([^<]+)</title>', webpage)
2145                 if mobj is None:
2146                         self._downloader.trouble(u'ERROR: unable to extract title')
2147                         return
2148
2149                 video_title = mobj.group(1)
2150
2151                 return [{
2152                         'id':           video_id,
2153                         'url':          video_url,
2154                         'uploader':     u'NA',
2155                         'upload_date':  u'NA',
2156                         'title':        video_title,
2157                         'ext':          u'flv',
2158                         'format':       u'NA',
2159                         'player_url':   None,
2160                 }]
2161
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Accepts either a ':shortname' alias (e.g. ':tds') or a full-episodes URL.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report download of the per-item media configuration XML."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report download of the episode's MRSS index."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report resolution of the Flash player URL."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _real_extract(self, url):
		"""Resolve an episode page to one info dict per video item in its feed."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# A ':shortname' alias is rewritten to that show's full-episodes URL.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = u'http://www.thedailyshow.com/full-episodes/'
			else:
				url = u'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# An empty 'episode' group means "newest episode": the site redirects
		# the bare full-episodes URL to the latest one.
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# Re-parse the redirected URL to learn which episode we landed on.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# The Flash player URL in the page embeds the mtvn: URI that keys the feed.
		mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		# Follow the player URL's redirect to get its final location.
		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		results = []

		# Each <item> in the MRSS index is one video segment of the episode.
		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			# A second XML document maps the media id to its renditions.
			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			effTitle = showId + u'-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			results.append(info)

		return results
2293
2294
2295 class EscapistIE(InfoExtractor):
2296         """Information extractor for The Escapist """
2297
2298         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2299         IE_NAME = u'escapist'
2300
2301         def report_extraction(self, showName):
2302                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2303
2304         def report_config_download(self, showName):
2305                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2306
2307         def _real_extract(self, url):
2308                 mobj = re.match(self._VALID_URL, url)
2309                 if mobj is None:
2310                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2311                         return
2312                 showName = mobj.group('showname')
2313                 videoId = mobj.group('episode')
2314
2315                 self.report_extraction(showName)
2316                 try:
2317                         webPage = urllib2.urlopen(url)
2318                         webPageBytes = webPage.read()
2319                         m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2320                         webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2321                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2322                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2323                         return
2324
2325                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2326                 description = unescapeHTML(descMatch.group(1))
2327                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2328                 imgUrl = unescapeHTML(imgMatch.group(1))
2329                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2330                 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2331                 configUrlMatch = re.search('config=(.*)$', playerUrl)
2332                 configUrl = urllib2.unquote(configUrlMatch.group(1))
2333
2334                 self.report_config_download(showName)
2335                 try:
2336                         configJSON = urllib2.urlopen(configUrl).read()
2337                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2338                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2339                         return
2340
2341                 # Technically, it's JavaScript, not JSON
2342                 configJSON = configJSON.replace("'", '"')
2343
2344                 try:
2345                         config = json.loads(configJSON)
2346                 except (ValueError,), err:
2347                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2348                         return
2349
2350                 playlist = config['playlist']
2351                 videoUrl = playlist[1]['url']
2352
2353                 info = {
2354                         'id': videoId,
2355                         'url': videoUrl,
2356                         'uploader': showName,
2357                         'upload_date': None,
2358                         'title': showName,
2359                         'ext': 'flv',
2360                         'format': 'flv',
2361                         'thumbnail': imgUrl,
2362                         'description': description,
2363                         'player_url': playerUrl,
2364                 }
2365
2366                 return [info]
2367
2368
2369 class CollegeHumorIE(InfoExtractor):
2370         """Information extractor for collegehumor.com"""
2371
2372         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2373         IE_NAME = u'collegehumor'
2374
2375         def report_webpage(self, video_id):
2376                 """Report information extraction."""
2377                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2378
2379         def report_extraction(self, video_id):
2380                 """Report information extraction."""
2381                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2382
2383         def _real_extract(self, url):
2384                 mobj = re.match(self._VALID_URL, url)
2385                 if mobj is None:
2386                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2387                         return
2388                 video_id = mobj.group('videoid')
2389
2390                 self.report_webpage(video_id)
2391                 request = urllib2.Request(url)
2392                 try:
2393                         webpage = urllib2.urlopen(request).read()
2394                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2395                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2396                         return
2397
2398                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2399                 if m is None:
2400                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2401                         return
2402                 internal_video_id = m.group('internalvideoid')
2403
2404                 info = {
2405                         'id': video_id,
2406                         'internal_id': internal_video_id,
2407                 }
2408
2409                 self.report_extraction(video_id)
2410                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2411                 try:
2412                         metaXml = urllib2.urlopen(xmlUrl).read()
2413                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2414                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2415                         return
2416
2417                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2418                 try:
2419                         videoNode = mdoc.findall('./video')[0]
2420                         info['description'] = videoNode.findall('./description')[0].text
2421                         info['title'] = videoNode.findall('./caption')[0].text
2422                         info['url'] = videoNode.findall('./file')[0].text
2423                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2424                         info['ext'] = info['url'].rpartition('.')[2]
2425                         info['format'] = info['ext']
2426                 except IndexError:
2427                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2428                         return
2429
2430                 return [info]
2431
2432
2433 class XVideosIE(InfoExtractor):
2434         """Information extractor for xvideos.com"""
2435
2436         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2437         IE_NAME = u'xvideos'
2438
2439         def report_webpage(self, video_id):
2440                 """Report information extraction."""
2441                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2442
2443         def report_extraction(self, video_id):
2444                 """Report information extraction."""
2445                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2446
2447         def _real_extract(self, url):
2448                 mobj = re.match(self._VALID_URL, url)
2449                 if mobj is None:
2450                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2451                         return
2452                 video_id = mobj.group(1).decode('utf-8')
2453
2454                 self.report_webpage(video_id)
2455
2456                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2457                 try:
2458                         webpage = urllib2.urlopen(request).read()
2459                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2460                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2461                         return
2462
2463                 self.report_extraction(video_id)
2464
2465
2466                 # Extract video URL
2467                 mobj = re.search(r'flv_url=(.+?)&', webpage)
2468                 if mobj is None:
2469                         self._downloader.trouble(u'ERROR: unable to extract video url')
2470                         return
2471                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2472
2473
2474                 # Extract title
2475                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2476                 if mobj is None:
2477                         self._downloader.trouble(u'ERROR: unable to extract video title')
2478                         return
2479                 video_title = mobj.group(1).decode('utf-8')
2480
2481
2482                 # Extract video thumbnail
2483                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2484                 if mobj is None:
2485                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2486                         return
2487                 video_thumbnail = mobj.group(0).decode('utf-8')
2488
2489                 info = {
2490                         'id': video_id,
2491                         'url': video_url,
2492                         'uploader': None,
2493                         'upload_date': None,
2494                         'title': video_title,
2495                         'ext': 'flv',
2496                         'format': 'flv',
2497                         'thumbnail': video_thumbnail,
2498                         'description': None,
2499                         'player_url': None,
2500                 }
2501
2502                 return [info]
2503
2504
2505 class SoundcloudIE(InfoExtractor):
2506         """Information extractor for soundcloud.com
2507            To access the media, the uid of the song and a stream token
2508            must be extracted from the page source and the script must make
2509            a request to media.soundcloud.com/crossdomain.xml. Then
2510            the media can be grabbed by requesting from an url composed
2511            of the stream token and uid
2512          """
2513
2514         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2515         IE_NAME = u'soundcloud'
2516
2517         def __init__(self, downloader=None):
2518                 InfoExtractor.__init__(self, downloader)
2519
2520         def report_webpage(self, video_id):
2521                 """Report information extraction."""
2522                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2523
2524         def report_extraction(self, video_id):
2525                 """Report information extraction."""
2526                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2527
2528         def _real_extract(self, url):
2529                 mobj = re.match(self._VALID_URL, url)
2530                 if mobj is None:
2531                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2532                         return
2533
2534                 # extract uploader (which is in the url)
2535                 uploader = mobj.group(1).decode('utf-8')
2536                 # extract simple title (uploader + slug of song title)
2537                 slug_title =  mobj.group(2).decode('utf-8')
2538                 simple_title = uploader + u'-' + slug_title
2539
2540                 self.report_webpage('%s/%s' % (uploader, slug_title))
2541
2542                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2543                 try:
2544                         webpage = urllib2.urlopen(request).read()
2545                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2546                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2547                         return
2548
2549                 self.report_extraction('%s/%s' % (uploader, slug_title))
2550
2551                 # extract uid and stream token that soundcloud hands out for access
2552                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2553                 if mobj:
2554                         video_id = mobj.group(1)
2555                         stream_token = mobj.group(2)
2556
2557                 # extract unsimplified title
2558                 mobj = re.search('"title":"(.*?)",', webpage)
2559                 if mobj:
2560                         title = mobj.group(1).decode('utf-8')
2561                 else:
2562                         title = simple_title
2563
2564                 # construct media url (with uid/token)
2565                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2566                 mediaURL = mediaURL % (video_id, stream_token)
2567
2568                 # description
2569                 description = u'No description available'
2570                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2571                 if mobj:
2572                         description = mobj.group(1)
2573                 
2574                 # upload date
2575                 upload_date = None
2576                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2577                 if mobj:
2578                         try:
2579                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2580                         except Exception, e:
2581                                 self._downloader.to_stderr(str(e))
2582
2583                 # for soundcloud, a request to a cross domain is required for cookies
2584                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2585
2586                 return [{
2587                         'id':           video_id.decode('utf-8'),
2588                         'url':          mediaURL,
2589                         'uploader':     uploader.decode('utf-8'),
2590                         'upload_date':  upload_date,
2591                         'title':        title,
2592                         'ext':          u'mp3',
2593                         'format':       u'NA',
2594                         'player_url':   None,
2595                         'description': description.decode('utf-8')
2596                 }]
2597
2598
2599 class InfoQIE(InfoExtractor):
2600         """Information extractor for infoq.com"""
2601
2602         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2603         IE_NAME = u'infoq'
2604
2605         def report_webpage(self, video_id):
2606                 """Report information extraction."""
2607                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2608
2609         def report_extraction(self, video_id):
2610                 """Report information extraction."""
2611                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2612
2613         def _real_extract(self, url):
2614                 mobj = re.match(self._VALID_URL, url)
2615                 if mobj is None:
2616                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2617                         return
2618
2619                 self.report_webpage(url)
2620
2621                 request = urllib2.Request(url)
2622                 try:
2623                         webpage = urllib2.urlopen(request).read()
2624                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2625                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2626                         return
2627
2628                 self.report_extraction(url)
2629
2630
2631                 # Extract video URL
2632                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2633                 if mobj is None:
2634                         self._downloader.trouble(u'ERROR: unable to extract video url')
2635                         return
2636                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2637
2638
2639                 # Extract title
2640                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2641                 if mobj is None:
2642                         self._downloader.trouble(u'ERROR: unable to extract video title')
2643                         return
2644                 video_title = mobj.group(1).decode('utf-8')
2645
2646                 # Extract description
2647                 video_description = u'No description available.'
2648                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2649                 if mobj is not None:
2650                         video_description = mobj.group(1).decode('utf-8')
2651
2652                 video_filename = video_url.split('/')[-1]
2653                 video_id, extension = video_filename.split('.')
2654
2655                 info = {
2656                         'id': video_id,
2657                         'url': video_url,
2658                         'uploader': None,
2659                         'upload_date': None,
2660                         'title': video_title,
2661                         'ext': extension,
2662                         'format': extension, # Extension is always(?) mp4, but seems to be flv
2663                         'thumbnail': None,
2664                         'description': video_description,
2665                         'player_url': None,
2666                 }
2667
2668                 return [info]
2669
2670 class MixcloudIE(InfoExtractor):
2671         """Information extractor for www.mixcloud.com"""
2672         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2673         IE_NAME = u'mixcloud'
2674
2675         def __init__(self, downloader=None):
2676                 InfoExtractor.__init__(self, downloader)
2677
2678         def report_download_json(self, file_id):
2679                 """Report JSON download."""
2680                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2681
2682         def report_extraction(self, file_id):
2683                 """Report information extraction."""
2684                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2685
2686         def get_urls(self, jsonData, fmt, bitrate='best'):
2687                 """Get urls from 'audio_formats' section in json"""
2688                 file_url = None
2689                 try:
2690                         bitrate_list = jsonData[fmt]
2691                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2692                                 bitrate = max(bitrate_list) # select highest
2693
2694                         url_list = jsonData[fmt][bitrate]
2695                 except TypeError: # we have no bitrate info.
2696                         url_list = jsonData[fmt]
2697                 return url_list
2698
2699         def check_urls(self, url_list):
2700                 """Returns 1st active url from list"""
2701                 for url in url_list:
2702                         try:
2703                                 urllib2.urlopen(url)
2704                                 return url
2705                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2706                                 url = None
2707
2708                 return None
2709
2710         def _print_formats(self, formats):
2711                 print 'Available formats:'
2712                 for fmt in formats.keys():
2713                         for b in formats[fmt]:
2714                                 try:
2715                                         ext = formats[fmt][b][0]
2716                                         print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2717                                 except TypeError: # we have no bitrate info
2718                                         ext = formats[fmt][0]
2719                                         print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2720                                         break
2721
2722         def _real_extract(self, url):
2723                 mobj = re.match(self._VALID_URL, url)
2724                 if mobj is None:
2725                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2726                         return
2727                 # extract uploader & filename from url
2728                 uploader = mobj.group(1).decode('utf-8')
2729                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2730
2731                 # construct API request
2732                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2733                 # retrieve .json file with links to files
2734                 request = urllib2.Request(file_url)
2735                 try:
2736                         self.report_download_json(file_url)
2737                         jsonData = urllib2.urlopen(request).read()
2738                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2739                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2740                         return
2741
2742                 # parse JSON
2743                 json_data = json.loads(jsonData)
2744                 player_url = json_data['player_swf_url']
2745                 formats = dict(json_data['audio_formats'])
2746
2747                 req_format = self._downloader.params.get('format', None)
2748                 bitrate = None
2749
2750                 if self._downloader.params.get('listformats', None):
2751                         self._print_formats(formats)
2752                         return
2753
2754                 if req_format is None or req_format == 'best':
2755                         for format_param in formats.keys():
2756                                 url_list = self.get_urls(formats, format_param)
2757                                 # check urls
2758                                 file_url = self.check_urls(url_list)
2759                                 if file_url is not None:
2760                                         break # got it!
2761                 else:
2762                         if req_format not in formats.keys():
2763                                 self._downloader.trouble(u'ERROR: format is not available')
2764                                 return
2765
2766                         url_list = self.get_urls(formats, req_format)
2767                         file_url = self.check_urls(url_list)
2768                         format_param = req_format
2769
2770                 return [{
2771                         'id': file_id.decode('utf-8'),
2772                         'url': file_url.decode('utf-8'),
2773                         'uploader':     uploader.decode('utf-8'),
2774                         'upload_date': u'NA',
2775                         'title': json_data['name'],
2776                         'ext': file_url.split('.')[-1].decode('utf-8'),
2777                         'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2778                         'thumbnail': json_data['thumbnail_url'],
2779                         'description': json_data['description'],
2780                         'player_url': player_url.decode('utf-8'),
2781                 }]
2782
class StanfordOpenClassroomIE(InfoExtractor):
	"""Information extractor for Stanford's Open ClassRoom"""

	# Matches the root page, a course page (?course=...), or a single video
	# page (?course=...&video=...); the named groups drive the branch below.
	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
	IE_NAME = u'stanfordoc'

	def report_download_webpage(self, objid):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Extract one video, a course playlist, or the whole-site playlist.

		Playlist branches recurse through self.extract() on each child
		reference url, so the result list may span many pages.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('course') and mobj.group('video'): # A specific video
			course = mobj.group('course')
			video = mobj.group('video')
			info = {
				'id': course + '_' + video,
			}

			self.report_extraction(info['id'])
			# Per-video metadata lives in an XML file next to the media.
			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
			xmlUrl = baseUrl + video + '.xml'
			try:
				metaXml = urllib2.urlopen(xmlUrl).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
				return
			mdoc = xml.etree.ElementTree.fromstring(metaXml)
			try:
				info['title'] = mdoc.findall('./title')[0].text
				# videoFile is relative; resolve it against the course base.
				info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
			except IndexError:
				self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
				return
			info['ext'] = info['url'].rpartition('.')[2]
			info['format'] = info['ext']
			return [info]
		elif mobj.group('course'): # A course page
			course = mobj.group('course')
			info = {
				'id': course,
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			try:
				coursepage = urllib2.urlopen(url).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			m = re.search('<h1>([^<]+)</h1>', coursepage)
			if m:
				info['title'] = unescapeHTML(m.group(1))
			else:
				# Fall back to the course id when the page has no heading.
				info['title'] = info['id']

			m = re.search('<description>([^<]+)</description>', coursepage)
			if m:
				info['description'] = unescapeHTML(m.group(1))

			# Collect each lecture link once, preserving page order.
			links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
				}
					for vpage in links]
			results = []
			for entry in info['list']:
				assert entry['type'] == 'reference'
				# Recurse into each video page via the standard entry point.
				results += self.extract(entry['url'])
			return results

		else: # Root page
			info = {
				'id': 'Stanford OpenClassroom',
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
			try:
				rootpage = urllib2.urlopen(rootURL).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			info['title'] = info['id']

			# Collect each course link once, preserving page order.
			links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
				}
					for cpage in links]

			results = []
			for entry in info['list']:
				assert entry['type'] == 'reference'
				# Recurse into each course page, which in turn recurses
				# into its videos.
				results += self.extract(entry['url'])
			return results
2894
2895 class MTVIE(InfoExtractor):
2896         """Information extractor for MTV.com"""
2897
2898         _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2899         IE_NAME = u'mtv'
2900
2901         def report_webpage(self, video_id):
2902                 """Report information extraction."""
2903                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2904
2905         def report_extraction(self, video_id):
2906                 """Report information extraction."""
2907                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2908
2909         def _real_extract(self, url):
2910                 mobj = re.match(self._VALID_URL, url)
2911                 if mobj is None:
2912                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2913                         return
2914                 if not mobj.group('proto'):
2915                         url = 'http://' + url
2916                 video_id = mobj.group('videoid')
2917                 self.report_webpage(video_id)
2918
2919                 request = urllib2.Request(url)
2920                 try:
2921                         webpage = urllib2.urlopen(request).read()
2922                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2923                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2924                         return
2925
2926                 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2927                 if mobj is None:
2928                         self._downloader.trouble(u'ERROR: unable to extract song name')
2929                         return
2930                 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2931                 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2932                 if mobj is None:
2933                         self._downloader.trouble(u'ERROR: unable to extract performer')
2934                         return
2935                 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2936                 video_title = performer + ' - ' + song_name 
2937
2938                 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2939                 if mobj is None:
2940                         self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2941                         return
2942                 mtvn_uri = mobj.group(1)
2943
2944                 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2945                 if mobj is None:
2946                         self._downloader.trouble(u'ERROR: unable to extract content id')
2947                         return
2948                 content_id = mobj.group(1)
2949
2950                 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2951                 self.report_extraction(video_id)
2952                 request = urllib2.Request(videogen_url)
2953                 try:
2954                         metadataXml = urllib2.urlopen(request).read()
2955                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2956                         self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2957                         return
2958
2959                 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2960                 renditions = mdoc.findall('.//rendition')
2961
2962                 # For now, always pick the highest quality.
2963                 rendition = renditions[-1]
2964
2965                 try:
2966                         _,_,ext = rendition.attrib['type'].partition('/')
2967                         format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2968                         video_url = rendition.find('./src').text
2969                 except KeyError:
2970                         self._downloader.trouble('Invalid rendition field.')
2971                         return
2972
2973                 info = {
2974                         'id': video_id,
2975                         'url': video_url,
2976                         'uploader': performer,
2977                         'title': video_title,
2978                         'ext': ext,
2979                         'format': format,
2980                 }
2981
2982                 return [info]
2983
2984
class YoukuIE(InfoExtractor):
	"""Information extractor for v.youku.com.

	Youku serves a video as a list of flv/mp4 segments; one info dict is
	returned per segment, with ids of the form <video_id>_part00,
	<video_id>_part01, ...  The per-segment file ids are obfuscated with a
	seeded shuffle of a fixed alphabet (see _get_file_ID_mix_string).
	"""

	_VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
	IE_NAME = u'Youku'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, file_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

	def _gen_sid(self):
		"""Build a pseudo-unique session id: millisecond timestamp followed
		by two random integers, mimicking the official player."""
		nowTime = int(time.time() * 1000)
		random1 = random.randint(1000, 1998)
		random2 = random.randint(1000, 9999)

		return "%d%d%d" % (nowTime, random1, random2)

	def _get_file_ID_mix_string(self, seed):
		"""Return the character alphabet shuffled by Youku's seeded PRNG.

		The update step (seed * 211 + 30031) % 65536 replicates the site
		player's generator, so the same seed always yields the same
		shuffle.  Returns a list of single characters.
		"""
		mixed = []
		source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
		seed = float(seed)
		for i in range(len(source)):
			seed = (seed * 211 + 30031) % 65536
			index = math.floor(seed / 65536 * len(source))
			mixed.append(source[int(index)])
			source.remove(source[int(index)])
		return mixed

	def _get_file_id(self, fileId, seed):
		"""Decode an obfuscated file id: each '*'-separated number in
		fileId is an index into the seed-shuffled alphabet."""
		mixed = self._get_file_ID_mix_string(seed)
		ids = fileId.split('*')
		realId = []
		for ch in ids:
			if ch:
				realId.append(mixed[int(ch)])
		return ''.join(realId)

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group('ID')

		info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

		request = urllib2.Request(info_url, None, std_headers)
		try:
			self.report_download_webpage(video_id)
			jsondata = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		self.report_extraction(video_id)
		try:
			config = json.loads(jsondata)

			video_title = config['data'][0]['title']
			seed = config['data'][0]['seed']

			format = self._downloader.params.get('format', None)
			supported_format = config['data'][0]['streamfileids'].keys()

			# Map the requested format onto what the page offers; anything
			# other than best/worst falls back to plain flv.
			if format is None or format == 'best':
				if 'hd2' in supported_format:
					format = 'hd2'
				else:
					format = 'flv'
				ext = u'flv'
			elif format == 'worst':
				format = 'mp4'
				ext = u'mp4'
			else:
				format = 'flv'
				ext = u'flv'

			fileid = config['data'][0]['streamfileids'][format]
			seg_number = len(config['data'][0]['segs'][format])

			keys = []
			for i in xrange(seg_number):
				keys.append(config['data'][0]['segs'][format][i]['k'])

			#TODO check error
			#youku only could be viewed from mainland china
		except Exception:
			# Was a bare "except:", which also swallowed SystemExit and
			# KeyboardInterrupt.  A missing JSON key here usually means the
			# video is unavailable (Youku is mainland-China only).
			self._downloader.trouble(u'ERROR: unable to extract info section')
			return

		files_info = []
		sid = self._gen_sid()
		fileid = self._get_file_id(fileid, seed)

		# Columns 8-9 of the fileid encode the segment number, so they are
		# rewritten for every segment below.
		for index, key in enumerate(keys):

			temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
			download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

			info = {
				'id': '%s_part%02d' % (video_id, index),
				'url': download_url,
				'uploader': None,
				'title': video_title,
				'ext': ext,
				'format': u'NA'
			}
			files_info.append(info)

		return files_info
3105
3106
class XNXXIE(InfoExtractor):
	"""Information extractor for xnxx.com"""

	# Video pages look like http://video.xnxx.com/video<digits>/<slug>;
	# group 1 captures the numeric video id.
	_VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
	IE_NAME = u'xnxx'
	# Patterns applied to the raw page HTML in _real_extract: the flv url
	# and thumbnail are url-encoded query parameters, the title is taken
	# from the <title> tag.
	VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
	VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
	VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3115
3116         def report_webpage(self, video_id):
3117                 """Report information extraction"""
3118                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3119
3120         def report_extraction(self, video_id):
3121                 """Report information extraction"""
3122                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3123
3124         def _real_extract(self, url):
3125                 mobj = re.match(self._VALID_URL, url)
3126                 if mobj is None:
3127                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3128                         return
3129                 video_id = mobj.group(1).decode('utf-8')
3130
3131                 self.report_webpage(video_id)
3132
3133                 # Get webpage content
3134                 try:
3135                         webpage = urllib2.urlopen(url).read()
3136                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3137                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3138                         return
3139
3140                 result = re.search(self.VIDEO_URL_RE, webpage)
3141                 if result is None:
3142                         self._downloader.trouble(u'ERROR: unable to extract video url')
3143                         return
3144                 video_url = urllib.unquote(result.group(1).decode('utf-8'))
3145
3146                 result = re.search(self.VIDEO_TITLE_RE, webpage)
3147                 if result is None:
3148                         self._downloader.trouble(u'ERROR: unable to extract video title')
3149                         return
3150                 video_title = result.group(1).decode('utf-8')
3151
3152                 result = re.search(self.VIDEO_THUMB_RE, webpage)
3153                 if result is None:
3154                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3155                         return
3156                 video_thumbnail = result.group(1).decode('utf-8')
3157
3158                 info = {'id': video_id,
3159                                 'url': video_url,
3160                                 'uploader': None,
3161                                 'upload_date': None,
3162                                 'title': video_title,
3163                                 'ext': 'flv',
3164                                 'format': 'flv',
3165                                 'thumbnail': video_thumbnail,
3166                                 'description': None,
3167                                 'player_url': None}
3168
3169                 return [info]