added YouJizz extractor
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import datetime
5 import HTMLParser
6 import httplib
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import urllib
13 import urllib2
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 from urlparse import parse_qs, urlparse
19
20 try:
21         import cStringIO as StringIO
22 except ImportError:
23         import StringIO
24
25 from utils import *
26
27
class InfoExtractor(object):
        """Base class for all information extractors.

        An information extractor takes a URL and produces, for each video
        that URL refers to, a dictionary of metadata which is handed to the
        FileDownloader. The FileDownloader then acts on that information,
        typically by downloading the video. Every dictionary must carry
        these fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        ext:            Video filename extension.
        format:         Video format.
        player_url:     SWF Player URL (may be None).

        The following fields are optional; their primary purpose is to let
        youtube-dl serve as the backend of a video search front end (such
        as the one in youtube2mp3). They are only used when their
        respective forced printing functions are called:

        thumbnail:      Full URL to a video thumbnail image.
        description:    One-line video description.

        Subclasses should override _real_initialize() and _real_extract()
        and define a _VALID_URL regular expression; normally they should
        also be registered in the list of extractors.
        """

        # Class-level defaults; instances shadow these in __init__/set_downloader.
        _ready = False
        _downloader = None

        def __init__(self, downloader=None):
                """Create the extractor, optionally attaching a downloader."""
                self._ready = False
                self.set_downloader(downloader)

        def suitable(self, url):
                """Return True if this extractor can handle the given URL."""
                match = re.match(self._VALID_URL, url)
                return match is not None

        def initialize(self):
                """Run one-time setup (authentication, etc.), at most once."""
                if self._ready:
                        return
                self._real_initialize()
                self._ready = True

        def extract(self, url):
                """Extract URL information and return it as a list of dicts."""
                self.initialize()
                return self._real_extract(url)

        def set_downloader(self, downloader):
                """Attach the downloader used for reporting and downloading."""
                self._downloader = downloader

        def _real_initialize(self):
                """Real initialization process. Redefine in subclasses."""
                pass

        def _real_extract(self, url):
                """Real extraction process. Redefine in subclasses."""
                pass
96
97
98
class YoutubeIE(InfoExtractor):
        """Information extractor for youtube.com."""

        _VALID_URL = r"""^
                         (
                             (?:https?://)?                                       # http(s):// (optional)
                             (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                                tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                             (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                             (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                             (?:                                                  # the various things that can precede the ID:
                                 (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                                 |(?:                                             # or the v= param in all its forms
                                     (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                     (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                     (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
                                     v=
                                 )
                             )?                                                   # optional -> youtube.com/xxxx is OK
                         )?                                                       # all until now is optional -> you can pass the naked ID
                         ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                         (?(1).+)?                                                # if we found the ID, everything can follow
                         $"""
        _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
        _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
        _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
        _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
        _NETRC_MACHINE = 'youtube'
        # Listed in order of quality (itag codes, best first)
        _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
        _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
        # itag -> container extension; anything not listed defaults to 'flv'
        _video_extensions = {
                '13': '3gp',
                '17': 'mp4',
                '18': 'mp4',
                '22': 'mp4',
                '37': 'mp4',
                '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
                '43': 'webm',
                '44': 'webm',
                '45': 'webm',
                '46': 'webm',
        }
        # itag -> display size, used only by _print_formats
        # NOTE(review): values read as height x width (e.g. '720x1280' for 720p) — confirm intended order
        _video_dimensions = {
                '5': '240x400',
                '6': '???',
                '13': '???',
                '17': '144x176',
                '18': '360x640',
                '22': '720x1280',
                '34': '360x640',
                '35': '480x854',
                '37': '1080x1920',
                '38': '3072x4096',
                '43': '360x640',
                '44': '480x854',
                '45': '720x1280',
                '46': '1080x1920',
        }
        IE_NAME = u'youtube'

        def suitable(self, url):
                """Receives a URL and returns True if suitable for this IE."""
                # Overridden from the base class because _VALID_URL is written
                # with whitespace/comments and needs the re.VERBOSE flag.
                return re.match(self._VALID_URL, url, re.VERBOSE) is not None

        def report_lang(self):
                """Report attempt to set language."""
                self._downloader.to_screen(u'[youtube] Setting language')

        def report_login(self):
                """Report attempt to log in."""
                self._downloader.to_screen(u'[youtube] Logging in')

        def report_age_confirmation(self):
                """Report attempt to confirm age."""
                self._downloader.to_screen(u'[youtube] Confirming age')

        def report_video_webpage_download(self, video_id):
                """Report attempt to download video webpage."""
                self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

        def report_video_info_webpage_download(self, video_id):
                """Report attempt to download video info webpage."""
                self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

        def report_video_subtitles_download(self, video_id):
                """Report attempt to download video subtitles."""
                self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

        def report_information_extraction(self, video_id):
                """Report attempt to extract video information."""
                self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

        def report_unavailable_format(self, video_id, format):
                """Report that the requested format is not available."""
                self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

        def report_rtmp_download(self):
                """Indicate the download will use the RTMP protocol."""
                self._downloader.to_screen(u'[youtube] RTMP download detected')

        def _closed_captions_xml_to_srt(self, xml_string):
                """Convert YouTube timedtext XML into SubRip (.srt) text."""
                srt = ''
                texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
                # TODO parse xml instead of regex
                for n, (start, dur_tag, dur, caption) in enumerate(texts):
                        if not dur: dur = '4'  # default duration when no dur attribute is present
                        start = float(start)
                        end = start + float(dur)
                        # Format as HH:MM:SS,mmm (SRT timestamp format)
                        start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
                        end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
                        caption = unescapeHTML(caption)
                        caption = unescapeHTML(caption) # double cycle, intentional
                        srt += str(n+1) + '\n'
                        srt += start + ' --> ' + end + '\n'
                        srt += caption + '\n\n'
                return srt

        def _print_formats(self, formats):
                """Print each available itag with its extension and dimensions."""
                print 'Available formats:'
                for x in formats:
                        print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

        def _real_initialize(self):
                """Set interface language, then optionally log in and confirm age.

                Side effects only (cookies on the shared urllib2 opener);
                failures are reported through the downloader and abort setup.
                """
                if self._downloader is None:
                        return

                username = None
                password = None
                downloader_params = self._downloader.params

                # Attempt to use provided username and password or .netrc data
                if downloader_params.get('username', None) is not None:
                        username = downloader_params['username']
                        password = downloader_params['password']
                elif downloader_params.get('usenetrc', False):
                        try:
                                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                                if info is not None:
                                        username = info[0]
                                        password = info[2]
                                else:
                                        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
                        except (IOError, netrc.NetrcParseError), err:
                                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                                return

                # Set language
                request = urllib2.Request(self._LANG_URL)
                try:
                        self.report_lang()
                        urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
                        return

                # No authentication to be performed
                if username is None:
                        return

                # Log in
                login_form = {
                                'current_form': 'loginForm',
                                'next':         '/',
                                'action_login': 'Log In',
                                'username':     username,
                                'password':     password,
                                }
                request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
                try:
                        self.report_login()
                        login_results = urllib2.urlopen(request).read()
                        # If the login form is still present, the credentials were rejected
                        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                                return
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
                        return

                # Confirm age
                age_form = {
                                'next_url':             '/',
                                'action_confirm':       'Confirm',
                                }
                request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
                try:
                        self.report_age_confirmation()
                        age_results = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
                        return

        def _real_extract(self, url):
                """Download the watch page and video info, pick the format(s)
                to fetch, and return a list of info dictionaries (one per
                selected format). Errors are reported via self._downloader.trouble
                and cause an early (None) return."""
                # Extract original video URL from URL with redirection, like age verification, using next_url parameter
                mobj = re.search(self._NEXT_URL_RE, url)
                if mobj:
                        url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

                # Extract video id from URL
                mobj = re.match(self._VALID_URL, url, re.VERBOSE)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
                        return
                video_id = mobj.group(2)

                # Get video webpage
                self.report_video_webpage_download(video_id)
                request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
                try:
                        video_webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
                        return

                # Attempt to extract SWF player URL
                mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
                if mobj is not None:
                        # Unescape the JS-escaped URL (\/ -> /)
                        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
                else:
                        player_url = None

                # Get video info: try several 'el' variants until one yields a token
                self.report_video_info_webpage_download(video_id)
                for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                        video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                                        % (video_id, el_type))
                        request = urllib2.Request(video_info_url)
                        try:
                                video_info_webpage = urllib2.urlopen(request).read()
                                video_info = parse_qs(video_info_webpage)
                                if 'token' in video_info:
                                        break
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                                return
                if 'token' not in video_info:
                        if 'reason' in video_info:
                                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
                        else:
                                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
                        return

                # Check for "rental" videos
                if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
                        self._downloader.trouble(u'ERROR: "rental" videos not supported')
                        return

                # Start extracting information
                self.report_information_extraction(video_id)

                # uploader
                if 'author' not in video_info:
                        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
                        return
                video_uploader = urllib.unquote_plus(video_info['author'][0])

                # title
                if 'title' not in video_info:
                        self._downloader.trouble(u'ERROR: unable to extract video title')
                        return
                video_title = urllib.unquote_plus(video_info['title'][0])
                video_title = video_title.decode('utf-8')

                # thumbnail image
                if 'thumbnail_url' not in video_info:
                        self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
                        video_thumbnail = ''
                else:   # don't panic if we can't find it
                        video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

                # upload date
                upload_date = u'NA'
                mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
                if mobj is not None:
                        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
                        format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
                        for expression in format_expressions:
                                try:
                                        upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                                except:
                                        # NOTE(review): bare except deliberately skips formats that
                                        # don't parse; once one matches, later iterations fail on the
                                        # already-reformatted date and are skipped too.
                                        pass

                # description
                video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
                if video_description: video_description = clean_html(video_description)
                else: video_description = ''

                # closed captions
                video_subtitles = None
                if self._downloader.params.get('writesubtitles', False):
                        try:
                                self.report_video_subtitles_download(video_id)
                                request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                                try:
                                        srt_list = urllib2.urlopen(request).read()
                                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                        raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                                # Map language code -> track name from the track list
                                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                                if not srt_lang_list:
                                        raise Trouble(u'WARNING: video has no closed captions')
                                # Language preference: explicit option > English > first available
                                if self._downloader.params.get('subtitleslang', False):
                                        srt_lang = self._downloader.params.get('subtitleslang')
                                elif 'en' in srt_lang_list:
                                        srt_lang = 'en'
                                else:
                                        srt_lang = srt_lang_list.keys()[0]
                                if not srt_lang in srt_lang_list:
                                        raise Trouble(u'WARNING: no closed captions found in the specified language')
                                request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                                try:
                                        srt_xml = urllib2.urlopen(request).read()
                                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                        raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                                if not srt_xml:
                                        raise Trouble(u'WARNING: unable to download video subtitles')
                                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
                        except Trouble as trouble:
                                # Subtitle problems are warnings only; extraction continues
                                self._downloader.trouble(trouble[0])

                if 'length_seconds' not in video_info:
                        self._downloader.trouble(u'WARNING: unable to extract video duration')
                        video_duration = ''
                else:
                        video_duration = urllib.unquote_plus(video_info['length_seconds'][0])

                # token
                video_token = urllib.unquote_plus(video_info['token'][0])

                # Decide which formats to download
                req_format = self._downloader.params.get('format', None)

                if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
                        self.report_rtmp_download()
                        video_url_list = [(None, video_info['conn'][0])]
                elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
                        url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
                        url_data = [parse_qs(uds) for uds in url_data_strs]
                        url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
                        # NOTE(review): assumes every stream entry carries a 'sig'
                        # parameter — raises KeyError otherwise; confirm upstream.
                        url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

                        format_limit = self._downloader.params.get('format_limit', None)
                        available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
                        if format_limit is not None and format_limit in available_formats:
                                format_list = available_formats[available_formats.index(format_limit):]
                        else:
                                format_list = available_formats
                        existing_formats = [x for x in format_list if x in url_map]
                        if len(existing_formats) == 0:
                                self._downloader.trouble(u'ERROR: no known formats available for video')
                                return
                        if self._downloader.params.get('listformats', None):
                                self._print_formats(existing_formats)
                                return
                        if req_format is None or req_format == 'best':
                                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
                        elif req_format == 'worst':
                                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
                        elif req_format in ('-1', 'all'):
                                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                        else:
                                # Specific formats. We pick the first in a slash-delimeted sequence.
                                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                                req_formats = req_format.split('/')
                                video_url_list = None
                                for rf in req_formats:
                                        if rf in url_map:
                                                video_url_list = [(rf, url_map[rf])]
                                                break
                                if video_url_list is None:
                                        self._downloader.trouble(u'ERROR: requested format not available')
                                        return
                else:
                        self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
                        return

                results = []
                for format_param, video_real_url in video_url_list:
                        # Extension
                        video_extension = self._video_extensions.get(format_param, 'flv')

                        results.append({
                                'id':           video_id.decode('utf-8'),
                                'url':          video_real_url.decode('utf-8'),
                                'uploader':     video_uploader.decode('utf-8'),
                                'upload_date':  upload_date,
                                'title':        video_title,
                                'ext':          video_extension.decode('utf-8'),
                                'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
                                'thumbnail':    video_thumbnail.decode('utf-8'),
                                'description':  video_description,
                                'player_url':   player_url,
                                'subtitles':    video_subtitles,
                                'duration':             video_duration
                        })
                return results
495
496
class MetacafeIE(InfoExtractor):
        """Information Extractor for metacafe.com."""

        # Watch-page URL: group(1) is the video id, group(2) the title slug.
        _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
        # Family-filter pages used by _real_initialize to confirm age.
        _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
        _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
        IE_NAME = u'metacafe'
504
        def __init__(self, downloader=None):
                # No extractor-specific state; delegate to the base class.
                InfoExtractor.__init__(self, downloader)
507
        def report_disclaimer(self):
                """Report disclaimer retrieval."""
                self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
511
        def report_age_confirmation(self):
                """Report attempt to confirm age."""
                self._downloader.to_screen(u'[metacafe] Confirming age')
515
        def report_download_webpage(self, video_id):
                """Report webpage download for the given video id."""
                self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
519
        def report_extraction(self, video_id):
                """Report information extraction for the given video id."""
                self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
523
        def _real_initialize(self):
                """Retrieve the disclaimer page and submit the family-filter
                form to confirm age (side effects only; the response bodies
                are not used)."""
                # Retrieve disclaimer
                request = urllib2.Request(self._DISCLAIMER)
                try:
                        self.report_disclaimer()
                        disclaimer = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
                        return

                # Confirm age by POSTing the filter form ('filters': '0' disables filtering)
                disclaimer_form = {
                        'filters': '0',
                        'submit': "Continue - I'm over 18",
                        }
                request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
                try:
                        self.report_age_confirmation()
                        disclaimer = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
                        return
546
547         def _real_extract(self, url):
548                 # Extract id and simplified title from URL
549                 mobj = re.match(self._VALID_URL, url)
550                 if mobj is None:
551                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
552                         return
553
554                 video_id = mobj.group(1)
555
556                 # Check if video comes from YouTube
557                 mobj2 = re.match(r'^yt-(.*)$', video_id)
558                 if mobj2 is not None:
559                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
560                         return
561
562                 # Retrieve video webpage to extract further information
563                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
564                 try:
565                         self.report_download_webpage(video_id)
566                         webpage = urllib2.urlopen(request).read()
567                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
568                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
569                         return
570
571                 # Extract URL, uploader and title from webpage
572                 self.report_extraction(video_id)
573                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
574                 if mobj is not None:
575                         mediaURL = urllib.unquote(mobj.group(1))
576                         video_extension = mediaURL[-3:]
577
578                         # Extract gdaKey if available
579                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
580                         if mobj is None:
581                                 video_url = mediaURL
582                         else:
583                                 gdaKey = mobj.group(1)
584                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
585                 else:
586                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
587                         if mobj is None:
588                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
589                                 return
590                         vardict = parse_qs(mobj.group(1))
591                         if 'mediaData' not in vardict:
592                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
593                                 return
594                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
595                         if mobj is None:
596                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
597                                 return
598                         mediaURL = mobj.group(1).replace('\\/', '/')
599                         video_extension = mediaURL[-3:]
600                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
601
602                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
603                 if mobj is None:
604                         self._downloader.trouble(u'ERROR: unable to extract title')
605                         return
606                 video_title = mobj.group(1).decode('utf-8')
607
608                 mobj = re.search(r'submitter=(.*?);', webpage)
609                 if mobj is None:
610                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
611                         return
612                 video_uploader = mobj.group(1)
613
614                 return [{
615                         'id':           video_id.decode('utf-8'),
616                         'url':          video_url.decode('utf-8'),
617                         'uploader':     video_uploader.decode('utf-8'),
618                         'upload_date':  u'NA',
619                         'title':        video_title,
620                         'ext':          video_extension.decode('utf-8'),
621                         'format':       u'NA',
622                         'player_url':   None,
623                 }]
624
625
626 class DailymotionIE(InfoExtractor):
627         """Information Extractor for Dailymotion"""
628
629         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
630         IE_NAME = u'dailymotion'
631
632         def __init__(self, downloader=None):
633                 InfoExtractor.__init__(self, downloader)
634
635         def report_download_webpage(self, video_id):
636                 """Report webpage download."""
637                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
638
639         def report_extraction(self, video_id):
640                 """Report information extraction."""
641                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
642
643         def _real_extract(self, url):
644                 # Extract id and simplified title from URL
645                 mobj = re.match(self._VALID_URL, url)
646                 if mobj is None:
647                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
648                         return
649
650                 video_id = mobj.group(1).split('_')[0].split('?')[0]
651
652                 video_extension = 'mp4'
653
654                 # Retrieve video webpage to extract further information
655                 request = urllib2.Request(url)
656                 request.add_header('Cookie', 'family_filter=off')
657                 try:
658                         self.report_download_webpage(video_id)
659                         webpage = urllib2.urlopen(request).read()
660                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
661                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
662                         return
663
664                 # Extract URL, uploader and title from webpage
665                 self.report_extraction(video_id)
666                 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
667                 if mobj is None:
668                         self._downloader.trouble(u'ERROR: unable to extract media URL')
669                         return
670                 flashvars = urllib.unquote(mobj.group(1))
671
672                 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
673                         if key in flashvars:
674                                 max_quality = key
675                                 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
676                                 break
677                 else:
678                         self._downloader.trouble(u'ERROR: unable to extract video URL')
679                         return
680
681                 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
682                 if mobj is None:
683                         self._downloader.trouble(u'ERROR: unable to extract video URL')
684                         return
685
686                 video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')
687
688                 # TODO: support choosing qualities
689
690                 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
691                 if mobj is None:
692                         self._downloader.trouble(u'ERROR: unable to extract title')
693                         return
694                 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
695
696                 video_uploader = u'NA'
697                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
698                 if mobj is None:
699                         # lookin for official user
700                         mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
701                         if mobj_official is None:
702                                 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
703                         else:
704                                 video_uploader = mobj_official.group(1)
705                 else:
706                         video_uploader = mobj.group(1)
707
708                 video_upload_date = u'NA'
709                 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
710                 if mobj is not None:
711                         video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
712
713                 return [{
714                         'id':           video_id.decode('utf-8'),
715                         'url':          video_url.decode('utf-8'),
716                         'uploader':     video_uploader.decode('utf-8'),
717                         'upload_date':  video_upload_date,
718                         'title':        video_title,
719                         'ext':          video_extension.decode('utf-8'),
720                         'format':       u'NA',
721                         'player_url':   None,
722                 }]
723
724
725 class GoogleIE(InfoExtractor):
726         """Information extractor for video.google.com."""
727
728         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
729         IE_NAME = u'video.google'
730
731         def __init__(self, downloader=None):
732                 InfoExtractor.__init__(self, downloader)
733
734         def report_download_webpage(self, video_id):
735                 """Report webpage download."""
736                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
737
738         def report_extraction(self, video_id):
739                 """Report information extraction."""
740                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
741
742         def _real_extract(self, url):
743                 # Extract id from URL
744                 mobj = re.match(self._VALID_URL, url)
745                 if mobj is None:
746                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
747                         return
748
749                 video_id = mobj.group(1)
750
751                 video_extension = 'mp4'
752
753                 # Retrieve video webpage to extract further information
754                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
755                 try:
756                         self.report_download_webpage(video_id)
757                         webpage = urllib2.urlopen(request).read()
758                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
759                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
760                         return
761
762                 # Extract URL, uploader, and title from webpage
763                 self.report_extraction(video_id)
764                 mobj = re.search(r"download_url:'([^']+)'", webpage)
765                 if mobj is None:
766                         video_extension = 'flv'
767                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
768                 if mobj is None:
769                         self._downloader.trouble(u'ERROR: unable to extract media URL')
770                         return
771                 mediaURL = urllib.unquote(mobj.group(1))
772                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
773                 mediaURL = mediaURL.replace('\\x26', '\x26')
774
775                 video_url = mediaURL
776
777                 mobj = re.search(r'<title>(.*)</title>', webpage)
778                 if mobj is None:
779                         self._downloader.trouble(u'ERROR: unable to extract title')
780                         return
781                 video_title = mobj.group(1).decode('utf-8')
782
783                 # Extract video description
784                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
785                 if mobj is None:
786                         self._downloader.trouble(u'ERROR: unable to extract video description')
787                         return
788                 video_description = mobj.group(1).decode('utf-8')
789                 if not video_description:
790                         video_description = 'No description available.'
791
792                 # Extract video thumbnail
793                 if self._downloader.params.get('forcethumbnail', False):
794                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
795                         try:
796                                 webpage = urllib2.urlopen(request).read()
797                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
798                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
799                                 return
800                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
801                         if mobj is None:
802                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
803                                 return
804                         video_thumbnail = mobj.group(1)
805                 else:   # we need something to pass to process_info
806                         video_thumbnail = ''
807
808                 return [{
809                         'id':           video_id.decode('utf-8'),
810                         'url':          video_url.decode('utf-8'),
811                         'uploader':     u'NA',
812                         'upload_date':  u'NA',
813                         'title':        video_title,
814                         'ext':          video_extension.decode('utf-8'),
815                         'format':       u'NA',
816                         'player_url':   None,
817                 }]
818
819
820 class PhotobucketIE(InfoExtractor):
821         """Information extractor for photobucket.com."""
822
823         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
824         IE_NAME = u'photobucket'
825
826         def __init__(self, downloader=None):
827                 InfoExtractor.__init__(self, downloader)
828
829         def report_download_webpage(self, video_id):
830                 """Report webpage download."""
831                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
832
833         def report_extraction(self, video_id):
834                 """Report information extraction."""
835                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
836
837         def _real_extract(self, url):
838                 # Extract id from URL
839                 mobj = re.match(self._VALID_URL, url)
840                 if mobj is None:
841                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
842                         return
843
844                 video_id = mobj.group(1)
845
846                 video_extension = 'flv'
847
848                 # Retrieve video webpage to extract further information
849                 request = urllib2.Request(url)
850                 try:
851                         self.report_download_webpage(video_id)
852                         webpage = urllib2.urlopen(request).read()
853                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
854                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
855                         return
856
857                 # Extract URL, uploader, and title from webpage
858                 self.report_extraction(video_id)
859                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
860                 if mobj is None:
861                         self._downloader.trouble(u'ERROR: unable to extract media URL')
862                         return
863                 mediaURL = urllib.unquote(mobj.group(1))
864
865                 video_url = mediaURL
866
867                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
868                 if mobj is None:
869                         self._downloader.trouble(u'ERROR: unable to extract title')
870                         return
871                 video_title = mobj.group(1).decode('utf-8')
872
873                 video_uploader = mobj.group(2).decode('utf-8')
874
875                 return [{
876                         'id':           video_id.decode('utf-8'),
877                         'url':          video_url.decode('utf-8'),
878                         'uploader':     video_uploader,
879                         'upload_date':  u'NA',
880                         'title':        video_title,
881                         'ext':          video_extension.decode('utf-8'),
882                         'format':       u'NA',
883                         'player_url':   None,
884                 }]
885
886
887 class YahooIE(InfoExtractor):
888         """Information extractor for video.yahoo.com."""
889
890         # _VALID_URL matches all Yahoo! Video URLs
891         # _VPAGE_URL matches only the extractable '/watch/' URLs
892         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
893         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
894         IE_NAME = u'video.yahoo'
895
896         def __init__(self, downloader=None):
897                 InfoExtractor.__init__(self, downloader)
898
899         def report_download_webpage(self, video_id):
900                 """Report webpage download."""
901                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
902
903         def report_extraction(self, video_id):
904                 """Report information extraction."""
905                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
906
907         def _real_extract(self, url, new_video=True):
908                 # Extract ID from URL
909                 mobj = re.match(self._VALID_URL, url)
910                 if mobj is None:
911                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
912                         return
913
914                 video_id = mobj.group(2)
915                 video_extension = 'flv'
916
917                 # Rewrite valid but non-extractable URLs as
918                 # extractable English language /watch/ URLs
919                 if re.match(self._VPAGE_URL, url) is None:
920                         request = urllib2.Request(url)
921                         try:
922                                 webpage = urllib2.urlopen(request).read()
923                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
924                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
925                                 return
926
927                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
928                         if mobj is None:
929                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
930                                 return
931                         yahoo_id = mobj.group(1)
932
933                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
934                         if mobj is None:
935                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
936                                 return
937                         yahoo_vid = mobj.group(1)
938
939                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
940                         return self._real_extract(url, new_video=False)
941
942                 # Retrieve video webpage to extract further information
943                 request = urllib2.Request(url)
944                 try:
945                         self.report_download_webpage(video_id)
946                         webpage = urllib2.urlopen(request).read()
947                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
948                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
949                         return
950
951                 # Extract uploader and title from webpage
952                 self.report_extraction(video_id)
953                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
954                 if mobj is None:
955                         self._downloader.trouble(u'ERROR: unable to extract video title')
956                         return
957                 video_title = mobj.group(1).decode('utf-8')
958
959                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
960                 if mobj is None:
961                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
962                         return
963                 video_uploader = mobj.group(1).decode('utf-8')
964
965                 # Extract video thumbnail
966                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
967                 if mobj is None:
968                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
969                         return
970                 video_thumbnail = mobj.group(1).decode('utf-8')
971
972                 # Extract video description
973                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
974                 if mobj is None:
975                         self._downloader.trouble(u'ERROR: unable to extract video description')
976                         return
977                 video_description = mobj.group(1).decode('utf-8')
978                 if not video_description:
979                         video_description = 'No description available.'
980
981                 # Extract video height and width
982                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
983                 if mobj is None:
984                         self._downloader.trouble(u'ERROR: unable to extract video height')
985                         return
986                 yv_video_height = mobj.group(1)
987
988                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
989                 if mobj is None:
990                         self._downloader.trouble(u'ERROR: unable to extract video width')
991                         return
992                 yv_video_width = mobj.group(1)
993
994                 # Retrieve video playlist to extract media URL
995                 # I'm not completely sure what all these options are, but we
996                 # seem to need most of them, otherwise the server sends a 401.
997                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
998                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
999                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1000                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1001                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1002                 try:
1003                         self.report_download_webpage(video_id)
1004                         webpage = urllib2.urlopen(request).read()
1005                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1006                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1007                         return
1008
1009                 # Extract media URL from playlist XML
1010                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1011                 if mobj is None:
1012                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1013                         return
1014                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1015                 video_url = unescapeHTML(video_url)
1016
1017                 return [{
1018                         'id':           video_id.decode('utf-8'),
1019                         'url':          video_url,
1020                         'uploader':     video_uploader,
1021                         'upload_date':  u'NA',
1022                         'title':        video_title,
1023                         'ext':          video_extension.decode('utf-8'),
1024                         'thumbnail':    video_thumbnail.decode('utf-8'),
1025                         'description':  video_description,
1026                         'thumbnail':    video_thumbnail,
1027                         'player_url':   None,
1028                 }]
1029
1030
1031 class VimeoIE(InfoExtractor):
1032         """Information extractor for vimeo.com."""
1033
1034         # _VALID_URL matches Vimeo URLs
1035         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1036         IE_NAME = u'vimeo'
1037
1038         def __init__(self, downloader=None):
1039                 InfoExtractor.__init__(self, downloader)
1040
1041         def report_download_webpage(self, video_id):
1042                 """Report webpage download."""
1043                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1044
1045         def report_extraction(self, video_id):
1046                 """Report information extraction."""
1047                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1048
1049         def _real_extract(self, url, new_video=True):
1050                 # Extract ID from URL
1051                 mobj = re.match(self._VALID_URL, url)
1052                 if mobj is None:
1053                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1054                         return
1055
1056                 video_id = mobj.group(1)
1057
1058                 # Retrieve video webpage to extract further information
1059                 request = urllib2.Request(url, None, std_headers)
1060                 try:
1061                         self.report_download_webpage(video_id)
1062                         webpage = urllib2.urlopen(request).read()
1063                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1064                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1065                         return
1066
1067                 # Now we begin extracting as much information as we can from what we
1068                 # retrieved. First we extract the information common to all extractors,
1069                 # and latter we extract those that are Vimeo specific.
1070                 self.report_extraction(video_id)
1071
1072                 # Extract the config JSON
1073                 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1074                 try:
1075                         config = json.loads(config)
1076                 except:
1077                         self._downloader.trouble(u'ERROR: unable to extract info section')
1078                         return
1079                 
1080                 # Extract title
1081                 video_title = config["video"]["title"]
1082
1083                 # Extract uploader
1084                 video_uploader = config["video"]["owner"]["name"]
1085
1086                 # Extract video thumbnail
1087                 video_thumbnail = config["video"]["thumbnail"]
1088
1089                 # Extract video description
1090                 video_description = get_element_by_id("description", webpage.decode('utf8'))
1091                 if video_description: video_description = clean_html(video_description)
1092                 else: video_description = ''
1093
1094                 # Extract upload date
1095                 video_upload_date = u'NA'
1096                 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1097                 if mobj is not None:
1098                         video_upload_date = mobj.group(1)
1099
1100                 # Vimeo specific: extract request signature and timestamp
1101                 sig = config['request']['signature']
1102                 timestamp = config['request']['timestamp']
1103
1104                 # Vimeo specific: extract video codec and quality information
1105                 # TODO bind to format param
1106                 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1107                 for codec in codecs:
1108                         if codec[0] in config["video"]["files"]:
1109                                 video_codec = codec[0]
1110                                 video_extension = codec[1]
1111                                 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1112                                 else: quality = 'sd'
1113                                 break
1114                 else:
1115                         self._downloader.trouble(u'ERROR: no known codec found')
1116                         return
1117
1118                 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1119                                         %(video_id, sig, timestamp, quality, video_codec.upper())
1120
1121                 return [{
1122                         'id':           video_id,
1123                         'url':          video_url,
1124                         'uploader':     video_uploader,
1125                         'upload_date':  video_upload_date,
1126                         'title':        video_title,
1127                         'ext':          video_extension,
1128                         'thumbnail':    video_thumbnail,
1129                         'description':  video_description,
1130                         'player_url':   None,
1131                 }]
1132
1133
1134 class GenericIE(InfoExtractor):
1135         """Generic last-resort information extractor."""
1136
1137         _VALID_URL = r'.*'
1138         IE_NAME = u'generic'
1139
1140         def __init__(self, downloader=None):
1141                 InfoExtractor.__init__(self, downloader)
1142
1143         def report_download_webpage(self, video_id):
1144                 """Report webpage download."""
1145                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1146                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1147
1148         def report_extraction(self, video_id):
1149                 """Report information extraction."""
1150                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1151
1152         def report_following_redirect(self, new_url):
1153                 """Report information extraction."""
1154                 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1155                 
	def _test_redirect(self, url):
		"""Check if it is a redirect, like url shorteners, in case restart chain."""
		# Use HEAD requests throughout so the final URL can be discovered
		# without downloading any response body.
		class HeadRequest(urllib2.Request):
			def get_method(self):
				return "HEAD"

		class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
			"""
			Subclass the HTTPRedirectHandler to make it use our
			HeadRequest also on the redirected URL
			"""
			def redirect_request(self, req, fp, code, msg, headers, newurl):
				if code in (301, 302, 303, 307):
					# Some servers emit unencoded spaces in Location headers.
					newurl = newurl.replace(' ', '%20')
					# Drop entity headers: a HEAD request carries no body.
					newheaders = dict((k,v) for k,v in req.headers.items()
									  if k.lower() not in ("content-length", "content-type"))
					return HeadRequest(newurl,
									   headers=newheaders,
									   origin_req_host=req.get_origin_req_host(),
									   unverifiable=True)
				else:
					# Any other status is surfaced as an HTTP error.
					raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)

		class HTTPMethodFallback(urllib2.BaseHandler):
			"""
			Fallback to GET if HEAD is not allowed (405 HTTP error)
			"""
			def http_error_405(self, req, fp, code, msg, headers):
				# Drain and close the failed response before re-issuing.
				fp.read()
				fp.close()

				newheaders = dict((k,v) for k,v in req.headers.items()
								  if k.lower() not in ("content-length", "content-type"))
				# Retry the same URL as a plain GET through the parent opener.
				return self.parent.open(urllib2.Request(req.get_full_url(),
												 headers=newheaders,
												 origin_req_host=req.get_origin_req_host(),
												 unverifiable=True))

		# Build our opener by hand so our HEAD-aware redirect handler is used
		# instead of urllib2's default redirect handling.
		opener = urllib2.OpenerDirector()
		for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
						HTTPMethodFallback, HEADRedirectHandler,
						urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
			opener.add_handler(handler())

		response = opener.open(HeadRequest(url))
		new_url = response.geturl()

		# No redirect happened: tell the caller to extract from url itself.
		if url == new_url: return False

		# Restart the whole extraction chain on the redirect target.
		self.report_following_redirect(new_url)
		self._downloader.download([new_url])
		return True
1209
1210         def _real_extract(self, url):
1211                 if self._test_redirect(url): return
1212
1213                 video_id = url.split('/')[-1]
1214                 request = urllib2.Request(url)
1215                 try:
1216                         self.report_download_webpage(video_id)
1217                         webpage = urllib2.urlopen(request).read()
1218                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1219                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1220                         return
1221                 except ValueError, err:
1222                         # since this is the last-resort InfoExtractor, if
1223                         # this error is thrown, it'll be thrown here
1224                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1225                         return
1226
1227                 self.report_extraction(video_id)
1228                 # Start with something easy: JW Player in SWFObject
1229                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1230                 if mobj is None:
1231                         # Broaden the search a little bit
1232                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1233                 if mobj is None:
1234                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1235                         return
1236
1237                 # It's possible that one of the regexes
1238                 # matched, but returned an empty group:
1239                 if mobj.group(1) is None:
1240                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1241                         return
1242
1243                 video_url = urllib.unquote(mobj.group(1))
1244                 video_id = os.path.basename(video_url)
1245
1246                 # here's a fun little line of code for you:
1247                 video_extension = os.path.splitext(video_id)[1][1:]
1248                 video_id = os.path.splitext(video_id)[0]
1249
1250                 # it's tempting to parse this further, but you would
1251                 # have to take into account all the variations like
1252                 #   Video Title - Site Name
1253                 #   Site Name | Video Title
1254                 #   Video Title - Tagline | Site Name
1255                 # and so on and so forth; it's just not practical
1256                 mobj = re.search(r'<title>(.*)</title>', webpage)
1257                 if mobj is None:
1258                         self._downloader.trouble(u'ERROR: unable to extract title')
1259                         return
1260                 video_title = mobj.group(1).decode('utf-8')
1261
1262                 # video uploader is domain name
1263                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1264                 if mobj is None:
1265                         self._downloader.trouble(u'ERROR: unable to extract title')
1266                         return
1267                 video_uploader = mobj.group(1).decode('utf-8')
1268
1269                 return [{
1270                         'id':           video_id.decode('utf-8'),
1271                         'url':          video_url.decode('utf-8'),
1272                         'uploader':     video_uploader,
1273                         'upload_date':  u'NA',
1274                         'title':        video_title,
1275                         'ext':          video_extension.decode('utf-8'),
1276                         'format':       u'NA',
1277                         'player_url':   None,
1278                 }]
1279
1280
1281 class YoutubeSearchIE(InfoExtractor):
1282         """Information Extractor for YouTube search queries."""
1283         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1284         _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1285         _max_youtube_results = 1000
1286         IE_NAME = u'youtube:search'
1287
1288         def __init__(self, downloader=None):
1289                 InfoExtractor.__init__(self, downloader)
1290
1291         def report_download_page(self, query, pagenum):
1292                 """Report attempt to download search page with given number."""
1293                 query = query.decode(preferredencoding())
1294                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1295
1296         def _real_extract(self, query):
1297                 mobj = re.match(self._VALID_URL, query)
1298                 if mobj is None:
1299                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1300                         return
1301
1302                 prefix, query = query.split(':')
1303                 prefix = prefix[8:]
1304                 query = query.encode('utf-8')
1305                 if prefix == '':
1306                         self._download_n_results(query, 1)
1307                         return
1308                 elif prefix == 'all':
1309                         self._download_n_results(query, self._max_youtube_results)
1310                         return
1311                 else:
1312                         try:
1313                                 n = long(prefix)
1314                                 if n <= 0:
1315                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1316                                         return
1317                                 elif n > self._max_youtube_results:
1318                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1319                                         n = self._max_youtube_results
1320                                 self._download_n_results(query, n)
1321                                 return
1322                         except ValueError: # parsing prefix as integer fails
1323                                 self._download_n_results(query, 1)
1324                                 return
1325
1326         def _download_n_results(self, query, n):
1327                 """Downloads a specified number of results for a query"""
1328
1329                 video_ids = []
1330                 pagenum = 0
1331                 limit = n
1332
1333                 while (50 * pagenum) < limit:
1334                         self.report_download_page(query, pagenum+1)
1335                         result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1336                         request = urllib2.Request(result_url)
1337                         try:
1338                                 data = urllib2.urlopen(request).read()
1339                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1340                                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1341                                 return
1342                         api_response = json.loads(data)['data']
1343
1344                         new_ids = list(video['id'] for video in api_response['items'])
1345                         video_ids += new_ids
1346
1347                         limit = min(n, api_response['totalItems'])
1348                         pagenum += 1
1349
1350                 if len(video_ids) > n:
1351                         video_ids = video_ids[:n]
1352                 for id in video_ids:
1353                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1354                 return
1355
1356
1357 class GoogleSearchIE(InfoExtractor):
1358         """Information Extractor for Google Video search queries."""
1359         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1360         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1361         _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1362         _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1363         _max_google_results = 1000
1364         IE_NAME = u'video.google:search'
1365
1366         def __init__(self, downloader=None):
1367                 InfoExtractor.__init__(self, downloader)
1368
1369         def report_download_page(self, query, pagenum):
1370                 """Report attempt to download playlist page with given number."""
1371                 query = query.decode(preferredencoding())
1372                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1373
1374         def _real_extract(self, query):
1375                 mobj = re.match(self._VALID_URL, query)
1376                 if mobj is None:
1377                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1378                         return
1379
1380                 prefix, query = query.split(':')
1381                 prefix = prefix[8:]
1382                 query = query.encode('utf-8')
1383                 if prefix == '':
1384                         self._download_n_results(query, 1)
1385                         return
1386                 elif prefix == 'all':
1387                         self._download_n_results(query, self._max_google_results)
1388                         return
1389                 else:
1390                         try:
1391                                 n = long(prefix)
1392                                 if n <= 0:
1393                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1394                                         return
1395                                 elif n > self._max_google_results:
1396                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1397                                         n = self._max_google_results
1398                                 self._download_n_results(query, n)
1399                                 return
1400                         except ValueError: # parsing prefix as integer fails
1401                                 self._download_n_results(query, 1)
1402                                 return
1403
1404         def _download_n_results(self, query, n):
1405                 """Downloads a specified number of results for a query"""
1406
1407                 video_ids = []
1408                 pagenum = 0
1409
1410                 while True:
1411                         self.report_download_page(query, pagenum)
1412                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1413                         request = urllib2.Request(result_url)
1414                         try:
1415                                 page = urllib2.urlopen(request).read()
1416                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1417                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1418                                 return
1419
1420                         # Extract video identifiers
1421                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1422                                 video_id = mobj.group(1)
1423                                 if video_id not in video_ids:
1424                                         video_ids.append(video_id)
1425                                         if len(video_ids) == n:
1426                                                 # Specified n videos reached
1427                                                 for id in video_ids:
1428                                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1429                                                 return
1430
1431                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1432                                 for id in video_ids:
1433                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1434                                 return
1435
1436                         pagenum = pagenum + 1
1437
1438
1439 class YahooSearchIE(InfoExtractor):
1440         """Information Extractor for Yahoo! Video search queries."""
1441         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1442         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1443         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1444         _MORE_PAGES_INDICATOR = r'\s*Next'
1445         _max_yahoo_results = 1000
1446         IE_NAME = u'video.yahoo:search'
1447
1448         def __init__(self, downloader=None):
1449                 InfoExtractor.__init__(self, downloader)
1450
1451         def report_download_page(self, query, pagenum):
1452                 """Report attempt to download playlist page with given number."""
1453                 query = query.decode(preferredencoding())
1454                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1455
1456         def _real_extract(self, query):
1457                 mobj = re.match(self._VALID_URL, query)
1458                 if mobj is None:
1459                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1460                         return
1461
1462                 prefix, query = query.split(':')
1463                 prefix = prefix[8:]
1464                 query = query.encode('utf-8')
1465                 if prefix == '':
1466                         self._download_n_results(query, 1)
1467                         return
1468                 elif prefix == 'all':
1469                         self._download_n_results(query, self._max_yahoo_results)
1470                         return
1471                 else:
1472                         try:
1473                                 n = long(prefix)
1474                                 if n <= 0:
1475                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1476                                         return
1477                                 elif n > self._max_yahoo_results:
1478                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1479                                         n = self._max_yahoo_results
1480                                 self._download_n_results(query, n)
1481                                 return
1482                         except ValueError: # parsing prefix as integer fails
1483                                 self._download_n_results(query, 1)
1484                                 return
1485
1486         def _download_n_results(self, query, n):
1487                 """Downloads a specified number of results for a query"""
1488
1489                 video_ids = []
1490                 already_seen = set()
1491                 pagenum = 1
1492
1493                 while True:
1494                         self.report_download_page(query, pagenum)
1495                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1496                         request = urllib2.Request(result_url)
1497                         try:
1498                                 page = urllib2.urlopen(request).read()
1499                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1500                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1501                                 return
1502
1503                         # Extract video identifiers
1504                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1505                                 video_id = mobj.group(1)
1506                                 if video_id not in already_seen:
1507                                         video_ids.append(video_id)
1508                                         already_seen.add(video_id)
1509                                         if len(video_ids) == n:
1510                                                 # Specified n videos reached
1511                                                 for id in video_ids:
1512                                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1513                                                 return
1514
1515                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1516                                 for id in video_ids:
1517                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1518                                 return
1519
1520                         pagenum = pagenum + 1
1521
1522
class YoutubePlaylistIE(InfoExtractor):
	"""Information Extractor for YouTube playlists."""

	# group(1): playlist-type prefix (p/a/list), group(2): playlist id,
	# group(3): optional trailing single-video id.
	_VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
	_TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
	_VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
	_MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
	IE_NAME = u'youtube:playlist'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_page(self, playlist_id, pagenum):
		"""Report attempt to download playlist page with given number."""
		self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

	def _real_extract(self, url):
		"""Expand a playlist URL into per-video download requests."""
		# Extract playlist id
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
			return

		# Single video case: the URL carried a specific video id after the
		# playlist part; hand it straight back to the downloader.
		if mobj.group(3) is not None:
			self._downloader.download([mobj.group(3)])
			return

		# Download playlist pages
		# prefix is 'p' as default for playlists but there are other types that need extra care
		playlist_prefix = mobj.group(1)
		if playlist_prefix == 'a':
			playlist_access = 'artist'
		else:
			# Anything that is not an artist page is fetched as a plain
			# view_play_list with the 'p' parameter.
			playlist_prefix = 'p'
			playlist_access = 'view_play_list'
		playlist_id = mobj.group(2)
		video_ids = []
		pagenum = 1

		while True:
			self.report_download_page(playlist_id, pagenum)
			url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
			request = urllib2.Request(url)
			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers (deduplicated within each page)
			ids_in_page = []
			for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(mobj.group(1))
			video_ids.extend(ids_in_page)

			# Stop once the pager no longer offers a "next" link.
			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				break
			pagenum = pagenum + 1

		# Apply the playliststart/playlistend slice (presumably set from the
		# --playlist-start/--playlist-end options; 1-based -- verify in the
		# FileDownloader).
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)
		if playlistend == -1:
			video_ids = video_ids[playliststart:]
		else:
			video_ids = video_ids[playliststart:playlistend]

		for id in video_ids:
			self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
		return
1594
1595
1596 class YoutubeChannelIE(InfoExtractor):
1597         """Information Extractor for YouTube channels."""
1598
1599         _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1600         _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1601         _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
1602         IE_NAME = u'youtube:channel'
1603
1604         def report_download_page(self, channel_id, pagenum):
1605                 """Report attempt to download channel page with given number."""
1606                 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1607
1608         def _real_extract(self, url):
1609                 # Extract channel id
1610                 mobj = re.match(self._VALID_URL, url)
1611                 if mobj is None:
1612                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1613                         return
1614
1615                 # Download channel pages
1616                 channel_id = mobj.group(1)
1617                 video_ids = []
1618                 pagenum = 1
1619
1620                 while True:
1621                         self.report_download_page(channel_id, pagenum)
1622                         url = self._TEMPLATE_URL % (channel_id, pagenum)
1623                         request = urllib2.Request(url)
1624                         try:
1625                                 page = urllib2.urlopen(request).read()
1626                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1627                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1628                                 return
1629
1630                         # Extract video identifiers
1631                         ids_in_page = []
1632                         for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1633                                 if mobj.group(1) not in ids_in_page:
1634                                         ids_in_page.append(mobj.group(1))
1635                         video_ids.extend(ids_in_page)
1636
1637                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1638                                 break
1639                         pagenum = pagenum + 1
1640
1641                 for id in video_ids:
1642                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1643                 return
1644
1645
class YoutubeUserIE(InfoExtractor):
	"""Information Extractor for YouTube users."""

	_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
	_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
	# Maximum number of results the GData uploads feed returns per request.
	_GDATA_PAGE_SIZE = 50
	_GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
	_VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
	IE_NAME = u'youtube:user'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_page(self, username, start_index):
		"""Report attempt to download user page."""
		self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
				(username, start_index, start_index + self._GDATA_PAGE_SIZE))

	def _real_extract(self, url):
		"""Collect all of a user's uploads via the GData API and queue them."""
		# Extract username
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
			return

		username = mobj.group(1)

		# Download video ids using YouTube Data API. Result size per
		# query is limited (currently to 50 videos) so we need to query
		# page by page until there are no video ids - it means we got
		# all of them.

		video_ids = []
		pagenum = 0

		while True:
			# GData start-index is 1-based.
			start_index = pagenum * self._GDATA_PAGE_SIZE + 1
			self.report_download_page(username, start_index)

			request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers (deduplicated within each page)
			ids_in_page = []

			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(mobj.group(1))

			video_ids.extend(ids_in_page)

			# A little optimization - if current page is not
			# "full", ie. does not contain PAGE_SIZE video ids then
			# we can assume that this page is the last one - there
			# are no more ids on further pages - no need to query
			# again.

			if len(ids_in_page) < self._GDATA_PAGE_SIZE:
				break

			pagenum += 1

		all_ids_count = len(video_ids)
		# Apply the playliststart/playlistend slice (presumably set from the
		# --playlist-start/--playlist-end options; 1-based -- verify in the
		# FileDownloader).
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)

		if playlistend == -1:
			video_ids = video_ids[playliststart:]
		else:
			video_ids = video_ids[playliststart:playlistend]

		self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
				(username, all_ids_count, len(video_ids)))

		for video_id in video_ids:
			self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1728
1729 class BlipTVUserIE(InfoExtractor):
1730         """Information Extractor for blip.tv users."""
1731
1732         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1733         _PAGE_SIZE = 12
1734         IE_NAME = u'blip.tv:user'
1735
1736         def __init__(self, downloader=None):
1737                 InfoExtractor.__init__(self, downloader)
1738
1739         def report_download_page(self, username, pagenum):
1740                 """Report attempt to download user page."""
1741                 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1742                                 (self.IE_NAME, username, pagenum))
1743
1744         def _real_extract(self, url):
1745                 # Extract username
1746                 mobj = re.match(self._VALID_URL, url)
1747                 if mobj is None:
1748                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1749                         return
1750
1751                 username = mobj.group(1)
1752
1753                 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1754
1755                 request = urllib2.Request(url)
1756
1757                 try:
1758                         page = urllib2.urlopen(request).read().decode('utf-8')
1759                         mobj = re.search(r'data-users-id="([^"]+)"', page)
1760                         page_base = page_base % mobj.group(1)
1761                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1762                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1763                         return
1764
1765
1766                 # Download video ids using BlipTV Ajax calls. Result size per
1767                 # query is limited (currently to 12 videos) so we need to query
1768                 # page by page until there are no video ids - it means we got
1769                 # all of them.
1770
1771                 video_ids = []
1772                 pagenum = 1
1773
1774                 while True:
1775                         self.report_download_page(username, pagenum)
1776
1777                         request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1778
1779                         try:
1780                                 page = urllib2.urlopen(request).read().decode('utf-8')
1781                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1782                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1783                                 return
1784
1785                         # Extract video identifiers
1786                         ids_in_page = []
1787
1788                         for mobj in re.finditer(r'href="/([^"]+)"', page):
1789                                 if mobj.group(1) not in ids_in_page:
1790                                         ids_in_page.append(unescapeHTML(mobj.group(1)))
1791
1792                         video_ids.extend(ids_in_page)
1793
1794                         # A little optimization - if current page is not
1795                         # "full", ie. does not contain PAGE_SIZE video ids then
1796                         # we can assume that this page is the last one - there
1797                         # are no more ids on further pages - no need to query
1798                         # again.
1799
1800                         if len(ids_in_page) < self._PAGE_SIZE:
1801                                 break
1802
1803                         pagenum += 1
1804
1805                 all_ids_count = len(video_ids)
1806                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1807                 playlistend = self._downloader.params.get('playlistend', -1)
1808
1809                 if playlistend == -1:
1810                         video_ids = video_ids[playliststart:]
1811                 else:
1812                         video_ids = video_ids[playliststart:playlistend]
1813
1814                 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1815                                 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1816
1817                 for video_id in video_ids:
1818                         self._downloader.download([u'http://blip.tv/'+video_id])
1819
1820
1821 class DepositFilesIE(InfoExtractor):
1822         """Information extractor for depositfiles.com"""
1823
1824         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1825         IE_NAME = u'DepositFiles'
1826
1827         def __init__(self, downloader=None):
1828                 InfoExtractor.__init__(self, downloader)
1829
1830         def report_download_webpage(self, file_id):
1831                 """Report webpage download."""
1832                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1833
1834         def report_extraction(self, file_id):
1835                 """Report information extraction."""
1836                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1837
1838         def _real_extract(self, url):
1839                 file_id = url.split('/')[-1]
1840                 # Rebuild url in english locale
1841                 url = 'http://depositfiles.com/en/files/' + file_id
1842
1843                 # Retrieve file webpage with 'Free download' button pressed
1844                 free_download_indication = { 'gateway_result' : '1' }
1845                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1846                 try:
1847                         self.report_download_webpage(file_id)
1848                         webpage = urllib2.urlopen(request).read()
1849                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1850                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1851                         return
1852
1853                 # Search for the real file URL
1854                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1855                 if (mobj is None) or (mobj.group(1) is None):
1856                         # Try to figure out reason of the error.
1857                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1858                         if (mobj is not None) and (mobj.group(1) is not None):
1859                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1860                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1861                         else:
1862                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1863                         return
1864
1865                 file_url = mobj.group(1)
1866                 file_extension = os.path.splitext(file_url)[1][1:]
1867
1868                 # Search for file title
1869                 mobj = re.search(r'<b title="(.*?)">', webpage)
1870                 if mobj is None:
1871                         self._downloader.trouble(u'ERROR: unable to extract title')
1872                         return
1873                 file_title = mobj.group(1).decode('utf-8')
1874
1875                 return [{
1876                         'id':           file_id.decode('utf-8'),
1877                         'url':          file_url.decode('utf-8'),
1878                         'uploader':     u'NA',
1879                         'upload_date':  u'NA',
1880                         'title':        file_title,
1881                         'ext':          file_extension.decode('utf-8'),
1882                         'format':       u'NA',
1883                         'player_url':   None,
1884                 }]
1885
1886
1887 class FacebookIE(InfoExtractor):
1888         """Information Extractor for Facebook"""
1889
1890         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1891         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1892         _NETRC_MACHINE = 'facebook'
1893         _available_formats = ['video', 'highqual', 'lowqual']
1894         _video_extensions = {
1895                 'video': 'mp4',
1896                 'highqual': 'mp4',
1897                 'lowqual': 'mp4',
1898         }
1899         IE_NAME = u'facebook'
1900
1901         def __init__(self, downloader=None):
1902                 InfoExtractor.__init__(self, downloader)
1903
1904         def _reporter(self, message):
1905                 """Add header and report message."""
1906                 self._downloader.to_screen(u'[facebook] %s' % message)
1907
1908         def report_login(self):
1909                 """Report attempt to log in."""
1910                 self._reporter(u'Logging in')
1911
1912         def report_video_webpage_download(self, video_id):
1913                 """Report attempt to download video webpage."""
1914                 self._reporter(u'%s: Downloading video webpage' % video_id)
1915
1916         def report_information_extraction(self, video_id):
1917                 """Report attempt to extract video information."""
1918                 self._reporter(u'%s: Extracting video information' % video_id)
1919
1920         def _parse_page(self, video_webpage):
1921                 """Extract video information from page"""
1922                 # General data
1923                 data = {'title': r'\("video_title", "(.*?)"\)',
1924                         'description': r'<div class="datawrap">(.*?)</div>',
1925                         'owner': r'\("video_owner_name", "(.*?)"\)',
1926                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1927                         }
1928                 video_info = {}
1929                 for piece in data.keys():
1930                         mobj = re.search(data[piece], video_webpage)
1931                         if mobj is not None:
1932                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1933
1934                 # Video urls
1935                 video_urls = {}
1936                 for fmt in self._available_formats:
1937                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1938                         if mobj is not None:
1939                                 # URL is in a Javascript segment inside an escaped Unicode format within
1940                                 # the generally utf-8 page
1941                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1942                 video_info['video_urls'] = video_urls
1943
1944                 return video_info
1945
1946         def _real_initialize(self):
1947                 if self._downloader is None:
1948                         return
1949
1950                 useremail = None
1951                 password = None
1952                 downloader_params = self._downloader.params
1953
1954                 # Attempt to use provided username and password or .netrc data
1955                 if downloader_params.get('username', None) is not None:
1956                         useremail = downloader_params['username']
1957                         password = downloader_params['password']
1958                 elif downloader_params.get('usenetrc', False):
1959                         try:
1960                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1961                                 if info is not None:
1962                                         useremail = info[0]
1963                                         password = info[2]
1964                                 else:
1965                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1966                         except (IOError, netrc.NetrcParseError), err:
1967                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1968                                 return
1969
1970                 if useremail is None:
1971                         return
1972
1973                 # Log in
1974                 login_form = {
1975                         'email': useremail,
1976                         'pass': password,
1977                         'login': 'Log+In'
1978                         }
1979                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1980                 try:
1981                         self.report_login()
1982                         login_results = urllib2.urlopen(request).read()
1983                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1984                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1985                                 return
1986                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1987                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1988                         return
1989
1990         def _real_extract(self, url):
1991                 mobj = re.match(self._VALID_URL, url)
1992                 if mobj is None:
1993                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1994                         return
1995                 video_id = mobj.group('ID')
1996
1997                 # Get video webpage
1998                 self.report_video_webpage_download(video_id)
1999                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2000                 try:
2001                         page = urllib2.urlopen(request)
2002                         video_webpage = page.read()
2003                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2004                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2005                         return
2006
2007                 # Start extracting information
2008                 self.report_information_extraction(video_id)
2009
2010                 # Extract information
2011                 video_info = self._parse_page(video_webpage)
2012
2013                 # uploader
2014                 if 'owner' not in video_info:
2015                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2016                         return
2017                 video_uploader = video_info['owner']
2018
2019                 # title
2020                 if 'title' not in video_info:
2021                         self._downloader.trouble(u'ERROR: unable to extract video title')
2022                         return
2023                 video_title = video_info['title']
2024                 video_title = video_title.decode('utf-8')
2025
2026                 # thumbnail image
2027                 if 'thumbnail' not in video_info:
2028                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2029                         video_thumbnail = ''
2030                 else:
2031                         video_thumbnail = video_info['thumbnail']
2032
2033                 # upload date
2034                 upload_date = u'NA'
2035                 if 'upload_date' in video_info:
2036                         upload_time = video_info['upload_date']
2037                         timetuple = email.utils.parsedate_tz(upload_time)
2038                         if timetuple is not None:
2039                                 try:
2040                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2041                                 except:
2042                                         pass
2043
2044                 # description
2045                 video_description = video_info.get('description', 'No description available.')
2046
2047                 url_map = video_info['video_urls']
2048                 if len(url_map.keys()) > 0:
2049                         # Decide which formats to download
2050                         req_format = self._downloader.params.get('format', None)
2051                         format_limit = self._downloader.params.get('format_limit', None)
2052
2053                         if format_limit is not None and format_limit in self._available_formats:
2054                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2055                         else:
2056                                 format_list = self._available_formats
2057                         existing_formats = [x for x in format_list if x in url_map]
2058                         if len(existing_formats) == 0:
2059                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2060                                 return
2061                         if req_format is None:
2062                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2063                         elif req_format == 'worst':
2064                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2065                         elif req_format == '-1':
2066                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2067                         else:
2068                                 # Specific format
2069                                 if req_format not in url_map:
2070                                         self._downloader.trouble(u'ERROR: requested format not available')
2071                                         return
2072                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2073
2074                 results = []
2075                 for format_param, video_real_url in video_url_list:
2076                         # Extension
2077                         video_extension = self._video_extensions.get(format_param, 'mp4')
2078
2079                         results.append({
2080                                 'id':           video_id.decode('utf-8'),
2081                                 'url':          video_real_url.decode('utf-8'),
2082                                 'uploader':     video_uploader.decode('utf-8'),
2083                                 'upload_date':  upload_date,
2084                                 'title':        video_title,
2085                                 'ext':          video_extension.decode('utf-8'),
2086                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2087                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2088                                 'description':  video_description.decode('utf-8'),
2089                                 'player_url':   None,
2090                         })
2091                 return results
2092
class BlipTVIE(InfoExtractor):
	"""Information extractor for blip.tv"""

	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
	# Used to pull the filename extension off the media URL.
	_URL_EXT = r'^.*\.([a-z0-9]+)$'
	IE_NAME = u'blip.tv'

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

	def report_direct_download(self, title):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

	def _real_extract(self, url):
		"""Return a one-element list with the info dict for *url*.

		First requests the URL with JSON-API parameters appended; if the
		server answers with a video/* Content-Type, the URL is treated as
		a direct media link, otherwise the JSON body is parsed for the
		episode metadata.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Append the JSON skin parameters with the correct separator.
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url.encode('utf-8'))
		self.report_extraction(mobj.group(1))
		# info stays None unless the direct-download branch fills it in.
		info = None
		try:
			urlh = urllib2.urlopen(request)
			if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
				basename = url.split('/')[-1]
				title,ext = os.path.splitext(basename)
				title = title.decode('UTF-8')
				ext = ext.replace('.', '')
				self.report_direct_download(title)
				info = {
					'id': title,
					'url': url,
					'title': title,
					'ext': ext,
					# Hand the already-open response to the downloader.
					'urlhandle': urlh
				}
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
			return
		if info is None: # Regular URL
			try:
				# Reuses the response object opened above; this point is
				# only reached when urlopen() succeeded.
				json_code = urlh.read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
				return

			try:
				# NOTE(review): 'json' is not imported at the top of this
				# chunk - presumably provided via 'from utils import *';
				# confirm against the full file.
				json_data = json.loads(json_code)
				if 'Post' in json_data:
					data = json_data['Post']
				else:
					data = json_data

				# NOTE(review): '%H' (24h) combined with '%p' looks odd but
				# matches whatever blip.tv emits in 'datestamp' - confirm
				# before changing.
				upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
				video_url = data['media']['url']
				umobj = re.match(self._URL_EXT, video_url)
				if umobj is None:
					raise ValueError('Can not determine filename extension')
				ext = umobj.group(1)

				info = {
					'id': data['item_id'],
					'url': video_url,
					'uploader': data['display_name'],
					'upload_date': upload_date,
					'title': data['title'],
					'ext': ext,
					'format': data['media']['mimeType'],
					'thumbnail': data['thumbnailUrl'],
					'description': data['description'],
					'player_url': data['embedUrl']
				}
			except (ValueError,KeyError), err:
				self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
				return

		# Spoof the UA for the actual media download; std_headers
		# presumably comes from 'from utils import *' - TODO confirm.
		std_headers['User-Agent'] = 'iTunes/10.6.1'
		return [info]
2179
2180
2181 class MyVideoIE(InfoExtractor):
2182         """Information Extractor for myvideo.de."""
2183
2184         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2185         IE_NAME = u'myvideo'
2186
2187         def __init__(self, downloader=None):
2188                 InfoExtractor.__init__(self, downloader)
2189         
2190         def report_download_webpage(self, video_id):
2191                 """Report webpage download."""
2192                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2193
2194         def report_extraction(self, video_id):
2195                 """Report information extraction."""
2196                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2197
2198         def _real_extract(self,url):
2199                 mobj = re.match(self._VALID_URL, url)
2200                 if mobj is None:
2201                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2202                         return
2203
2204                 video_id = mobj.group(1)
2205
2206                 # Get video webpage
2207                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2208                 try:
2209                         self.report_download_webpage(video_id)
2210                         webpage = urllib2.urlopen(request).read()
2211                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2212                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2213                         return
2214
2215                 self.report_extraction(video_id)
2216                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2217                                  webpage)
2218                 if mobj is None:
2219                         self._downloader.trouble(u'ERROR: unable to extract media URL')
2220                         return
2221                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2222
2223                 mobj = re.search('<title>([^<]+)</title>', webpage)
2224                 if mobj is None:
2225                         self._downloader.trouble(u'ERROR: unable to extract title')
2226                         return
2227
2228                 video_title = mobj.group(1)
2229
2230                 return [{
2231                         'id':           video_id,
2232                         'url':          video_url,
2233                         'uploader':     u'NA',
2234                         'upload_date':  u'NA',
2235                         'title':        video_title,
2236                         'ext':          u'flv',
2237                         'format':       u'NA',
2238                         'player_url':   None,
2239                 }]
2240
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Either a shortname form (':tds', ':colbert', ...) or a
	# full-episodes URL on thedailyshow.com / colbertnation.com.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report start of information extraction."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report download of the per-media configuration XML."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report download of the episode MRSS index."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report resolution of the SWF player URL."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _real_extract(self, url):
		"""Return a list of info dicts, one per media item in the episode.

		Flow: resolve shortnames to a show URL, follow the redirect for
		"newest episode" requests, find the Flash URL in the page, fetch
		the MRSS index for that URI, then fetch a config XML per item to
		pick a rendition.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('shortname'):
			# Map the shortname to the show's full-episodes page and
			# re-match so the named groups below are populated.
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = u'http://www.thedailyshow.com/full-episodes/'
			else:
				url = u'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# No episode in the URL means "download the newest episode".
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# The full-episodes page redirects to the newest episode;
			# geturl() yields the post-redirect URL with the episode slug.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# Group 1: full Flash URL; group 2: the mtvn URI used as API key.
		mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			# Follow redirects to get the canonical SWF player URL.
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		results = []

		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			# guid looks like 'mgid:...:<show>.com:<shortMediaId>'.
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			effTitle = showId + u'-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			results.append(info)

		return results
2372
2373
2374 class EscapistIE(InfoExtractor):
2375         """Information extractor for The Escapist """
2376
2377         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2378         IE_NAME = u'escapist'
2379
2380         def report_extraction(self, showName):
2381                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2382
2383         def report_config_download(self, showName):
2384                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2385
2386         def _real_extract(self, url):
2387                 mobj = re.match(self._VALID_URL, url)
2388                 if mobj is None:
2389                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2390                         return
2391                 showName = mobj.group('showname')
2392                 videoId = mobj.group('episode')
2393
2394                 self.report_extraction(showName)
2395                 try:
2396                         webPage = urllib2.urlopen(url)
2397                         webPageBytes = webPage.read()
2398                         m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2399                         webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2400                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2401                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2402                         return
2403
2404                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2405                 description = unescapeHTML(descMatch.group(1))
2406                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2407                 imgUrl = unescapeHTML(imgMatch.group(1))
2408                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2409                 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2410                 configUrlMatch = re.search('config=(.*)$', playerUrl)
2411                 configUrl = urllib2.unquote(configUrlMatch.group(1))
2412
2413                 self.report_config_download(showName)
2414                 try:
2415                         configJSON = urllib2.urlopen(configUrl).read()
2416                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2417                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2418                         return
2419
2420                 # Technically, it's JavaScript, not JSON
2421                 configJSON = configJSON.replace("'", '"')
2422
2423                 try:
2424                         config = json.loads(configJSON)
2425                 except (ValueError,), err:
2426                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2427                         return
2428
2429                 playlist = config['playlist']
2430                 videoUrl = playlist[1]['url']
2431
2432                 info = {
2433                         'id': videoId,
2434                         'url': videoUrl,
2435                         'uploader': showName,
2436                         'upload_date': None,
2437                         'title': showName,
2438                         'ext': 'flv',
2439                         'format': 'flv',
2440                         'thumbnail': imgUrl,
2441                         'description': description,
2442                         'player_url': playerUrl,
2443                 }
2444
2445                 return [info]
2446
2447
2448 class CollegeHumorIE(InfoExtractor):
2449         """Information extractor for collegehumor.com"""
2450
2451         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2452         IE_NAME = u'collegehumor'
2453
2454         def report_webpage(self, video_id):
2455                 """Report information extraction."""
2456                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2457
2458         def report_extraction(self, video_id):
2459                 """Report information extraction."""
2460                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2461
2462         def _real_extract(self, url):
2463                 mobj = re.match(self._VALID_URL, url)
2464                 if mobj is None:
2465                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2466                         return
2467                 video_id = mobj.group('videoid')
2468
2469                 self.report_webpage(video_id)
2470                 request = urllib2.Request(url)
2471                 try:
2472                         webpage = urllib2.urlopen(request).read()
2473                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2474                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2475                         return
2476
2477                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2478                 if m is None:
2479                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2480                         return
2481                 internal_video_id = m.group('internalvideoid')
2482
2483                 info = {
2484                         'id': video_id,
2485                         'internal_id': internal_video_id,
2486                 }
2487
2488                 self.report_extraction(video_id)
2489                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2490                 try:
2491                         metaXml = urllib2.urlopen(xmlUrl).read()
2492                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2493                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2494                         return
2495
2496                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2497                 try:
2498                         videoNode = mdoc.findall('./video')[0]
2499                         info['description'] = videoNode.findall('./description')[0].text
2500                         info['title'] = videoNode.findall('./caption')[0].text
2501                         info['url'] = videoNode.findall('./file')[0].text
2502                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2503                         info['ext'] = info['url'].rpartition('.')[2]
2504                         info['format'] = info['ext']
2505                 except IndexError:
2506                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2507                         return
2508
2509                 return [info]
2510
2511
2512 class XVideosIE(InfoExtractor):
2513         """Information extractor for xvideos.com"""
2514
2515         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2516         IE_NAME = u'xvideos'
2517
2518         def report_webpage(self, video_id):
2519                 """Report information extraction."""
2520                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2521
2522         def report_extraction(self, video_id):
2523                 """Report information extraction."""
2524                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2525
2526         def _real_extract(self, url):
2527                 mobj = re.match(self._VALID_URL, url)
2528                 if mobj is None:
2529                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2530                         return
2531                 video_id = mobj.group(1).decode('utf-8')
2532
2533                 self.report_webpage(video_id)
2534
2535                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2536                 try:
2537                         webpage = urllib2.urlopen(request).read()
2538                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2539                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2540                         return
2541
2542                 self.report_extraction(video_id)
2543
2544
2545                 # Extract video URL
2546                 mobj = re.search(r'flv_url=(.+?)&', webpage)
2547                 if mobj is None:
2548                         self._downloader.trouble(u'ERROR: unable to extract video url')
2549                         return
2550                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2551
2552
2553                 # Extract title
2554                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2555                 if mobj is None:
2556                         self._downloader.trouble(u'ERROR: unable to extract video title')
2557                         return
2558                 video_title = mobj.group(1).decode('utf-8')
2559
2560
2561                 # Extract video thumbnail
2562                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2563                 if mobj is None:
2564                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2565                         return
2566                 video_thumbnail = mobj.group(0).decode('utf-8')
2567
2568                 info = {
2569                         'id': video_id,
2570                         'url': video_url,
2571                         'uploader': None,
2572                         'upload_date': None,
2573                         'title': video_title,
2574                         'ext': 'flv',
2575                         'format': 'flv',
2576                         'thumbnail': video_thumbnail,
2577                         'description': None,
2578                         'player_url': None,
2579                 }
2580
2581                 return [info]
2582
2583
2584 class SoundcloudIE(InfoExtractor):
2585         """Information extractor for soundcloud.com
2586            To access the media, the uid of the song and a stream token
2587            must be extracted from the page source and the script must make
2588            a request to media.soundcloud.com/crossdomain.xml. Then
2589            the media can be grabbed by requesting from an url composed
2590            of the stream token and uid
2591          """
2592
2593         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2594         IE_NAME = u'soundcloud'
2595
2596         def __init__(self, downloader=None):
2597                 InfoExtractor.__init__(self, downloader)
2598
2599         def report_webpage(self, video_id):
2600                 """Report information extraction."""
2601                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2602
2603         def report_extraction(self, video_id):
2604                 """Report information extraction."""
2605                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2606
2607         def _real_extract(self, url):
2608                 mobj = re.match(self._VALID_URL, url)
2609                 if mobj is None:
2610                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2611                         return
2612
2613                 # extract uploader (which is in the url)
2614                 uploader = mobj.group(1).decode('utf-8')
2615                 # extract simple title (uploader + slug of song title)
2616                 slug_title =  mobj.group(2).decode('utf-8')
2617                 simple_title = uploader + u'-' + slug_title
2618
2619                 self.report_webpage('%s/%s' % (uploader, slug_title))
2620
2621                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2622                 try:
2623                         webpage = urllib2.urlopen(request).read()
2624                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2625                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2626                         return
2627
2628                 self.report_extraction('%s/%s' % (uploader, slug_title))
2629
2630                 # extract uid and stream token that soundcloud hands out for access
2631                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2632                 if mobj:
2633                         video_id = mobj.group(1)
2634                         stream_token = mobj.group(2)
2635
2636                 # extract unsimplified title
2637                 mobj = re.search('"title":"(.*?)",', webpage)
2638                 if mobj:
2639                         title = mobj.group(1).decode('utf-8')
2640                 else:
2641                         title = simple_title
2642
2643                 # construct media url (with uid/token)
2644                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2645                 mediaURL = mediaURL % (video_id, stream_token)
2646
2647                 # description
2648                 description = u'No description available'
2649                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2650                 if mobj:
2651                         description = mobj.group(1)
2652                 
2653                 # upload date
2654                 upload_date = None
2655                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2656                 if mobj:
2657                         try:
2658                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2659                         except Exception, e:
2660                                 self._downloader.to_stderr(str(e))
2661
2662                 # for soundcloud, a request to a cross domain is required for cookies
2663                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2664
2665                 return [{
2666                         'id':           video_id.decode('utf-8'),
2667                         'url':          mediaURL,
2668                         'uploader':     uploader.decode('utf-8'),
2669                         'upload_date':  upload_date,
2670                         'title':        title,
2671                         'ext':          u'mp3',
2672                         'format':       u'NA',
2673                         'player_url':   None,
2674                         'description': description.decode('utf-8')
2675                 }]
2676
2677
2678 class InfoQIE(InfoExtractor):
2679         """Information extractor for infoq.com"""
2680
2681         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2682         IE_NAME = u'infoq'
2683
2684         def report_webpage(self, video_id):
2685                 """Report information extraction."""
2686                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2687
2688         def report_extraction(self, video_id):
2689                 """Report information extraction."""
2690                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2691
2692         def _real_extract(self, url):
2693                 mobj = re.match(self._VALID_URL, url)
2694                 if mobj is None:
2695                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2696                         return
2697
2698                 self.report_webpage(url)
2699
2700                 request = urllib2.Request(url)
2701                 try:
2702                         webpage = urllib2.urlopen(request).read()
2703                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2704                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2705                         return
2706
2707                 self.report_extraction(url)
2708
2709
2710                 # Extract video URL
2711                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2712                 if mobj is None:
2713                         self._downloader.trouble(u'ERROR: unable to extract video url')
2714                         return
2715                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2716
2717
2718                 # Extract title
2719                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2720                 if mobj is None:
2721                         self._downloader.trouble(u'ERROR: unable to extract video title')
2722                         return
2723                 video_title = mobj.group(1).decode('utf-8')
2724
2725                 # Extract description
2726                 video_description = u'No description available.'
2727                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2728                 if mobj is not None:
2729                         video_description = mobj.group(1).decode('utf-8')
2730
2731                 video_filename = video_url.split('/')[-1]
2732                 video_id, extension = video_filename.split('.')
2733
2734                 info = {
2735                         'id': video_id,
2736                         'url': video_url,
2737                         'uploader': None,
2738                         'upload_date': None,
2739                         'title': video_title,
2740                         'ext': extension,
2741                         'format': extension, # Extension is always(?) mp4, but seems to be flv
2742                         'thumbnail': None,
2743                         'description': video_description,
2744                         'player_url': None,
2745                 }
2746
2747                 return [info]
2748
class MixcloudIE(InfoExtractor):
	"""Information extractor for www.mixcloud.com"""
	_VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
	IE_NAME = u'mixcloud'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_json(self, file_id):
		"""Report JSON download."""
		self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

	def get_urls(self, jsonData, fmt, bitrate='best'):
		"""Get urls from 'audio_formats' section in json.

		jsonData[fmt] is either a mapping of bitrate -> url list, or a
		plain url list when the API provides no bitrate information.
		'best' (or an unknown bitrate) selects the highest available one.
		"""
		file_url = None  # NOTE(review): never read in this method
		try:
			bitrate_list = jsonData[fmt]
			if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
				bitrate = max(bitrate_list) # select highest

			url_list = jsonData[fmt][bitrate]
		except TypeError: # we have no bitrate info.
			url_list = jsonData[fmt]
		return url_list

	def check_urls(self, url_list):
		"""Returns 1st active url from list"""
		for url in url_list:
			try:
				# A successful open means this mirror is reachable.
				urllib2.urlopen(url)
				return url
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				url = None

		return None

	def _print_formats(self, formats):
		# Dump every available format/bitrate pair to stdout (one per line).
		print 'Available formats:'
		for fmt in formats.keys():
			for b in formats[fmt]:
				try:
					ext = formats[fmt][b][0]
					print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
				except TypeError: # we have no bitrate info
					ext = formats[fmt][0]
					print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
					break

	def _real_extract(self, url):
		"""Resolve a mixcloud.com cloudcast URL via the site's JSON API."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		# extract uploader & filename from url
		uploader = mobj.group(1).decode('utf-8')
		file_id = uploader + "-" + mobj.group(2).decode('utf-8')

		# construct API request (uploader/track path segments of the URL)
		file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
		# retrieve .json file with links to files
		request = urllib2.Request(file_url)
		try:
			self.report_download_json(file_url)
			jsonData = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
			return

		# parse JSON
		json_data = json.loads(jsonData)
		player_url = json_data['player_swf_url']
		formats = dict(json_data['audio_formats'])

		req_format = self._downloader.params.get('format', None)
		bitrate = None

		if self._downloader.params.get('listformats', None):
			self._print_formats(formats)
			return

		if req_format is None or req_format == 'best':
			# No explicit format requested: probe each format until one
			# yields a reachable url.
			for format_param in formats.keys():
				url_list = self.get_urls(formats, format_param)
				# check urls
				file_url = self.check_urls(url_list)
				if file_url is not None:
					break # got it!
		else:
			if req_format not in formats.keys():
				self._downloader.trouble(u'ERROR: format is not available')
				return

			url_list = self.get_urls(formats, req_format)
			file_url = self.check_urls(url_list)
			format_param = req_format

		return [{
			'id': file_id.decode('utf-8'),
			'url': file_url.decode('utf-8'),
			'uploader':	uploader.decode('utf-8'),
			'upload_date': u'NA',
			'title': json_data['name'],
			'ext': file_url.split('.')[-1].decode('utf-8'),
			'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
			'thumbnail': json_data['thumbnail_url'],
			'description': json_data['description'],
			'player_url': player_url.decode('utf-8'),
		}]
2861
class StanfordOpenClassroomIE(InfoExtractor):
	"""Information extractor for Stanford's Open ClassRoom"""

	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
	IE_NAME = u'stanfordoc'

	def report_download_webpage(self, objid):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Handle three URL shapes, from most to least specific:
		course+video -> a single video; course only -> a course page
		(playlist of videos); neither -> the site root (playlist of
		courses). The playlist cases recurse through self.extract().
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('course') and mobj.group('video'): # A specific video
			course = mobj.group('course')
			video = mobj.group('video')
			info = {
				'id': course + '_' + video,
			}

			self.report_extraction(info['id'])
			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
			xmlUrl = baseUrl + video + '.xml'
			try:
				metaXml = urllib2.urlopen(xmlUrl).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
				return
			mdoc = xml.etree.ElementTree.fromstring(metaXml)
			try:
				info['title'] = mdoc.findall('./title')[0].text
				# videoFile in the XML is relative to the course videos/ dir
				info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
			except IndexError:
				self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
				return
			info['ext'] = info['url'].rpartition('.')[2]
			info['format'] = info['ext']
			return [info]
		elif mobj.group('course'): # A course page
			course = mobj.group('course')
			info = {
				'id': course,
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			try:
				coursepage = urllib2.urlopen(url).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			# Course title from the page heading; fall back to the id.
			m = re.search('<h1>([^<]+)</h1>', coursepage)
			if m:
				info['title'] = unescapeHTML(m.group(1))
			else:
				info['title'] = info['id']

			m = re.search('<description>([^<]+)</description>', coursepage)
			if m:
				info['description'] = unescapeHTML(m.group(1))

			# Each VideoPage link becomes a 'reference' entry, resolved
			# recursively through self.extract() below.
			links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
				}
					for vpage in links]
			results = []
			for entry in info['list']:
				assert entry['type'] == 'reference'
				results += self.extract(entry['url'])
			return results

		else: # Root page
			info = {
				'id': 'Stanford OpenClassroom',
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
			try:
				rootpage = urllib2.urlopen(rootURL).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			info['title'] = info['id']

			# Every CoursePage link is itself extracted recursively, so the
			# root expands into all courses and then all of their videos.
			links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
				}
					for cpage in links]

			results = []
			for entry in info['list']:
				assert entry['type'] == 'reference'
				results += self.extract(entry['url'])
			return results
2973
class MTVIE(InfoExtractor):
	"""Information extractor for MTV.com"""

	_VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
	IE_NAME = u'mtv'

	def report_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Extract the highest-quality stream for an mtv.com video URL."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		if not mobj.group('proto'):
			# _VALID_URL accepts scheme-less URLs, but urllib2 does not.
			url = 'http://' + url
		video_id = mobj.group('videoid')
		self.report_webpage(video_id)

		request = urllib2.Request(url)
		try:
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Song name, performer and playlist URI are published as <meta> tags
		# on the watch page.
		mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract song name')
			return
		song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
		mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract performer')
			return
		performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
		video_title = performer + ' - ' + song_name

		mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
		if mobj is None:
			# Message fixed: previously read "unable to mtvn_uri".
			self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
			return
		mtvn_uri = mobj.group(1)

		mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract content id')
			return
		content_id = mobj.group(1)

		# The mediaGen service returns an XML document listing the available
		# renditions of the video.
		videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
		self.report_extraction(video_id)
		request = urllib2.Request(videogen_url)
		try:
			metadataXml = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
			return

		mdoc = xml.etree.ElementTree.fromstring(metadataXml)
		renditions = mdoc.findall('.//rendition')

		# For now, always pick the highest quality (last listed rendition).
		rendition = renditions[-1]

		try:
			_,_,ext = rendition.attrib['type'].partition('/')
			format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
			video_url = rendition.find('./src').text
		except (KeyError, AttributeError):
			# AttributeError covers a rendition without a <src> child:
			# find() returns None and .text raises AttributeError.
			self._downloader.trouble(u'ERROR: Invalid rendition field.')
			return

		info = {
			'id': video_id,
			'url': video_url,
			'uploader': performer,
			'title': video_title,
			'ext': ext,
			'format': format,
		}

		return [info]
3062
3063
class YoukuIE(InfoExtractor):
	"""Information extractor for v.youku.com."""

	_VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
	IE_NAME = u'Youku'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, file_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

	def _gen_sid(self):
		"""Build a pseudo-random session id from the current time."""
		nowTime = int(time.time() * 1000)
		random1 = random.randint(1000,1998)
		random2 = random.randint(1000,9999)

		return "%d%d%d" %(nowTime,random1,random2)

	def _get_file_ID_mix_string(self, seed):
		"""Return the seed-shuffled alphabet used to decode file ids.

		Implements Youku's deterministic LCG-driven shuffle: each step
		advances the seed and removes one character from the source
		alphabet, so the result is a permutation of it.
		"""
		mixed = []
		source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
		seed = float(seed)
		for i in range(len(source)):
			seed  =  (seed * 211 + 30031 ) % 65536
			index  =  math.floor(seed / 65536 * len(source) )
			mixed.append(source[int(index)])
			source.remove(source[int(index)])
		return mixed

	def _get_file_id(self, fileId, seed):
		"""Decode a '*'-separated index list into the real file id."""
		mixed = self._get_file_ID_mix_string(seed)
		# Each non-empty token is an index into the shuffled alphabet.
		return ''.join(mixed[int(ch)] for ch in fileId.split('*') if ch)

	def _real_extract(self, url):
		"""Extract one info dict per segment of the requested video."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group('ID')

		info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

		# std_headers comes from utils (imported via *).
		request = urllib2.Request(info_url, None, std_headers)
		try:
			self.report_download_webpage(video_id)
			jsondata = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		self.report_extraction(video_id)
		try:
			config = json.loads(jsondata)

			video_title =  config['data'][0]['title']
			seed = config['data'][0]['seed']

			format = self._downloader.params.get('format', None)
			supported_format = config['data'][0]['streamfileids'].keys()

			# Map the user's requested format onto Youku's stream names.
			if format is None or format == 'best':
				if 'hd2' in supported_format:
					format = 'hd2'
				else:
					format = 'flv'
				ext = u'flv'
			elif format == 'worst':
				format = 'mp4'
				ext = u'mp4'
			else:
				format = 'flv'
				ext = u'flv'

			fileid = config['data'][0]['streamfileids'][format]
			keys = [seg['k'] for seg in config['data'][0]['segs'][format]]

			#TODO check error
			#youku only could be viewed from mainland china
		except Exception:
			# Narrowed from a bare `except:` so that KeyboardInterrupt
			# and SystemExit are no longer swallowed.
			self._downloader.trouble(u'ERROR: unable to extract info section')
			return

		sid = self._gen_sid()
		fileid = self._get_file_id(fileid, seed)

		#column 8,9 of fileid represent the segment number
		#fileid[7:9] should be changed
		files_info = []
		for index, key in enumerate(keys):
			temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
			download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

			files_info.append({
				'id': '%s_part%02d' % (video_id, index),
				'url': download_url,
				'uploader': None,
				'title': video_title,
				'ext': ext,
				'format': u'NA'
			})

		return files_info
3184
3185
class XNXXIE(InfoExtractor):
	"""Information extractor for xnxx.com"""

	_VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
	IE_NAME = u'xnxx'
	VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
	VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
	VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

	def report_webpage(self, video_id):
		"""Announce that the video page is being downloaded."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

	def report_extraction(self, video_id):
		"""Announce that information is being extracted."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Extract the FLV URL, title and thumbnail from an xnxx.com page."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(1).decode('utf-8')

		self.report_webpage(video_id)

		# All metadata is embedded in the watch page itself.
		try:
			webpage = urllib2.urlopen(url).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
			return

		match = re.search(self.VIDEO_URL_RE, webpage)
		if match is None:
			self._downloader.trouble(u'ERROR: unable to extract video url')
			return
		# The flv_url parameter is URL-encoded inside the page.
		video_url = urllib.unquote(match.group(1).decode('utf-8'))

		match = re.search(self.VIDEO_TITLE_RE, webpage)
		if match is None:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = match.group(1).decode('utf-8')

		match = re.search(self.VIDEO_THUMB_RE, webpage)
		if match is None:
			self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
			return
		video_thumbnail = match.group(1).decode('utf-8')

		return [{
			'id': video_id,
			'url': video_url,
			'uploader': None,
			'upload_date': None,
			'title': video_title,
			'ext': 'flv',
			'format': 'flv',
			'thumbnail': video_thumbnail,
			'description': None,
			'player_url': None,
		}]
3249
3250
class GooglePlusIE(InfoExtractor):
	"""Information extractor for plus.google.com."""

	_VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
	IE_NAME = u'plus.google'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_extract_entry(self, url):
		"""Report downloading entry"""
		self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))

	def report_date(self, upload_date):
		"""Report entry date"""
		self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

	def report_uploader(self, uploader):
		"""Report uploader name"""
		self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))

	def report_title(self, video_title):
		"""Report extracted title"""
		self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))

	def report_extract_vid_page(self, video_page):
		"""Report information extraction."""
		self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))

	def _real_extract(self, url):
		"""Extract a video embedded in a Google+ post."""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		post_url = mobj.group(0)
		video_id = mobj.group(2)

		video_extension = 'flv'

		# Step 1, Retrieve post webpage to extract further information
		self.report_extract_entry(post_url)
		request = urllib2.Request(post_url)
		try:
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % str(err))
			return

		# Extract update date; optional, so failure falls back to u'NA'.
		upload_date = u'NA'
		pattern = 'title="Timestamp">(.*?)</a>'
		mobj = re.search(pattern, webpage)
		if mobj:
			upload_date = mobj.group(1)
			# Convert timestring to a format suitable for filename
			upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
			upload_date = upload_date.strftime('%Y%m%d')
		self.report_date(upload_date)

		# Extract uploader (also optional)
		uploader = u'NA'
		pattern = r'rel\="author".*?>(.*?)</a>'
		mobj = re.search(pattern, webpage)
		if mobj:
			uploader = mobj.group(1)
		self.report_uploader(uploader)

		# Extract title: the first line of the post's meta description.
		video_title = u'NA'
		pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
		mobj = re.search(pattern, webpage)
		if mobj:
			video_title = mobj.group(1)
		self.report_title(video_title)

		# Step 2, Stimulate clicking the image box to launch video
		pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
		mobj = re.search(pattern, webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video page URL')
			# Bug fix: without this return, mobj.group(1) below raised
			# AttributeError on None.
			return

		video_page = mobj.group(1)
		request = urllib2.Request(video_page)
		try:
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return
		self.report_extract_vid_page(video_page)

		# Extract video links of all sizes
		pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
		mobj = re.findall(pattern, webpage)
		if len(mobj) == 0:
			self._downloader.trouble(u'ERROR: unable to extract video links')
			# Bug fix: without this return, links[-1] below raised
			# IndexError on an empty list.
			return

		# Sort by resolution (first tuple element) ascending.
		links = sorted(mobj)

		# Last entry of the sort is the highest resolution.
		video_url = links[-1]
		# Only get the url. The resolution part in the tuple has no use anymore
		video_url = video_url[-1]
		# Treat escaped \u0026 style hex
		video_url = unicode(video_url, "unicode_escape")

		return [{
			'id':           video_id.decode('utf-8'),
			'url':          video_url,
			'uploader':     uploader.decode('utf-8'),
			'upload_date':  upload_date.decode('utf-8'),
			'title':        video_title.decode('utf-8'),
			'ext':          video_extension.decode('utf-8'),
			'format':       u'NA',
			'player_url':   None,
		}]
3373
3374
3375
class YouPornIE(InfoExtractor):
	"""Information extractor for youporn.com."""

	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
	IE_NAME = u'youporn'
	VIDEO_TITLE_RE = r'videoTitleArea">(?P<title>.*)</h1>'
	VIDEO_DATE_RE = r'Date:</b>(?P<date>.*)</li>'
	VIDEO_UPLOADER_RE = r'Submitted:</b>(?P<uploader>.*)</li>'
	DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
	LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_id(self, video_id):
		"""Report finding video ID"""
		self._downloader.to_screen(u'[youporn] Video ID: %s' % video_id)

	def report_webpage(self, url):
		"""Report downloading page"""
		self._downloader.to_screen(u'[youporn] Downloaded page: %s' % url)

	def report_title(self, video_title):
		"""Report finding title"""
		self._downloader.to_screen(u'[youporn] Title: %s' % video_title)

	def report_uploader(self, uploader):
		"""Report finding uploader"""
		self._downloader.to_screen(u'[youporn] Uploader: %s' % uploader)

	def report_upload_date(self, video_date):
		"""Report finding date"""
		self._downloader.to_screen(u'[youporn] Date: %s' % video_date)

	def _print_formats(self, formats):
		"""Print all available formats"""
		# print(...) form: identical behavior in Python 2 (single argument),
		# and parseable by Python 3 tooling.
		print('Available formats:')
		print(u'ext\t\tformat')
		print(u'---------------------------------')
		for format in formats:
			print(u'%s\t\t%s'  % (format['ext'], format['format']))

	def _specific(self, req_format, formats):
		"""Return the entry in formats whose 'format' equals req_format, or None."""
		for x in formats:
			if(x["format"]==req_format):
				return x
		return None

	def _real_extract(self, url):
		"""Extract all available formats and pick per the requested format."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group('videoid').decode('utf-8')
		self.report_id(video_id)

		# Get webpage content
		try:
			webpage = urllib2.urlopen(url).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
			return
		self.report_webpage(url)

		# Get the video title
		result = re.search(self.VIDEO_TITLE_RE, webpage)
		if result is None:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = result.group('title').decode('utf-8').strip()
		self.report_title(video_title)

		# Get the video date
		result = re.search(self.VIDEO_DATE_RE, webpage)
		if result is None:
			self._downloader.trouble(u'ERROR: unable to extract video date')
			return
		upload_date = result.group('date').decode('utf-8').strip()
		self.report_upload_date(upload_date)

		# Get the video uploader
		result = re.search(self.VIDEO_UPLOADER_RE, webpage)
		if result is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader')
			return
		video_uploader = result.group('uploader').decode('utf-8').strip()
		video_uploader = clean_html( video_uploader )
		self.report_uploader(video_uploader)

		# Get all of the formats available
		result = re.search(self.DOWNLOAD_LIST_RE, webpage)
		if result is None:
			self._downloader.trouble(u'ERROR: unable to extract download list')
			return
		download_list_html = result.group('download_list').decode('utf-8').strip()

		# Get all of the links from the page
		links = re.findall(self.LINK_RE, download_list_html)
		if(len(links) == 0):
			self._downloader.trouble(u'ERROR: no known formats available for video')
			return

		self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

		formats = []
		for link in links:

			# A link looks like this:
			# http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
			# A path looks like this:
			# /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
			video_url = unescapeHTML( link.decode('utf-8') )
			path = urlparse( video_url ).path
			extension = os.path.splitext( path )[1][1:]
			format = path.split('/')[4].split('_')[:2]
			size = format[0]
			bitrate = format[1]
			format = "-".join( format )
			title = u'%s-%s-%s' % (video_title, size, bitrate)

			formats.append({
				'id': video_id,
				'url': video_url,
				'uploader': video_uploader,
				'upload_date': upload_date,
				'title': title,
				'ext': extension,
				'format': format,
				'thumbnail': None,
				'description': None,
				'player_url': None
			})

		if self._downloader.params.get('listformats', None):
			self._print_formats(formats)
			return

		req_format = self._downloader.params.get('format', None)
		self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

		# Links are assumed ordered best quality first.
		if req_format is None or req_format == 'best':
			return [formats[0]]
		elif req_format == 'worst':
			return [formats[-1]]
		elif req_format in ('-1', 'all'):
			return formats
		else:
			format = self._specific( req_format, formats )
			# Bug fix: previously tested the stale regex match `result`
			# instead of the lookup result, so a requested-but-missing
			# format silently returned [None].
			if format is None:
				self._downloader.trouble(u'ERROR: requested format not available')
				return
			return [format]
3532
3533                 
3534
3535
class PornotubeIE(InfoExtractor):
	"""Information extractor for pornotube.com."""

	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
	IE_NAME = u'pornotube'
	VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
	VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_extract_entry(self, url):
		"""Report downloading entry"""
		self._downloader.to_screen(u'[pornotube] Downloading entry: %s' % url.decode('utf-8'))

	def report_date(self, upload_date):
		"""Report finding uploaded date"""
		self._downloader.to_screen(u'[pornotube] Entry date: %s' % upload_date)

	def report_webpage(self, url):
		"""Report downloading page"""
		self._downloader.to_screen(u'[pornotube] Downloaded page: %s' % url)

	def report_title(self, video_title):
		"""Report extracted title"""
		self._downloader.to_screen(u'[pornotube] Title: %s' % video_title.decode('utf-8'))

	def _real_extract(self, url):
		"""Extract the FLV URL and upload date from a pornotube.com page."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Both the id and the title are already present in the URL itself.
		video_id = mobj.group('videoid').decode('utf-8')
		video_title = mobj.group('title').decode('utf-8')
		self.report_title(video_title)

		# Get webpage content
		try:
			webpage = urllib2.urlopen(url).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
			return
		self.report_webpage(url)

		# Get the video URL
		result = re.search(self.VIDEO_URL_RE, webpage)
		if result is None:
			self._downloader.trouble(u'ERROR: unable to extract video url')
			return
		video_url = urllib.unquote(result.group('url').decode('utf-8'))
		self.report_extract_entry(video_url)

		# Get the uploaded date
		result = re.search(self.VIDEO_UPLOADED_RE, webpage)
		if result is None:
			# Message fixed: previously reported "unable to extract video
			# title" for a missing upload date (copy-paste error).
			self._downloader.trouble(u'ERROR: unable to extract video upload date')
			return
		upload_date = result.group('date').decode('utf-8')
		self.report_date(upload_date)

		info = {'id': video_id,
				'url': video_url,
				'uploader': None,
				'upload_date': upload_date,
				'title': video_title,
				'ext': 'flv',
				'format': 'flv',
				'thumbnail': None,
				'description': None,
				'player_url': None}

		return [info]
3611
3612
3613
3614
class YouJizzIE(InfoExtractor):
	"""Information extractor for youjizz.com."""

	# Watch-page URLs look like http://www.youjizz.com/videos/<slug>.html
	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/([^.]+).html$'
	IE_NAME = u'youjizz'
	# The <title> element of the page carries the video title.
	VIDEO_TITLE_RE = r'<title>(?P<title>.*)</title>'
	# The numeric video id appears only in the embed-page URL.
	EMBED_PAGE_RE = r'http://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)'
	# The flash player receives the media URL via so.addVariable("file", ...).
	SOURCE_RE = r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_extract_entry(self, url):
		"""Report downloading entry"""
		self._downloader.to_screen(u'[youjizz] Downloading entry: %s' % url.decode('utf-8'))

	def report_webpage(self, url):
		"""Report downloading page"""
		self._downloader.to_screen(u'[youjizz] Downloaded page: %s' % url)

	def report_title(self, video_title):
		"""Report extracted title"""
		self._downloader.to_screen(u'[youjizz] Title: %s' % video_title.decode('utf-8'))

	def report_embed_page(self, embed_page):
		"""Report embed page URL"""
		self._downloader.to_screen(u'[youjizz] Embed Page: %s' % embed_page.decode('utf-8'))
3642
3643         def _real_extract(self, url):
3644                 # Get webpage content
3645                 try:
3646                         webpage = urllib2.urlopen(url).read()
3647                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3648                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3649                         return
3650                 self.report_webpage(url)
3651
3652                 # Get the video title
3653                 result = re.search(self.VIDEO_TITLE_RE, webpage)
3654                 if result is None:
3655                         self._downloader.trouble(u'ERROR: unable to extract video title')
3656                         return
3657                 video_title = result.group('title').decode('utf-8').strip()
3658                 self.report_title(video_title)
3659
3660                 # Get the embed page
3661                 result = re.search(self.EMBED_PAGE_RE, webpage)
3662                 if result is None:
3663                         self._downloader.trouble(u'ERROR: unable to extract embed page')
3664                         return
3665
3666                 embed_page_url = result.group(0).decode('utf-8').strip()
3667                 video_id = result.group('videoid').decode('utf-8')
3668                 self.report_embed_page(embed_page_url)
3669         
3670                 try:
3671                         webpage = urllib2.urlopen(embed_page_url).read()
3672                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3673                         self._downloader.trouble(u'ERROR: unable to download video embed page: %s' % err)
3674                         return
3675                 
3676                 # Get the video URL
3677                 result = re.search(self.SOURCE_RE, webpage)
3678                 if result is None:
3679                         self._downloader.trouble(u'ERROR: unable to extract video url')
3680                         return
3681                 video_url = result.group('source').decode('utf-8')
3682                 self.report_extract_entry(video_url)
3683
3684                 info = {'id': video_id,
3685                                 'url': video_url,
3686                                 'uploader': None,
3687                                 'upload_date': None,
3688                                 'title': video_title,
3689                                 'ext': 'flv',
3690                                 'format': 'flv',
3691                                 'thumbnail': None,
3692                                 'description': None,
3693                                 'player_url': embed_page_url}
3694
3695                 return [info]
3696