94803aa95bc834c6454c32ad595f7ce256a7db93
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import datetime
5 import netrc
6 import os
7 import re
8 import socket
9 import time
10 import email.utils
11 import xml.etree.ElementTree
12 import random
13 import math
14
15 from utils import *
16
17
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor receives a URL and gathers everything the
    downloader needs to know about the video (or videos) behind it: the
    real media URL, the title, the uploader, and so on. The collected
    data is handed to the FileDownloader as a dictionary; the downloader
    then processes it, usually by fetching the file.

    Mandatory dictionary fields:

    id:             Video identifier.
    url:            Final video URL.
    uploader:       Nickname of the video uploader, unescaped.
    upload_date:    Video upload date (YYYYMMDD).
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional fields:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses must define a _VALID_URL regexp and re-define the
    _real_initialize() and _real_extract() methods; _real_extract()
    has to return a *list* of dictionaries shaped as described above.
    New subclasses should normally also be added to the list of
    extractors, and broken ones should set _WORKING to False so that
    users are warned and the tests skip them.
    """

    _ready = False        # becomes True once _real_initialize() has run
    _downloader = None    # FileDownloader instance, or None
    _WORKING = True       # set to False on known-broken extractors

    def __init__(self, downloader=None):
        """Build the extractor and attach the optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Return True if this extractor can handle the given URL."""
        match = re.match(self._VALID_URL, url)
        return match is not None

    def working(self):
        """Tell whether this extractor is known to be functional."""
        return self._WORKING

    def initialize(self):
        """Run the one-time initialization (authentication, etc)."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extract the URL information and return it as a list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the downloader this extractor should report to."""
        self._downloader = downloader

    def _real_initialize(self):
        """Actual initialization process; redefined by subclasses."""
        pass

    def _real_extract(self, url):
        """Actual extraction process; redefined by subclasses."""
        pass
100
101
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> dimensions shown by --list-formats ('???' where unknown)
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so we cannot rely on the
        # base-class implementation which compiles it without flags.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption duration when none is given
            start = float(start)
            end = start + float(dur)
            # SRT timestamps: HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _print_formats(self, formats):
        """Print the available itags with their extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the site language and (optionally) log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language (forces English pages so the regexes below match)
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':         '/',
                'action_login': 'Log In',
                'username':     username,
                'password':     password,
                }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # if the login form is still present, the credentials were rejected
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':             '/',
                'action_confirm':       'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Download the watch page and video info, and build the result dicts."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # un-escape the JS-escaped slashes
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try the various 'el' contexts until one yields a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                    # stop on the first format that parses; without the break
                    # the reformatted date was fed back into strptime and the
                    # remaining attempts failed silently
                    break
                except ValueError:
                    # date did not match this format; try the next one
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            try:
                self.report_video_subtitles_download(video_id)
                request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                try:
                    # decode right away: on Python 3 urlopen() returns bytes,
                    # which the str regex below would reject
                    srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    srt_lang = 'en'
                else:
                    # pick an arbitrary available language; dict views are not
                    # indexable on Python 3, so keys()[0] would raise TypeError
                    srt_lang = next(iter(srt_lang_list))
                if not srt_lang in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                try:
                    srt_xml = compat_urllib_request.urlopen(request).read()
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                if not srt_xml:
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                # exceptions are not indexable on Python 3; use .args instead
                self._downloader.trouble(trouble.args[0])

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            # also require 'sig': entries without a signature would raise
            # KeyError in the url_map construction below
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud and 'sig' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{} - {}'.format(format_param if format_param else video_extension,
                                                self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':           video_id,
                'url':          video_real_url,
                'uploader':     video_uploader,
                'upload_date':  upload_date,
                'title':        video_title,
                'ext':          video_extension,
                'format':       video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
505
506
507 class MetacafeIE(InfoExtractor):
508         """Information Extractor for metacafe.com."""
509
510         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
511         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
512         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
513         IE_NAME = u'metacafe'
514
515         def __init__(self, downloader=None):
516                 InfoExtractor.__init__(self, downloader)
517
518         def report_disclaimer(self):
519                 """Report disclaimer retrieval."""
520                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
521
522         def report_age_confirmation(self):
523                 """Report attempt to confirm age."""
524                 self._downloader.to_screen(u'[metacafe] Confirming age')
525
526         def report_download_webpage(self, video_id):
527                 """Report webpage download."""
528                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
529
530         def report_extraction(self, video_id):
531                 """Report information extraction."""
532                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
533
	def _real_initialize(self):
		"""Fetch the Metacafe disclaimer page and POST the age confirmation.

		Both requests are made for their server-side side effects
		(presumably session cookies that unlock age-restricted content —
		confirm against the urlopen handler configuration); the response
		bodies read into `disclaimer` are never used.
		"""
		# Retrieve disclaimer
		request = compat_urllib_request.Request(self._DISCLAIMER)
		try:
			self.report_disclaimer()
			# Body intentionally discarded; the GET primes the session.
			disclaimer = compat_urllib_request.urlopen(request).read()
		except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
			return

		# Confirm age
		disclaimer_form = {
			'filters': '0',
			'submit': "Continue - I'm over 18",
			}
		# Passing data makes this a POST to the age-filter endpoint.
		request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
		try:
			self.report_age_confirmation()
			disclaimer = compat_urllib_request.urlopen(request).read()
		except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
			return
556
557         def _real_extract(self, url):
558                 # Extract id and simplified title from URL
559                 mobj = re.match(self._VALID_URL, url)
560                 if mobj is None:
561                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
562                         return
563
564                 video_id = mobj.group(1)
565
566                 # Check if video comes from YouTube
567                 mobj2 = re.match(r'^yt-(.*)$', video_id)
568                 if mobj2 is not None:
569                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
570                         return
571
572                 # Retrieve video webpage to extract further information
573                 request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
574                 try:
575                         self.report_download_webpage(video_id)
576                         webpage = compat_urllib_request.urlopen(request).read()
577                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
578                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
579                         return
580
581                 # Extract URL, uploader and title from webpage
582                 self.report_extraction(video_id)
583                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
584                 if mobj is not None:
585                         mediaURL = compat_urllib_parse.unquote(mobj.group(1))
586                         video_extension = mediaURL[-3:]
587
588                         # Extract gdaKey if available
589                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
590                         if mobj is None:
591                                 video_url = mediaURL
592                         else:
593                                 gdaKey = mobj.group(1)
594                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
595                 else:
596                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
597                         if mobj is None:
598                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
599                                 return
600                         vardict = compat_parse_qs(mobj.group(1))
601                         if 'mediaData' not in vardict:
602                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
603                                 return
604                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
605                         if mobj is None:
606                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
607                                 return
608                         mediaURL = mobj.group(1).replace('\\/', '/')
609                         video_extension = mediaURL[-3:]
610                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
611
612                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
613                 if mobj is None:
614                         self._downloader.trouble(u'ERROR: unable to extract title')
615                         return
616                 video_title = mobj.group(1).decode('utf-8')
617
618                 mobj = re.search(r'submitter=(.*?);', webpage)
619                 if mobj is None:
620                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
621                         return
622                 video_uploader = mobj.group(1)
623
624                 return [{
625                         'id':           video_id.decode('utf-8'),
626                         'url':          video_url.decode('utf-8'),
627                         'uploader':     video_uploader.decode('utf-8'),
628                         'upload_date':  None,
629                         'title':        video_title,
630                         'ext':          video_extension.decode('utf-8'),
631                 }]
632
633
class DailymotionIE(InfoExtractor):
	"""Information Extractor for Dailymotion"""

	_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
	IE_NAME = u'dailymotion'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		"""Extract video information from a Dailymotion watch page.

		Returns a single-element list with the info dictionary, or None
		after reporting an error through the downloader.  A missing
		uploader is only a warning; 'uploader' may then be None.
		"""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Strip the trailing "_simplified-title" part and any query string.
		video_id = mobj.group(1).split('_')[0].split('?')[0]

		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = compat_urllib_request.Request(url)
		# Disable the family filter so age-restricted pages are served.
		request.add_header('Cookie', 'family_filter=off')
		try:
			self.report_download_webpage(video_id)
			webpage = compat_urllib_request.urlopen(request).read()
		except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
			# Message fixed: was the ungrammatical "unable retrieve".
			self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % compat_str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'\s*var flashvars = (.*)', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		flashvars = compat_urllib_parse.unquote(mobj.group(1))

		# Pick the best available quality, highest first.
		for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
			if key in flashvars:
				max_quality = key
				self._downloader.to_screen(u'[dailymotion] Using %s' % key)
				break
		else:
			self._downloader.trouble(u'ERROR: unable to extract video URL')
			return

		mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video URL')
			return

		video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

		# TODO: support choosing qualities

		mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

		video_uploader = None
		mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
		if mobj is None:
			# Looking for an official user instead.
			mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
			if mobj_official is None:
				self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
			else:
				video_uploader = mobj_official.group(1)
		else:
			video_uploader = mobj.group(1)

		video_upload_date = None
		mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
		if mobj is not None:
			# Page shows DD-MM-YYYY; convert to the YYYYMMDD convention.
			video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

		# BUGFIX: a missing uploader only warned above, leaving None, but the
		# unconditional .decode() below then raised AttributeError.
		if video_uploader is not None:
			video_uploader = video_uploader.decode('utf-8')

		return [{
			'id':		video_id.decode('utf-8'),
			'url':		video_url.decode('utf-8'),
			'uploader':	video_uploader,
			'upload_date':	video_upload_date,
			'title':	video_title,
			'ext':		video_extension.decode('utf-8'),
		}]
729
730
class GoogleIE(InfoExtractor):
	"""Information extractor for video.google.com."""

	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
	IE_NAME = u'video.google'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		"""Extract video information from a Google Video page.

		Returns a single-element list with the info dictionary, or None
		after reporting an error through the downloader.
		"""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = compat_urllib_request.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = compat_urllib_request.urlopen(request).read()
		except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
			return

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r"download_url:'([^']+)'", webpage)
		if mobj is None:
			# No direct MP4 download link; fall back to the FLV stream URL.
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = compat_urllib_parse.unquote(mobj.group(1))
		# The fallback URL is escaped as '\\x3d'/'\\x26' ('='/'&'); undo it.
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')

		# Extract video description
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video description')
			return
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail
		if self._downloader.params.get('forcethumbnail', False):
			request = compat_urllib_request.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
			try:
				webpage = compat_urllib_request.urlopen(request).read()
			except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
				return
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
				return
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''

		# BUGFIX: description and thumbnail were extracted above (extraction
		# even hard-failed without them) but then dropped from the result;
		# include them as the optional fields the downloader understands.
		return [{
			'id':		video_id.decode('utf-8'),
			'url':		video_url.decode('utf-8'),
			'uploader':	None,
			'upload_date':	None,
			'title':	video_title,
			'ext':		video_extension.decode('utf-8'),
			'thumbnail':	video_thumbnail.decode('utf-8'),
			'description':	video_description,
		}]
822
823
class PhotobucketIE(InfoExtractor):
	"""Information extractor for photobucket.com."""

	_VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
	IE_NAME = u'photobucket'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Announce the start of the webpage download."""
		self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Announce that information extraction has begun."""
		self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		"""Extract the .flv URL, title and uploader from a Photobucket page.

		Returns a single-element list with the info dictionary, or None
		after reporting an error through the downloader.
		"""
		# The id is the .flv filename captured from the URL itself.
		match = re.match(self._VALID_URL, url)
		if match is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return
		video_id = match.group(1)

		video_extension = 'flv'

		# Fetch the page so the media URL and metadata can be scraped.
		request = compat_urllib_request.Request(url)
		try:
			self.report_download_webpage(video_id)
			webpage = compat_urllib_request.urlopen(request).read()
		except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
			return

		# The media URL lives in the video_src <link> tag's file parameter.
		self.report_extraction(video_id)
		match = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
		if match is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		video_url = compat_urllib_parse.unquote(match.group(1))

		# Title and uploader share one <title> pattern.
		match = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
		if match is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = match.group(1).decode('utf-8')
		video_uploader = match.group(2).decode('utf-8')

		return [{
			'id':		video_id.decode('utf-8'),
			'url':		video_url.decode('utf-8'),
			'uploader':	video_uploader,
			'upload_date':	None,
			'title':	video_title,
			'ext':		video_extension.decode('utf-8'),
		}]
887
888
class YahooIE(InfoExtractor):
	"""Information extractor for video.yahoo.com."""

	# _VALID_URL matches all Yahoo! Video URLs
	# _VPAGE_URL matches only the extractable '/watch/' URLs
	_VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
	_VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
	IE_NAME = u'video.yahoo'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

	def _real_extract(self, url, new_video=True):
		"""Extract video information from a Yahoo! Video page.

		Non-'/watch/' URLs are first rewritten to the canonical watch URL
		(one level of recursion via new_video=False).  Returns a
		single-element list with the info dictionary, or None after
		reporting an error through the downloader.
		"""
		# Extract ID from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		video_id = mobj.group(2)
		video_extension = 'flv'

		# Rewrite valid but non-extractable URLs as
		# extractable English language /watch/ URLs
		if re.match(self._VPAGE_URL, url) is None:
			request = compat_urllib_request.Request(url)
			try:
				webpage = compat_urllib_request.urlopen(request).read()
			except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
				return

			mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Unable to extract id field')
				return
			yahoo_id = mobj.group(1)

			mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Unable to extract vid field')
				return
			yahoo_vid = mobj.group(1)

			url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
			return self._real_extract(url, new_video=False)

		# Retrieve video webpage to extract further information
		request = compat_urllib_request.Request(url)
		try:
			self.report_download_webpage(video_id)
			webpage = compat_urllib_request.urlopen(request).read()
		except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
			return

		# Extract uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = mobj.group(1).decode('utf-8')

		mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video uploader')
			return
		# BUGFIX: group(1) is the '(people|profile)' path alternation; the
		# uploader name is captured by group(2).
		video_uploader = mobj.group(2).decode('utf-8')

		# Extract video thumbnail
		mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
			return
		video_thumbnail = mobj.group(1).decode('utf-8')

		# Extract video description
		mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video description')
			return
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video height and width
		mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video height')
			return
		yv_video_height = mobj.group(1)

		mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video width')
			return
		yv_video_width = mobj.group(1)

		# Retrieve video playlist to extract media URL
		# I'm not completely sure what all these options are, but we
		# seem to need most of them, otherwise the server sends a 401.
		yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
		yv_bitrate = '700'  # according to Wikipedia this is hard-coded
		request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
				'&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
				'&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
		try:
			self.report_download_webpage(video_id)
			webpage = compat_urllib_request.urlopen(request).read()
		except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
			return

		# Extract media URL from playlist XML
		mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Unable to extract media URL')
			return
		video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
		video_url = unescapeHTML(video_url)

		return [{
			'id':		video_id.decode('utf-8'),
			'url':		video_url,
			'uploader':	video_uploader,
			'upload_date':	None,
			'title':	video_title,
			'ext':		video_extension.decode('utf-8'),
			'thumbnail':	video_thumbnail.decode('utf-8'),
			'description':	video_description,
		}]
1029
1030
class VimeoIE(InfoExtractor):
	"""Information extractor for vimeo.com."""

	# _VALID_URL matches Vimeo URLs
	_VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
	IE_NAME = u'vimeo'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

	def _real_extract(self, url, new_video=True):
		"""Extract video information from a Vimeo page.

		Parses the embedded player config JSON and picks the best
		available codec/quality combination.  Returns a single-element
		list with the info dictionary, or None after reporting an error
		through the downloader.
		"""
		# Extract ID from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Retrieve video webpage to extract further information
		request = compat_urllib_request.Request(url, None, std_headers)
		try:
			self.report_download_webpage(video_id)
			webpage = compat_urllib_request.urlopen(request).read()
		except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
			return

		# Now we begin extracting as much information as we can from what we
		# retrieved. First we extract the information common to all extractors,
		# and latter we extract those that are Vimeo specific.
		self.report_extraction(video_id)

		# Extract the config JSON
		try:
			config = webpage.split(' = {config:')[1].split(',assets:')[0]
			config = json.loads(config)
		except (IndexError, ValueError):
			# IndexError: config marker not found in the page;
			# ValueError: the blob is not valid JSON.  The previous bare
			# `except:` also swallowed KeyboardInterrupt/SystemExit, and the
			# split itself could raise an unhandled IndexError.
			self._downloader.trouble(u'ERROR: unable to extract info section')
			return

		# Extract title
		video_title = config["video"]["title"]

		# Extract uploader
		video_uploader = config["video"]["owner"]["name"]

		# Extract video thumbnail
		video_thumbnail = config["video"]["thumbnail"]

		# Extract video description
		video_description = get_element_by_id("description", webpage.decode('utf8'))
		if video_description: video_description = clean_html(video_description)
		else: video_description = ''

		# Extract upload date
		video_upload_date = None
		mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
		if mobj is not None:
			video_upload_date = mobj.group(1)

		# Vimeo specific: extract request signature and timestamp
		sig = config['request']['signature']
		timestamp = config['request']['timestamp']

		# Vimeo specific: extract video codec and quality information
		# First consider quality, then codecs, then take everything
		# TODO bind to format param
		codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
		files = { 'hd': [], 'sd': [], 'other': []}
		for codec_name, codec_extension in codecs:
			if codec_name in config["video"]["files"]:
				if 'hd' in config["video"]["files"][codec_name]:
					files['hd'].append((codec_name, codec_extension, 'hd'))
				elif 'sd' in config["video"]["files"][codec_name]:
					files['sd'].append((codec_name, codec_extension, 'sd'))
				else:
					files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

		for quality in ('hd', 'sd', 'other'):
			if len(files[quality]) > 0:
				video_quality = files[quality][0][2]
				video_codec = files[quality][0][0]
				video_extension = files[quality][0][1]
				self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
				break
		else:
			self._downloader.trouble(u'ERROR: no known codec found')
			return

		video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
					%(video_id, sig, timestamp, video_quality, video_codec.upper())

		return [{
			'id':		video_id,
			'url':		video_url,
			'uploader':	video_uploader,
			'upload_date':	video_upload_date,
			'title':	video_title,
			'ext':		video_extension,
			'thumbnail':	video_thumbnail,
			'description':	video_description,
		}]
1142
1143
class ArteTvIE(InfoExtractor):
        """arte.tv information extractor.

        Handles two kinds of pages on videos.arte.tv: regular "+7" catch-up
        video pages (extractPlus7Stream) and live-stream index pages matched
        by _LIVE_URL (extractLiveStream). Live streams are currently located
        but not handed back to the downloader (see _real_extract).
        """

        _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
        _LIVE_URL = r'index-[0-9]+\.html$'

        IE_NAME = u'arte.tv'

        def __init__(self, downloader=None):
                InfoExtractor.__init__(self, downloader)

        def report_download_webpage(self, video_id):
                """Report webpage download."""
                self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

        def report_extraction(self, video_id):
                """Report information extraction."""
                self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

        def fetch_webpage(self, url):
                """Download url and return its raw contents.

                Reports trouble through the downloader and returns None on
                network errors or invalid URLs.
                """
                self._downloader.increment_downloads()
                request = compat_urllib_request.Request(url)
                try:
                        self.report_download_webpage(url)
                        webpage = compat_urllib_request.urlopen(request).read()
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                        return
                except ValueError as err:
                        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
                        return
                return webpage

        def grep_webpage(self, url, regex, regexFlags, matchTuples):
                """Fetch url and extract regex groups from the page.

                matchTuples is a list of (group index, key, error message)
                triples; each matched group is stored under its key in the
                returned dict. Returns None (after reporting trouble) when
                the page cannot be fetched, the regex does not match, or a
                required group is missing.
                """
                page = self.fetch_webpage(url)
                # fetch_webpage has already reported the failure; searching
                # None would raise a TypeError.
                if page is None:
                        return
                mobj = re.search(regex, page, regexFlags)
                info = {}

                if mobj is None:
                        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
                        return

                for (i, key, err) in matchTuples:
                        if mobj.group(i) is None:
                                self._downloader.trouble(err)
                                return
                        else:
                                info[key] = mobj.group(i)

                return info

        def extractLiveStream(self, url):
                """Locate the rtmp URL of a live stream and return it.

                NOTE(review): the caller (_real_extract) currently discards
                this result, so live streams are detected but never actually
                downloaded.
                """
                video_lang = url.split('/')[-4]
                info = self.grep_webpage(
                        url,
                        r'src="(.*?/videothek_js.*?\.js)',
                        0,
                        [
                                (1, 'url', u'ERROR: Invalid URL: %s' % url)
                        ]
                )
                # grep_webpage already reported the problem
                if info is None:
                        return
                http_host = url.split('/')[2]
                next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
                info = self.grep_webpage(
                        next_url,
                        r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                                '(http://.*?\.swf).*?' +
                                '(rtmp://.*?)\'',
                        re.DOTALL,
                        [
                                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
                        ]
                )
                if info is None:
                        return
                # Previously this was a dead assignment; return the located
                # stream URL so callers can use it once live support lands.
                return u'%s/%s' % (info.get('url'), info.get('path'))

        def extractPlus7Stream(self, url):
                """Follow the chain of "+7" metadata documents to the HD url.

                Returns an info dictionary suitable for the FileDownloader,
                or None after reporting trouble.
                """
                video_lang = url.split('/')[-3]
                info = self.grep_webpage(
                        url,
                        r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
                        0,
                        [
                                (1, 'url', u'ERROR: Invalid URL: %s' % url)
                        ]
                )
                if info is None:
                        return
                next_url = compat_urllib_parse.unquote(info.get('url'))
                info = self.grep_webpage(
                        next_url,
                        r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
                        0,
                        [
                                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
                        ]
                )
                if info is None:
                        return
                next_url = compat_urllib_parse.unquote(info.get('url'))

                info = self.grep_webpage(
                        next_url,
                        r'<video id="(.*?)".*?>.*?' +
                                '<name>(.*?)</name>.*?' +
                                '<dateVideo>(.*?)</dateVideo>.*?' +
                                '<url quality="hd">(.*?)</url>',
                        re.DOTALL,
                        [
                                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
                        ]
                )
                if info is None:
                        return

                return {
                        'id':           info.get('id'),
                        'url':          compat_urllib_parse.unquote(info.get('url')),
                        'uploader':     u'arte.tv',
                        'upload_date':  info.get('date'),
                        'title':        info.get('title'),
                        'ext':          u'mp4',
                        'format':       u'NA',
                        'player_url':   None,
                }

        def _real_extract(self, url):
                video_id = url.split('/')[-1]
                self.report_extraction(video_id)

                if re.search(self._LIVE_URL, video_id) is not None:
                        # Live streams are located but not yet downloadable;
                        # the result is intentionally discarded for now.
                        self.extractLiveStream(url)
                        return
                else:
                        info = self.extractPlus7Stream(url)

                # Extraction failed and trouble was already reported.
                if info is None:
                        return

                return [info]
1279
1280
class GenericIE(InfoExtractor):
        """Generic last-resort information extractor.

        Tries to find a direct video URL embedded in an arbitrary webpage,
        e.g. a JW Player / SWFObject "file=..." flashvars parameter. Before
        scraping, _test_redirect follows URL-shortener style redirects and
        restarts the extraction chain on the final URL.
        """

        _VALID_URL = r'.*'
        IE_NAME = u'generic'

        def __init__(self, downloader=None):
                InfoExtractor.__init__(self, downloader)

        def report_download_webpage(self, video_id):
                """Report webpage download."""
                self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
                self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

        def report_extraction(self, video_id):
                """Report information extraction."""
                self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

        def report_following_redirect(self, new_url):
                """Report that a redirect is being followed."""
                self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

        def _test_redirect(self, url):
                """Check if it is a redirect, like url shorteners, in case restart chain."""
                class HeadRequest(compat_urllib_request.Request):
                        # Use HEAD so the redirect check transfers no body.
                        def get_method(self):
                                return "HEAD"

                class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
                        """
                        Subclass the HTTPRedirectHandler to make it use our
                        HeadRequest also on the redirected URL
                        """
                        def redirect_request(self, req, fp, code, msg, headers, newurl):
                                if code in (301, 302, 303, 307):
                                        newurl = newurl.replace(' ', '%20')
                                        # Drop body-related headers; a HEAD request has no body.
                                        newheaders = dict((k,v) for k,v in req.headers.items()
                                                                          if k.lower() not in ("content-length", "content-type"))
                                        return HeadRequest(newurl,
                                                                           headers=newheaders,
                                                                           origin_req_host=req.get_origin_req_host(),
                                                                           unverifiable=True)
                                else:
                                        raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

                class HTTPMethodFallback(compat_urllib_request.BaseHandler):
                        """
                        Fallback to GET if HEAD is not allowed (405 HTTP error)
                        """
                        def http_error_405(self, req, fp, code, msg, headers):
                                # Drain and close the failed response before retrying.
                                fp.read()
                                fp.close()

                                newheaders = dict((k,v) for k,v in req.headers.items()
                                                                  if k.lower() not in ("content-length", "content-type"))
                                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                                                 headers=newheaders,
                                                                                                 origin_req_host=req.get_origin_req_host(),
                                                                                                 unverifiable=True))

                # Build our opener
                opener = compat_urllib_request.OpenerDirector()
                for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                                                HTTPMethodFallback, HEADRedirectHandler,
                                                compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
                        opener.add_handler(handler())

                response = opener.open(HeadRequest(url))
                new_url = response.geturl()

                # No redirect happened: carry on with the generic extraction.
                if url == new_url:
                        return False

                self.report_following_redirect(new_url)
                self._downloader.download([new_url])
                return True

        def _real_extract(self, url):
                if self._test_redirect(url): return

                video_id = url.split('/')[-1]
                request = compat_urllib_request.Request(url)
                try:
                        self.report_download_webpage(video_id)
                        webpage = compat_urllib_request.urlopen(request).read()
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                        return
                except ValueError as err:
                        # since this is the last-resort InfoExtractor, if
                        # this error is thrown, it'll be thrown here
                        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
                        return

                self.report_extraction(video_id)
                # Start with something easy: JW Player in SWFObject
                mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
                if mobj is None:
                        # Broaden the search a little bit
                        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
                        return

                # It's possible that one of the regexes
                # matched, but returned an empty group:
                if mobj.group(1) is None:
                        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
                        return

                video_url = compat_urllib_parse.unquote(mobj.group(1))
                video_id = os.path.basename(video_url)

                # Derive the extension and the id from the URL's basename.
                video_extension = os.path.splitext(video_id)[1][1:]
                video_id = os.path.splitext(video_id)[0]

                # it's tempting to parse this further, but you would
                # have to take into account all the variations like
                #   Video Title - Site Name
                #   Site Name | Video Title
                #   Video Title - Tagline | Site Name
                # and so on and so forth; it's just not practical
                mobj = re.search(r'<title>(.*)</title>', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract title')
                        return
                video_title = mobj.group(1).decode('utf-8')

                # video uploader is domain name
                mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
                if mobj is None:
                        # was wrongly reported as 'unable to extract title'
                        self._downloader.trouble(u'ERROR: unable to extract uploader')
                        return
                video_uploader = mobj.group(1).decode('utf-8')

                return [{
                        'id':           video_id.decode('utf-8'),
                        'url':          video_url.decode('utf-8'),
                        'uploader':     video_uploader,
                        'upload_date':  None,
                        'title':        video_title,
                        'ext':          video_extension.decode('utf-8'),
                }]
1425
1426
class YoutubeSearchIE(InfoExtractor):
        """Information Extractor for YouTube search queries.

        Accepts 'ytsearch:Q' (first result), 'ytsearchN:Q' (first N results)
        and 'ytsearchall:Q' (up to _max_youtube_results results) and queues
        the matching watch URLs for download via the GData JSON-C API.
        """
        _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
        _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
        _max_youtube_results = 1000
        IE_NAME = u'youtube:search'

        def __init__(self, downloader=None):
                InfoExtractor.__init__(self, downloader)

        def report_download_page(self, query, pagenum):
                """Report attempt to download search page with given number."""
                query = query.decode(preferredencoding())
                self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

        def _real_extract(self, query):
                mobj = re.match(self._VALID_URL, query)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
                        return

                prefix, query = query.split(':')
                prefix = prefix[8:]  # strip the leading 'ytsearch'
                query = query.encode('utf-8')
                if prefix == '':
                        self._download_n_results(query, 1)
                        return
                elif prefix == 'all':
                        self._download_n_results(query, self._max_youtube_results)
                        return
                else:
                        # Keep the try body minimal: only int() may raise the
                        # ValueError meaning "prefix is not a number". The old
                        # code also caught ValueErrors raised while downloading.
                        try:
                                n = int(prefix)
                        except ValueError: # parsing prefix as integer fails
                                self._download_n_results(query, 1)
                                return
                        if n <= 0:
                                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                                return
                        elif n > self._max_youtube_results:
                                self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                                n = self._max_youtube_results
                        self._download_n_results(query, n)
                        return

        def _download_n_results(self, query, n):
                """Downloads a specified number of results for a query"""

                video_ids = []
                pagenum = 0
                limit = n

                # The API serves 50 results per page; 'limit' shrinks to the
                # real result count once the first response arrives.
                while (50 * pagenum) < limit:
                        self.report_download_page(query, pagenum+1)
                        result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
                        request = compat_urllib_request.Request(result_url)
                        try:
                                data = compat_urllib_request.urlopen(request).read()
                        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                                return
                        api_response = json.loads(data)['data']

                        # An empty result set carries no 'items' key at all.
                        if 'items' not in api_response:
                                self._downloader.trouble(u'[youtube] No video results')
                                return

                        new_ids = list(video['id'] for video in api_response['items'])
                        video_ids += new_ids

                        limit = min(n, api_response['totalItems'])
                        pagenum += 1

                if len(video_ids) > n:
                        video_ids = video_ids[:n]
                for id in video_ids:
                        self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
                return
1501
1502
class GoogleSearchIE(InfoExtractor):
        """Information Extractor for Google Video search queries.

        Accepts 'gvsearch:Q', 'gvsearchN:Q' and 'gvsearchall:Q' and queues
        the matching videoplay URLs for download by scraping the result
        pages.
        """
        _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
        _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
        _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
        _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
        _max_google_results = 1000
        IE_NAME = u'video.google:search'

        def __init__(self, downloader=None):
                InfoExtractor.__init__(self, downloader)

        def report_download_page(self, query, pagenum):
                """Report attempt to download playlist page with given number."""
                query = query.decode(preferredencoding())
                self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

        def _real_extract(self, query):
                mobj = re.match(self._VALID_URL, query)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
                        return

                prefix, query = query.split(':')
                prefix = prefix[8:]  # strip the leading 'gvsearch'
                query = query.encode('utf-8')
                if prefix == '':
                        self._download_n_results(query, 1)
                        return
                elif prefix == 'all':
                        self._download_n_results(query, self._max_google_results)
                        return
                else:
                        # Keep the try body minimal: only int() may raise the
                        # ValueError meaning "prefix is not a number". The old
                        # code also caught ValueErrors raised while downloading.
                        try:
                                n = int(prefix)
                        except ValueError: # parsing prefix as integer fails
                                self._download_n_results(query, 1)
                                return
                        if n <= 0:
                                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                                return
                        elif n > self._max_google_results:
                                self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                                n = self._max_google_results
                        self._download_n_results(query, n)
                        return

        def _download_n_results(self, query, n):
                """Downloads a specified number of results for a query"""

                video_ids = []
                pagenum = 0

                while True:
                        self.report_download_page(query, pagenum)
                        result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
                        request = compat_urllib_request.Request(result_url)
                        try:
                                page = compat_urllib_request.urlopen(request).read()
                        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                                return

                        # Extract video identifiers, skipping duplicates
                        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                                video_id = mobj.group(1)
                                if video_id not in video_ids:
                                        video_ids.append(video_id)
                                        if len(video_ids) == n:
                                                # Specified n videos reached
                                                for id in video_ids:
                                                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                                                return

                        # No "next page" link means we have seen all results.
                        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                                for id in video_ids:
                                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                                return

                        pagenum = pagenum + 1
1583
1584
class YahooSearchIE(InfoExtractor):
        """Information Extractor for Yahoo! Video search queries.

        Accepts 'yvsearch:Q', 'yvsearchN:Q' and 'yvsearchall:Q' and queues
        the matching watch URLs for download by scraping the result pages.
        """
        _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
        _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
        _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
        _MORE_PAGES_INDICATOR = r'\s*Next'
        _max_yahoo_results = 1000
        IE_NAME = u'video.yahoo:search'

        def __init__(self, downloader=None):
                InfoExtractor.__init__(self, downloader)

        def report_download_page(self, query, pagenum):
                """Report attempt to download playlist page with given number."""
                query = query.decode(preferredencoding())
                self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

        def _real_extract(self, query):
                mobj = re.match(self._VALID_URL, query)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
                        return

                prefix, query = query.split(':')
                prefix = prefix[8:]  # strip the leading 'yvsearch'
                query = query.encode('utf-8')
                if prefix == '':
                        self._download_n_results(query, 1)
                        return
                elif prefix == 'all':
                        self._download_n_results(query, self._max_yahoo_results)
                        return
                else:
                        # Keep the try body minimal: only int() may raise the
                        # ValueError meaning "prefix is not a number". The old
                        # code also caught ValueErrors raised while downloading.
                        try:
                                n = int(prefix)
                        except ValueError: # parsing prefix as integer fails
                                self._download_n_results(query, 1)
                                return
                        if n <= 0:
                                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                                return
                        elif n > self._max_yahoo_results:
                                self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                                n = self._max_yahoo_results
                        self._download_n_results(query, n)
                        return

        def _download_n_results(self, query, n):
                """Downloads a specified number of results for a query"""

                video_ids = []
                already_seen = set()
                pagenum = 1

                while True:
                        self.report_download_page(query, pagenum)
                        result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
                        request = compat_urllib_request.Request(result_url)
                        try:
                                page = compat_urllib_request.urlopen(request).read()
                        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                                return

                        # Extract video identifiers, skipping duplicates
                        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                                video_id = mobj.group(1)
                                if video_id not in already_seen:
                                        video_ids.append(video_id)
                                        already_seen.add(video_id)
                                        if len(video_ids) == n:
                                                # Specified n videos reached
                                                for id in video_ids:
                                                        self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                                                return

                        # No "Next" link means we have seen all results.
                        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                                for id in video_ids:
                                        self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                                return

                        pagenum = pagenum + 1
1667
1668
class YoutubePlaylistIE(InfoExtractor):
        """Information Extractor for YouTube playlists.

        Walks every page of a playlist (or artist/course list), collects the
        video ids, applies --playlist-start/--playlist-end and queues each
        watch URL for download.
        """

        _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
        _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
        _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
        _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
        IE_NAME = u'youtube:playlist'

        def __init__(self, downloader=None):
                InfoExtractor.__init__(self, downloader)

        def report_download_page(self, playlist_id, pagenum):
                """Report attempt to download playlist page with given number."""
                self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

        def _real_extract(self, url):
                # Parse the playlist URL
                match = re.match(self._VALID_URL, url)
                if match is None:
                        self._downloader.trouble(u'ERROR: invalid url: %s' % url)
                        return

                # A trailing video id means the URL names a single video.
                single_video = match.group(3)
                if single_video is not None:
                        self._downloader.download([single_video])
                        return

                # Artist lists ('a') use their own access point; everything
                # else is fetched through the plain 'p' playlist view.
                prefix = match.group(1)
                if prefix == 'a':
                        access = 'artist'
                else:
                        prefix = 'p'
                        access = 'view_play_list'
                playlist_id = match.group(2)

                collected = []
                page_num = 1
                while True:
                        self.report_download_page(playlist_id, page_num)
                        page_url = self._TEMPLATE_URL % (access, prefix, playlist_id, page_num)
                        request = compat_urllib_request.Request(page_url)
                        try:
                                page = compat_urllib_request.urlopen(request).read()
                        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                                return

                        # Collect video ids, deduplicating within this page only.
                        page_ids = []
                        for vid_match in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                                vid = vid_match.group(1)
                                if vid not in page_ids:
                                        page_ids.append(vid)
                        collected.extend(page_ids)

                        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                                break
                        page_num += 1

                # Honour --playlist-start/--playlist-end (1-based; -1 = open end).
                start = self._downloader.params.get('playliststart', 1) - 1
                end = self._downloader.params.get('playlistend', -1)
                collected = collected[start:] if end == -1 else collected[start:end]

                for vid in collected:
                        self._downloader.download(['http://www.youtube.com/watch?v=%s' % vid])
                return
1740
1741
class YoutubeChannelIE(InfoExtractor):
        """Information Extractor for YouTube channels."""

        _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
        _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
        _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
        IE_NAME = u'youtube:channel'

        def report_download_page(self, channel_id, pagenum):
                """Report attempt to download channel page with given number."""
                self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

        def _real_extract(self, url):
                """Walk the channel's paginated video list and enqueue every video id."""
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid url: %s' % url)
                        return
                channel_id = mobj.group(1)

                video_ids = []
                pagenum = 1
                while True:
                        self.report_download_page(channel_id, pagenum)
                        request = compat_urllib_request.Request(self._TEMPLATE_URL % (channel_id, pagenum))
                        try:
                                page = compat_urllib_request.urlopen(request).read()
                        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                                return

                        # Gather ids from this page, keeping first-seen order and
                        # skipping within-page duplicates.
                        page_ids = []
                        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                                if match.group(1) not in page_ids:
                                        page_ids.append(match.group(1))
                        video_ids.extend(page_ids)

                        # No "Next" button means we reached the last page.
                        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                                break
                        pagenum += 1

                for video_id in video_ids:
                        self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
                return
1790
1791
class YoutubeUserIE(InfoExtractor):
        """Information Extractor for YouTube users."""

        _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
        _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
        _GDATA_PAGE_SIZE = 50
        _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
        _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
        IE_NAME = u'youtube:user'

        def __init__(self, downloader=None):
                InfoExtractor.__init__(self, downloader)

        def report_download_page(self, username, start_index):
                """Report attempt to download user page."""
                self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

        def _real_extract(self, url):
                """Collect every upload id for a user via the GData API and enqueue them."""
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid url: %s' % url)
                        return
                username = mobj.group(1)

                # The GData feed caps each response at _GDATA_PAGE_SIZE entries,
                # so keep requesting pages until one comes back short.
                video_ids = []
                pagenum = 0
                while True:
                        start_index = pagenum * self._GDATA_PAGE_SIZE + 1
                        self.report_download_page(username, start_index)

                        request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
                        try:
                                page = compat_urllib_request.urlopen(request).read()
                        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                                return

                        # First-seen order, skipping duplicates within the page.
                        ids_in_page = []
                        for match in re.finditer(self._VIDEO_INDICATOR, page):
                                if match.group(1) not in ids_in_page:
                                        ids_in_page.append(match.group(1))
                        video_ids.extend(ids_in_page)

                        # A page that is not "full" must be the last one; no need
                        # to query again.
                        if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                                break
                        pagenum += 1

                # Apply --playlist-start / --playlist-end (1-based; -1 means "to the end").
                all_ids_count = len(video_ids)
                playliststart = self._downloader.params.get('playliststart', 1) - 1
                playlistend = self._downloader.params.get('playlistend', -1)
                if playlistend == -1:
                        video_ids = video_ids[playliststart:]
                else:
                        video_ids = video_ids[playliststart:playlistend]

                self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                                (username, all_ids_count, len(video_ids)))

                for video_id in video_ids:
                        self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1873
1874
class BlipTVUserIE(InfoExtractor):
        """Information Extractor for blip.tv users."""

        _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
        _PAGE_SIZE = 12
        IE_NAME = u'blip.tv:user'

        def __init__(self, downloader=None):
                InfoExtractor.__init__(self, downloader)

        def report_download_page(self, username, pagenum):
                """Report attempt to download user page."""
                self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                                (self.IE_NAME, username, pagenum))

        def _real_extract(self, url):
                """Page through a blip.tv user's episode list and enqueue every video."""
                # Extract username
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid url: %s' % url)
                        return

                username = mobj.group(1)

                page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

                request = compat_urllib_request.Request(url)

                try:
                        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                        self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                        return

                # The numeric users_id needed by the AJAX endpoint is embedded in
                # the user page markup. Fail with a clear error when it is absent
                # instead of crashing with an uncaught AttributeError on
                # mobj.group() (the except above only catches network errors).
                mobj = re.search(r'data-users-id="([^"]+)"', page)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract blip.tv user id from: %s' % url)
                        return
                page_base = page_base % mobj.group(1)

                # Download video ids using BlipTV Ajax calls. Result size per
                # query is limited (currently to 12 videos) so we need to query
                # page by page until there are no video ids - it means we got
                # all of them.

                video_ids = []
                pagenum = 1

                while True:
                        self.report_download_page(username, pagenum)

                        request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

                        try:
                                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
                        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                                # compat_str, not str, for consistency with the rest of the file.
                                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                                return

                        # Extract video identifiers
                        ids_in_page = []

                        for mobj in re.finditer(r'href="/([^"]+)"', page):
                                if mobj.group(1) not in ids_in_page:
                                        ids_in_page.append(unescapeHTML(mobj.group(1)))

                        video_ids.extend(ids_in_page)

                        # A little optimization - if current page is not
                        # "full", ie. does not contain PAGE_SIZE video ids then
                        # we can assume that this page is the last one - there
                        # are no more ids on further pages - no need to query
                        # again.

                        if len(ids_in_page) < self._PAGE_SIZE:
                                break

                        pagenum += 1

                # Apply --playlist-start / --playlist-end (1-based; -1 means "to the end").
                all_ids_count = len(video_ids)
                playliststart = self._downloader.params.get('playliststart', 1) - 1
                playlistend = self._downloader.params.get('playlistend', -1)

                if playlistend == -1:
                        video_ids = video_ids[playliststart:]
                else:
                        video_ids = video_ids[playliststart:playlistend]

                self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                                (self.IE_NAME, username, all_ids_count, len(video_ids)))

                for video_id in video_ids:
                        self._downloader.download([u'http://blip.tv/'+video_id])
1965
1966
class DepositFilesIE(InfoExtractor):
        """Information extractor for depositfiles.com"""

        _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
        IE_NAME = u'DepositFiles'

        def __init__(self, downloader=None):
                InfoExtractor.__init__(self, downloader)

        def report_download_webpage(self, file_id):
                """Report webpage download."""
                self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

        def report_extraction(self, file_id):
                """Report information extraction."""
                self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

        def _real_extract(self, url):
                """Fetch the file page (free-download flag set) and extract the direct file URL."""
                file_id = url.split('/')[-1]
                # Rebuild the URL under the English locale.
                url = 'http://depositfiles.com/en/files/' + file_id

                # Request the page as if the 'Free download' button had been pressed.
                post_data = compat_urllib_parse.urlencode({ 'gateway_result' : '1' })
                request = compat_urllib_request.Request(url, post_data)
                try:
                        self.report_download_webpage(file_id)
                        webpage = compat_urllib_request.urlopen(request).read()
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                        self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
                        return

                # Look for the real file URL in the download form.
                match = re.search(r'<form action="(http://fileshare.+?)"', webpage)
                if match is None or match.group(1) is None:
                        # No form found - try to surface the site's own explanation.
                        reason = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
                        if reason is not None and reason.group(1) is not None:
                                restriction_message = re.sub('\s+', ' ', reason.group(1)).strip()
                                self._downloader.trouble(u'ERROR: %s' % restriction_message)
                        else:
                                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
                        return

                file_url = match.group(1)
                file_extension = os.path.splitext(file_url)[1][1:]

                # Look for the file title.
                title_match = re.search(r'<b title="(.*?)">', webpage)
                if title_match is None:
                        self._downloader.trouble(u'ERROR: unable to extract title')
                        return
                file_title = title_match.group(1).decode('utf-8')

                return [{
                        'id':           file_id.decode('utf-8'),
                        'url':          file_url.decode('utf-8'),
                        'uploader':     None,
                        'upload_date':  None,
                        'title':        file_title,
                        'ext':          file_extension.decode('utf-8'),
                }]
2029
2030
class FacebookIE(InfoExtractor):
        """Information Extractor for Facebook"""

        _WORKING = False
        _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
        _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
        _NETRC_MACHINE = 'facebook'
        # Ordered best-first; the ordering drives both default format choice
        # and --format-limit handling in _real_extract.
        _available_formats = ['video', 'highqual', 'lowqual']
        _video_extensions = {
                'video': 'mp4',
                'highqual': 'mp4',
                'lowqual': 'mp4',
        }
        IE_NAME = u'facebook'

        def __init__(self, downloader=None):
                InfoExtractor.__init__(self, downloader)

        def _reporter(self, message):
                """Add header and report message."""
                self._downloader.to_screen(u'[facebook] %s' % message)

        def report_login(self):
                """Report attempt to log in."""
                self._reporter(u'Logging in')

        def report_video_webpage_download(self, video_id):
                """Report attempt to download video webpage."""
                self._reporter(u'%s: Downloading video webpage' % video_id)

        def report_information_extraction(self, video_id):
                """Report attempt to extract video information."""
                self._reporter(u'%s: Extracting video information' % video_id)

        def _parse_page(self, video_webpage):
                """Extract video information from page.

                Returns a dict with any of 'title', 'description', 'owner',
                'thumbnail' that matched, plus a 'video_urls' dict mapping
                format name to URL (possibly empty).
                """
                # General data: each regex's first group holds the field value.
                data = {'title': r'\("video_title", "(.*?)"\)',
                        'description': r'<div class="datawrap">(.*?)</div>',
                        'owner': r'\("video_owner_name", "(.*?)"\)',
                        'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
                        }
                video_info = {}
                for piece in data.keys():
                        mobj = re.search(data[piece], video_webpage)
                        if mobj is not None:
                                # Values are JS-escaped Unicode inside the generally
                                # utf-8 page.
                                video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))

                # Video urls
                video_urls = {}
                for fmt in self._available_formats:
                        mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
                        if mobj is not None:
                                # URL is in a Javascript segment inside an escaped Unicode format within
                                # the generally utf-8 page
                                video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
                video_info['video_urls'] = video_urls

                return video_info

        def _real_initialize(self):
                """Log in to Facebook using --username/--password or .netrc, if provided."""
                if self._downloader is None:
                        return

                useremail = None
                password = None
                downloader_params = self._downloader.params

                # Attempt to use provided username and password or .netrc data
                if downloader_params.get('username', None) is not None:
                        useremail = downloader_params['username']
                        password = downloader_params['password']
                elif downloader_params.get('usenetrc', False):
                        try:
                                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                                if info is not None:
                                        useremail = info[0]
                                        password = info[2]
                                else:
                                        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
                        except (IOError, netrc.NetrcParseError) as err:
                                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                                return

                # No credentials: proceed anonymously.
                if useremail is None:
                        return

                # Log in
                login_form = {
                        'email': useremail,
                        'pass': password,
                        'login': 'Log+In'
                        }
                request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
                try:
                        self.report_login()
                        login_results = compat_urllib_request.urlopen(request).read()
                        # The login form being echoed back means the login failed.
                        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                                return
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                        self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
                        return

        def _real_extract(self, url):
                """Download the video page, parse metadata and return info dicts.

                Returns a list with one dict per selected format, or None after
                reporting trouble.
                """
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
                        return
                video_id = mobj.group('ID')

                # Get video webpage
                self.report_video_webpage_download(video_id)
                request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
                try:
                        page = compat_urllib_request.urlopen(request)
                        video_webpage = page.read()
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                        self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
                        return

                # Start extracting information
                self.report_information_extraction(video_id)

                # Extract information
                video_info = self._parse_page(video_webpage)

                # uploader
                if 'owner' not in video_info:
                        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
                        return
                video_uploader = video_info['owner']

                # title
                if 'title' not in video_info:
                        self._downloader.trouble(u'ERROR: unable to extract video title')
                        return
                video_title = video_info['title']
                video_title = video_title.decode('utf-8')

                # thumbnail image (missing thumbnail is only a warning)
                if 'thumbnail' not in video_info:
                        self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
                        video_thumbnail = ''
                else:
                        video_thumbnail = video_info['thumbnail']

                # upload date
                upload_date = None
                if 'upload_date' in video_info:
                        upload_time = video_info['upload_date']
                        timetuple = email.utils.parsedate_tz(upload_time)
                        if timetuple is not None:
                                try:
                                        # strftime accepts the 9-tuple prefix of
                                        # parsedate_tz's 10-tuple result.
                                        upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                                except:
                                        pass

                # description
                video_description = video_info.get('description', 'No description available.')

                url_map = video_info['video_urls']
                # BUG FIX: video_url_list used to be assigned only inside
                # "if len(url_map) > 0", leaving it unbound (NameError) when the
                # page yielded no video URLs. Fail explicitly instead.
                if len(url_map.keys()) == 0:
                        self._downloader.trouble(u'ERROR: no known formats available for video')
                        return

                # Decide which formats to download
                req_format = self._downloader.params.get('format', None)
                format_limit = self._downloader.params.get('format_limit', None)

                if format_limit is not None and format_limit in self._available_formats:
                        format_list = self._available_formats[self._available_formats.index(format_limit):]
                else:
                        format_list = self._available_formats
                existing_formats = [x for x in format_list if x in url_map]
                if len(existing_formats) == 0:
                        self._downloader.trouble(u'ERROR: no known formats available for video')
                        return
                if req_format is None:
                        video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
                elif req_format == 'worst':
                        video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
                elif req_format == '-1':
                        video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                else:
                        # Specific format
                        if req_format not in url_map:
                                self._downloader.trouble(u'ERROR: requested format not available')
                                return
                        video_url_list = [(req_format, url_map[req_format])] # Specific format

                results = []
                for format_param, video_real_url in video_url_list:
                        # Extension
                        video_extension = self._video_extensions.get(format_param, 'mp4')

                        results.append({
                                'id':           video_id.decode('utf-8'),
                                'url':          video_real_url.decode('utf-8'),
                                'uploader':     video_uploader.decode('utf-8'),
                                'upload_date':  upload_date,
                                'title':        video_title,
                                'ext':          video_extension.decode('utf-8'),
                                'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
                                'thumbnail':    video_thumbnail.decode('utf-8'),
                                'description':  video_description.decode('utf-8'),
                        })
                return results
2236
class BlipTVIE(InfoExtractor):
        """Information extractor for blip.tv"""

        _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
        # Pulls the filename extension off the end of a media URL.
        _URL_EXT = r'^.*\.([a-z0-9]+)$'
        IE_NAME = u'blip.tv'

        def report_extraction(self, file_id):
                """Report information extraction."""
                self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

        def report_direct_download(self, title):
                """Report that the URL points directly at a media file."""
                self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

        def _real_extract(self, url):
                """Fetch the JSON description of the video (or detect a direct
                media URL by Content-Type) and return a one-element info list."""
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
                        return

                # Append the JSON-skin parameters with the correct separator.
                if '?' in url:
                        cchar = '&'
                else:
                        cchar = '?'
                json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
                request = compat_urllib_request.Request(json_url.encode('utf-8'))
                self.report_extraction(mobj.group(1))
                info = None
                try:
                        urlh = compat_urllib_request.urlopen(request)
                        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                                # The URL points straight at a media file: derive id,
                                # title and extension from the URL basename and hand
                                # the already-open handle on via 'urlhandle'.
                                basename = url.split('/')[-1]
                                title,ext = os.path.splitext(basename)
                                title = title.decode('UTF-8')
                                ext = ext.replace('.', '')
                                self.report_direct_download(title)
                                info = {
                                        'id': title,
                                        'url': url,
                                        'uploader': None,
                                        'upload_date': None,
                                        'title': title,
                                        'ext': ext,
                                        'urlhandle': urlh
                                }
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                        self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                        return
                if info is None: # Regular URL
                        try:
                                # urlh was opened in the try block above; read the
                                # JSON body from the same response.
                                json_code = urlh.read()
                        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                                return

                        try:
                                json_data = json.loads(json_code)
                                # The payload is either wrapped in a 'Post' object or bare.
                                if 'Post' in json_data:
                                        data = json_data['Post']
                                else:
                                        data = json_data

                                # e.g. '08-15-12 10:30AM' -> '20120815'
                                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                                video_url = data['media']['url']
                                umobj = re.match(self._URL_EXT, video_url)
                                if umobj is None:
                                        raise ValueError('Can not determine filename extension')
                                ext = umobj.group(1)

                                info = {
                                        'id': data['item_id'],
                                        'url': video_url,
                                        'uploader': data['display_name'],
                                        'upload_date': upload_date,
                                        'title': data['title'],
                                        'ext': ext,
                                        'format': data['media']['mimeType'],
                                        'thumbnail': data['thumbnailUrl'],
                                        'description': data['description'],
                                        'player_url': data['embedUrl']
                                }
                        except (ValueError,KeyError) as err:
                                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                                return

                # NOTE(review): this mutates the shared std_headers dict, so every
                # subsequent request (including the actual download) is sent with
                # an iTunes User-Agent - presumably required by blip.tv; confirm.
                std_headers['User-Agent'] = 'iTunes/10.6.1'
                return [info]
2325
2326
class MyVideoIE(InfoExtractor):
	"""Information Extractor for myvideo.de."""

	_VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
	IE_NAME = u'myvideo'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

	def _real_extract(self,url):
		"""Extract the flv URL and title for a myvideo.de watch page."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			# Bug fix: this previously called self._download.trouble (no such
			# attribute), raising AttributeError instead of reporting the URL.
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Get video webpage
		request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = compat_urllib_request.urlopen(request).read()
		except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
			return

		self.report_extraction(video_id)
		# The thumbnail link exposes the media base path; the flv for this
		# video id lives directly under that path.
		mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
				 webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		video_url = mobj.group(1) + ('/%s.flv' % video_id)

		mobj = re.search('<title>([^<]+)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return

		video_title = mobj.group(1)

		return [{
			'id':           video_id,
			'url':          video_url,
			'uploader':     None,
			'upload_date':  None,
			'title':        video_title,
			'ext':          u'flv',
		}]
2384
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Accepts either shorthand pseudo-URLs (":tds", ":colbertreport", ...)
	# or full-episode URLs on thedailyshow.com / colbertnation.com.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	# Bitrate identifiers the site offers, highest first.
	_available_formats = ['3500', '2200', '1700', '1200', '750', '400']

	# Container extension per bitrate id (all mp4).
	_video_extensions = {
		'3500': 'mp4',
		'2200': 'mp4',
		'1700': 'mp4',
		'1200': 'mp4',
		'750': 'mp4',
		'400': 'mp4',
	}
	# Pixel dimensions per bitrate id, shown by --list-formats.
	_video_dimensions = {
		'3500': '1280x720',
		'2200': '960x540',
		'1700': '768x432',
		'1200': '640x360',
		'750': '512x288',
		'400': '384x216',
	}

	def report_extraction(self, episode_id):
		"""Report the start of information extraction."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report download of the per-media configuration XML."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report download of the episode's mrss index."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report resolution of the SWF player URL."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)


	def _print_formats(self, formats):
		"""Print the given bitrate ids with their extension and dimensions."""
		print('Available formats:')
		for x in formats:
			print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


	def _real_extract(self, url):
		"""Extract one info dict per act of the requested episode.

		Returns a list of info dictionaries, or None on error (after
		calling self._downloader.trouble). Note: with --list-formats this
		prints the formats of the first act and returns None.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Shorthand forms (":tds" etc.) map to the show's full-episodes
		# landing page, which redirects to the newest episode below.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = u'http://www.thedailyshow.com/full-episodes/'
			else:
				url = u'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# No explicit episode means "download the newest one".
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = compat_urllib_request.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = compat_urllib_request.urlopen(req)
			html = htmlHandle.read()
		except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
			return
		if dlNewest:
			# The landing page redirects to the newest episode; re-match the
			# final URL to recover the concrete episode title.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# The Flash player URL embeds an mtvnservices media uri.
		mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)

		if len(mMovieParams) == 0:
			# The Colbert Report embeds the information in a without
			# a URL prefix; so extract the alternate reference
			# and then add the URL prefix manually.

			altMovieParams = re.findall('data-mgid="([^"]*episode.*?:.*?)"', html)
			if len(altMovieParams) == 0:
				self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
				return
			else:
				mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
		
		# Resolve the player URL through its redirect chain. The result is
		# currently unused in the returned info (see 'player_url' below).
		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
			return

		# The mrss index lists one <item> per act of the episode.
		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = compat_urllib_request.urlopen(indexUrl).read()
		except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
			return

		results = []

		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			# guid looks like "...:<showId>.com:...:<shortMediaId>" —
			# TODO confirm exact guid layout against a live feed.
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			# mediagen config lists one <rendition> per available bitrate.
			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						compat_urllib_parse.urlencode({'uri': mediaId}))
			configReq = compat_urllib_request.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = compat_urllib_request.urlopen(configReq).read()
			except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
				return

			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue
			
			if self._downloader.params.get('listformats', None):
				self._print_formats([i[0] for i in turls])
				return

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			# Get the format arg from the arg stream
			req_format = self._downloader.params.get('format', None)

			# Select format if we can find one
			for f,v in turls:
				if f == req_format:
					format, video_url = f, v
					break

			# Patch to download from alternative CDN, which does not
			# break on current RTMPDump builds
			broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
			better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

			if video_url.startswith(broken_cdn):
				video_url = video_url.replace(broken_cdn, better_cdn)

			effTitle = showId + u'-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': None #playerUrl
			}

			results.append(info)
			
		return results
2572
2573
class EscapistIE(InfoExtractor):
	"""Information extractor for The Escapist """

	_VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
	IE_NAME = u'escapist'

	def report_extraction(self, showName):
		"""Announce that metadata extraction has started."""
		self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

	def report_config_download(self, showName):
		"""Announce that the player configuration is being fetched."""
		self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

	def _real_extract(self, url):
		"""Resolve an Escapist episode page to its video metadata."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		show_name = mobj.group('showname')
		video_id = mobj.group('episode')

		self.report_extraction(show_name)
		try:
			handle = compat_urllib_request.urlopen(url)
			raw_page = handle.read()
			charset_match = re.match(r'text/html; charset="?([^"]+)"?', handle.headers['Content-Type'])
			encoding = charset_match.group(1) if charset_match else 'utf-8'
			page = raw_page.decode(encoding)
		except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
			return

		# Everything we need is carried in <meta> tags on the page; the
		# player URL in turn embeds the configuration URL as a parameter.
		description = unescapeHTML(re.search('<meta name="description" content="([^"]*)"', page).group(1))
		thumbnail_url = unescapeHTML(re.search('<meta property="og:image" content="([^"]*)"', page).group(1))
		player_url = unescapeHTML(re.search('<meta property="og:video" content="([^"]*)"', page).group(1))
		config_url = compat_urllib_parse.unquote(re.search('config=(.*)$', player_url).group(1))

		self.report_config_download(show_name)
		try:
			config_json = compat_urllib_request.urlopen(config_url).read()
		except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
			return

		# Technically, it's JavaScript, not JSON
		config_json = config_json.replace("'", '"')

		try:
			config = json.loads(config_json)
		except (ValueError,) as err:
			self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
			return

		# The second playlist entry holds the actual media URL.
		video_url = config['playlist'][1]['url']

		return [{
			'id': video_id,
			'url': video_url,
			'uploader': show_name,
			'upload_date': None,
			'title': show_name,
			'ext': 'flv',
			'thumbnail': thumbnail_url,
			'description': description,
			'player_url': player_url,
		}]
2645
2646
class CollegeHumorIE(InfoExtractor):
	"""Information extractor for collegehumor.com"""

	_VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
	IE_NAME = u'collegehumor'

	def report_webpage(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Resolve a collegehumor.com page to its downloadable video info."""
		url_match = re.match(self._VALID_URL, url)
		if url_match is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = url_match.group('videoid')

		self.report_webpage(video_id)
		try:
			webpage = compat_urllib_request.urlopen(compat_urllib_request.Request(url)).read()
		except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
			return

		# The page references an internal numeric id distinct from the URL id.
		id_match = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
		if id_match is None:
			self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
			return
		internal_video_id = id_match.group('internalvideoid')

		info = {
			'id': video_id,
			'internal_id': internal_video_id,
			'uploader': None,
			'upload_date': None,
		}

		self.report_extraction(video_id)
		metadata_url = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
		try:
			metadata_xml = compat_urllib_request.urlopen(metadata_url).read()
		except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
			return

		# Pull title, description, media URL and thumbnail out of the
		# moogaloop metadata document.
		document = xml.etree.ElementTree.fromstring(metadata_xml)
		try:
			video_node = document.findall('./video')[0]
			info['description'] = video_node.findall('./description')[0].text
			info['title'] = video_node.findall('./caption')[0].text
			info['url'] = video_node.findall('./file')[0].text
			info['thumbnail'] = video_node.findall('./thumbnail')[0].text
			info['ext'] = info['url'].rpartition('.')[2]
		except IndexError:
			self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
			return

		return [info]
2710
2711
class XVideosIE(InfoExtractor):
	"""Information extractor for xvideos.com"""

	_VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
	IE_NAME = u'xvideos'

	def report_webpage(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Extract the flv URL, title and thumbnail for an xvideos page."""
		url_mobj = re.match(self._VALID_URL, url)
		if url_mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = url_mobj.group(1).decode('utf-8')

		self.report_webpage(video_id)

		request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
		try:
			webpage = compat_urllib_request.urlopen(request).read()
		except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
			return

		self.report_extraction(video_id)

		# The media URL is stashed, percent-encoded, in a query parameter
		# of the player setup.
		flv_mobj = re.search(r'flv_url=(.+?)&', webpage)
		if flv_mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video url')
			return
		video_url = compat_urllib_parse.unquote(flv_mobj.group(1).decode('utf-8'))

		# Title comes from the <title> tag, minus the site suffix.
		title_mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
		if title_mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = title_mobj.group(1).decode('utf-8')

		# Thumbnail: the entire matched URL (group 0) is the image address.
		thumb_mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
		if thumb_mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
			return
		video_thumbnail = thumb_mobj.group(0).decode('utf-8')

		return [{
			'id': video_id,
			'url': video_url,
			'uploader': None,
			'upload_date': None,
			'title': video_title,
			'ext': 'flv',
			'thumbnail': video_thumbnail,
			'description': None,
		}]
2780
2781
class SoundcloudIE(InfoExtractor):
	"""Information extractor for soundcloud.com
	   To access the media, the uid of the song and a stream token
	   must be extracted from the page source and the script must make
	   a request to media.soundcloud.com/crossdomain.xml. Then
	   the media can be grabbed by requesting from an url composed
	   of the stream token and uid
	 """

	_VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
	IE_NAME = u'soundcloud'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_webpage(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Extract stream URL, title, uploader and date for a soundcloud track."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# extract uploader (which is in the url)
		uploader = mobj.group(1).decode('utf-8')
		# extract simple title (uploader + slug of song title)
		slug_title =  mobj.group(2).decode('utf-8')
		simple_title = uploader + u'-' + slug_title

		self.report_webpage('%s/%s' % (uploader, slug_title))

		request = compat_urllib_request.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
		try:
			webpage = compat_urllib_request.urlopen(request).read()
		except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
			return

		self.report_extraction('%s/%s' % (uploader, slug_title))

		# extract uid and stream token that soundcloud hands out for access
		mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
		if mobj is None:
			# Bug fix: previously video_id/stream_token stayed unbound here
			# and the method crashed later with NameError; fail cleanly.
			self._downloader.trouble(u'ERROR: unable to extract uid and stream token')
			return
		video_id = mobj.group(1)
		stream_token = mobj.group(2)

		# extract unsimplified title
		mobj = re.search('"title":"(.*?)",', webpage)
		if mobj:
			title = mobj.group(1).decode('utf-8')
		else:
			title = simple_title

		# construct media url (with uid/token)
		mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
		mediaURL = mediaURL % (video_id, stream_token)

		# description
		description = u'No description available'
		mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
		if mobj:
			description = mobj.group(1)

		# upload date
		upload_date = None
		mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
		if mobj:
			try:
				upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
			except Exception as err:
				self._downloader.to_stderr(compat_str(err))

		# for soundcloud, a request to a cross domain is required for cookies
		# NOTE(review): this request is built but never actually sent; also,
		# std_headers was being passed as the positional `data` argument —
		# corrected here to occupy the headers slot.
		request = compat_urllib_request.Request('http://media.soundcloud.com/crossdomain.xml', None, std_headers)

		return [{
			'id':           video_id.decode('utf-8'),
			'url':          mediaURL,
			'uploader':     uploader.decode('utf-8'),
			'upload_date':  upload_date,
			'title':        title,
			'ext':          u'mp3',
			'description': description.decode('utf-8')
		}]
2872
2873
class InfoQIE(InfoExtractor):
	"""Information extractor for infoq.com"""

	_VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
	IE_NAME = u'infoq'

	def report_webpage(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Extract the RTMP URL, title and description from an InfoQ page."""
		if re.match(self._VALID_URL, url) is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		self.report_webpage(url)

		try:
			webpage = compat_urllib_request.urlopen(compat_urllib_request.Request(url)).read()
		except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
			return

		self.report_extraction(url)

		# The media path is base64-encoded in the jsclassref attribute.
		match = re.search(r"jsclassref='([^']*)'", webpage)
		if match is None:
			self._downloader.trouble(u'ERROR: unable to extract video url')
			return
		video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(match.group(1).decode('base64'))

		match = re.search(r'contentTitle = "(.*?)";', webpage)
		if match is None:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = match.group(1).decode('utf-8')

		# Description is optional; fall back to a placeholder.
		video_description = u'No description available.'
		match = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
		if match is not None:
			video_description = match.group(1).decode('utf-8')

		# Derive id and extension from the final path component of the URL.
		video_filename = video_url.split('/')[-1]
		video_id, extension = video_filename.split('.')

		return [{
			'id': video_id,
			'url': video_url,
			'uploader': None,
			'upload_date': None,
			'title': video_title,
			'ext': extension, # Extension is always(?) mp4, but seems to be flv
			'thumbnail': None,
			'description': video_description,
		}]
2942
class MixcloudIE(InfoExtractor):
	"""Information extractor for www.mixcloud.com"""
	_VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
	IE_NAME = u'mixcloud'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_json(self, file_id):
		"""Report JSON download."""
		self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

	def get_urls(self, jsonData, fmt, bitrate='best'):
		"""Get urls from 'audio_formats' section in json.

		jsonData[fmt] is either a dict mapping bitrate -> url list, or
		directly a url list when no bitrate info exists.  When the
		requested bitrate is None/'best'/unavailable, the highest
		available bitrate is chosen.
		"""
		try:
			bitrate_list = jsonData[fmt]
			if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
				bitrate = max(bitrate_list) # select highest

			url_list = jsonData[fmt][bitrate]
		except TypeError: # we have no bitrate info.
			url_list = jsonData[fmt]
		return url_list

	def check_urls(self, url_list):
		"""Returns 1st active url from list, or None if none respond."""
		for url in url_list:
			try:
				compat_urllib_request.urlopen(url)
				return url
			except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
				continue # dead link, try the next candidate

		return None

	def _print_formats(self, formats):
		"""Print every available format/bitrate/extension combination."""
		print('Available formats:')
		for fmt in formats.keys():
			for b in formats[fmt]:
				try:
					ext = formats[fmt][b][0]
					print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
				except TypeError: # we have no bitrate info
					ext = formats[fmt][0]
					print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
					break

	def _real_extract(self, url):
		"""Resolve a mixcloud.com URL into its downloadable audio file."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		# extract uploader & filename from url
		uploader = mobj.group(1).decode('utf-8')
		file_id = uploader + "-" + mobj.group(2).decode('utf-8')

		# construct API request
		file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
		# retrieve .json file with links to files
		request = compat_urllib_request.Request(file_url)
		try:
			self.report_download_json(file_url)
			jsonData = compat_urllib_request.urlopen(request).read()
		except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
			return

		# parse JSON
		json_data = json.loads(jsonData)
		player_url = json_data['player_swf_url']
		formats = dict(json_data['audio_formats'])

		req_format = self._downloader.params.get('format', None)

		if self._downloader.params.get('listformats', None):
			self._print_formats(formats)
			return

		if req_format is None or req_format == 'best':
			# probe the formats until one yields a live url
			for format_param in formats.keys():
				url_list = self.get_urls(formats, format_param)
				# check urls
				file_url = self.check_urls(url_list)
				if file_url is not None:
					break # got it!
		else:
			if req_format not in formats.keys():
				self._downloader.trouble(u'ERROR: format is not available')
				return

			url_list = self.get_urls(formats, req_format)
			file_url = self.check_urls(url_list)
			format_param = req_format

		if file_url is None:
			# none of the candidate urls responded; report instead of
			# crashing on None.decode() below
			self._downloader.trouble(u'ERROR: unable to extract file url')
			return

		return [{
			'id': file_id.decode('utf-8'),
			'url': file_url.decode('utf-8'),
			'uploader':	uploader.decode('utf-8'),
			'upload_date': None,
			'title': json_data['name'],
			'ext': file_url.split('.')[-1].decode('utf-8'),
			'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
			'thumbnail': json_data['thumbnail_url'],
			'description': json_data['description'],
			'player_url': player_url.decode('utf-8'),
		}]
3055
class StanfordOpenClassroomIE(InfoExtractor):
	"""Information extractor for Stanford's Open ClassRoom"""

	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
	IE_NAME = u'stanfordoc'

	def report_download_webpage(self, objid):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Extract a video, a course page, or the whole site root.

		The URL decides the granularity: a single VideoPage yields one
		info dict; a CoursePage or the HomePage is expanded recursively
		by re-entering self.extract() on every linked page.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('course') and mobj.group('video'): # A specific video
			course = mobj.group('course')
			video = mobj.group('video')
			info = {
				'id': course + '_' + video,
				'uploader': None,
				'upload_date': None,
			}

			self.report_extraction(info['id'])
			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
			xmlUrl = baseUrl + video + '.xml'
			try:
				metaXml = compat_urllib_request.urlopen(xmlUrl).read()
			except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
				return
			mdoc = xml.etree.ElementTree.fromstring(metaXml)
			try:
				# title and videoFile are mandatory elements of the
				# per-video metadata XML
				info['title'] = mdoc.findall('./title')[0].text
				info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
			except IndexError:
				self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
				return
			info['ext'] = info['url'].rpartition('.')[2]
			return [info]
		elif mobj.group('course'): # A course page
			course = mobj.group('course')
			info = {
				'id': course,
				'type': 'playlist',
				'uploader': None,
				'upload_date': None,
			}

			self.report_download_webpage(info['id'])
			try:
				coursepage = compat_urllib_request.urlopen(url).read()
			except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
				return

			# course title falls back to the course id when no <h1> exists
			m = re.search('<h1>([^<]+)</h1>', coursepage)
			if m:
				info['title'] = unescapeHTML(m.group(1))
			else:
				info['title'] = info['id']

			m = re.search('<description>([^<]+)</description>', coursepage)
			if m:
				info['description'] = unescapeHTML(m.group(1))

			# every linked VideoPage is extracted in turn
			links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
				}
					for vpage in links]
			results = []
			for entry in info['list']:
				assert entry['type'] == 'reference'
				results += self.extract(entry['url'])
			return results

		else: # Root page
			info = {
				'id': 'Stanford OpenClassroom',
				'type': 'playlist',
				'uploader': None,
				'upload_date': None,
			}

			self.report_download_webpage(info['id'])
			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
			try:
				rootpage = compat_urllib_request.urlopen(rootURL).read()
			except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
				return

			info['title'] = info['id']

			# every linked CoursePage is extracted recursively
			links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
				}
					for cpage in links]

			results = []
			for entry in info['list']:
				assert entry['type'] == 'reference'
				results += self.extract(entry['url'])
			return results
3172
class MTVIE(InfoExtractor):
	"""Information extractor for MTV.com"""

	_VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
	IE_NAME = u'mtv'

	def report_webpage(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Resolve an mtv.com video page into a downloadable stream."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		if not mobj.group('proto'):
			url = 'http://' + url
		video_id = mobj.group('videoid')
		self.report_webpage(video_id)

		request = compat_urllib_request.Request(url)
		try:
			webpage = compat_urllib_request.urlopen(request).read()
		except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
			return

		# Song name, performer and playlist info come from <meta> tags
		mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract song name')
			return
		song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
		mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract performer')
			return
		performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
		video_title = performer + ' - ' + song_name

		mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
		if mobj is None:
			# BUGFIX: message previously read 'unable to mtvn_uri'
			self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
			return
		mtvn_uri = mobj.group(1)

		mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract content id')
			return
		content_id = mobj.group(1)

		# mediaGen returns an XML document listing available renditions
		videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
		self.report_extraction(video_id)
		request = compat_urllib_request.Request(videogen_url)
		try:
			metadataXml = compat_urllib_request.urlopen(request).read()
		except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
			return

		mdoc = xml.etree.ElementTree.fromstring(metadataXml)
		renditions = mdoc.findall('.//rendition')
		if not renditions:
			# guard against IndexError on renditions[-1] below
			self._downloader.trouble(u'ERROR: unable to extract video url')
			return

		# For now, always pick the highest quality (the last rendition
		# in the list -- presumably ordered lowest to highest).
		rendition = renditions[-1]

		try:
			_,_,ext = rendition.attrib['type'].partition('/')
			format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
			video_url = rendition.find('./src').text
		except KeyError:
			self._downloader.trouble('Invalid rendition field.')
			return

		info = {
			'id': video_id,
			'url': video_url,
			'uploader': performer,
			'upload_date': None,
			'title': video_title,
			'ext': ext,
			'format': format,
		}

		return [info]
3262
3263
class YoukuIE(InfoExtractor):
	"""Information extractor for v.youku.com"""

	_VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
	IE_NAME = u'Youku'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, file_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

	def _gen_sid(self):
		"""Generate the pseudo-random session id used in flv path requests."""
		nowTime = int(time.time() * 1000)
		random1 = random.randint(1000,1998)
		random2 = random.randint(1000,9999)

		return "%d%d%d" %(nowTime,random1,random2)

	def _get_file_ID_mix_string(self, seed):
		"""Build the seeded permutation of the file-id alphabet.

		A simple linear-congruential shuffle driven by `seed` produces
		the per-video character table used by _get_file_id.
		"""
		mixed = []
		source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
		seed = float(seed)
		for i in range(len(source)):
			seed = (seed * 211 + 30031) % 65536
			index = math.floor(seed / 65536 * len(source))
			mixed.append(source[int(index)])
			source.remove(source[int(index)])
		return mixed

	def _get_file_id(self, fileId, seed):
		"""Decode the obfuscated file id using the seeded mix table."""
		mixed = self._get_file_ID_mix_string(seed)
		# each '*'-separated token is an index into the mixed alphabet
		realId = [mixed[int(ch)] for ch in fileId.split('*') if ch]
		return ''.join(realId)

	def _real_extract(self, url):
		"""Return one info dict per flv segment of the Youku video."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group('ID')

		info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

		request = compat_urllib_request.Request(info_url, None, std_headers)
		try:
			self.report_download_webpage(video_id)
			jsondata = compat_urllib_request.urlopen(request).read()
		except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
			return

		self.report_extraction(video_id)
		try:
			config = json.loads(jsondata)

			video_title =  config['data'][0]['title']
			seed = config['data'][0]['seed']

			format = self._downloader.params.get('format', None)
			supported_format = config['data'][0]['streamfileids'].keys()

			# map the requested format onto Youku's format names
			if format is None or format == 'best':
				if 'hd2' in supported_format:
					format = 'hd2'
				else:
					format = 'flv'
				ext = u'flv'
			elif format == 'worst':
				format = 'mp4'
				ext = u'mp4'
			else:
				format = 'flv'
				ext = u'flv'

			fileid = config['data'][0]['streamfileids'][format]
			# one key per flv segment of the video
			keys = [seg['k'] for seg in config['data'][0]['segs'][format]]

			#TODO check error
			#youku only could be viewed from mainland china
		except Exception:
			# BUGFIX: was a bare `except:` (also caught KeyboardInterrupt
			# and SystemExit).  Any missing/unexpected JSON field -- e.g.
			# a geo-blocked video -- lands here.
			self._downloader.trouble(u'ERROR: unable to extract info section')
			return

		files_info=[]
		sid = self._gen_sid()
		fileid = self._get_file_id(fileid, seed)

		#column 8,9 of fileid represent the segment number
		#fileid[7:9] should be changed
		for index, key in enumerate(keys):

			temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
			download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

			info = {
				'id': '%s_part%02d' % (video_id, index),
				'url': download_url,
				'uploader': None,
				'upload_date': None,
				'title': video_title,
				'ext': ext,
			}
			files_info.append(info)

		return files_info
3384
3385
class XNXXIE(InfoExtractor):
	"""Information extractor for xnxx.com"""

	_VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
	IE_NAME = u'xnxx'
	VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
	VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
	VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

	def report_webpage(self, video_id):
		"""Report information extraction"""
		message = u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)
		self._downloader.to_screen(message)

	def report_extraction(self, video_id):
		"""Report information extraction"""
		message = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
		self._downloader.to_screen(message)

	def _real_extract(self, url):
		"""Resolve an xnxx.com video page into its flv stream."""
		match = re.match(self._VALID_URL, url)
		if match is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = match.group(1).decode('utf-8')

		self.report_webpage(video_id)

		# Fetch the raw page HTML
		try:
			webpage = compat_urllib_request.urlopen(url).read()
		except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
			return

		# The flv URL is embedded (quoted) in the player parameters
		url_match = re.search(self.VIDEO_URL_RE, webpage)
		if url_match is None:
			self._downloader.trouble(u'ERROR: unable to extract video url')
			return
		video_url = compat_urllib_parse.unquote(url_match.group(1).decode('utf-8'))

		title_match = re.search(self.VIDEO_TITLE_RE, webpage)
		if title_match is None:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = title_match.group(1).decode('utf-8')

		thumb_match = re.search(self.VIDEO_THUMB_RE, webpage)
		if thumb_match is None:
			self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
			return
		video_thumbnail = thumb_match.group(1).decode('utf-8')

		return [{
			'id': video_id,
			'url': video_url,
			'uploader': None,
			'upload_date': None,
			'title': video_title,
			'ext': 'flv',
			'thumbnail': video_thumbnail,
			'description': None,
		}]
3447
3448
class GooglePlusIE(InfoExtractor):
	"""Information extractor for plus.google.com."""

	_VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
	IE_NAME = u'plus.google'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_extract_entry(self, url):
		"""Report downloading extry"""
		self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))

	def report_date(self, upload_date):
		"""Report downloading extry"""
		self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

	def report_uploader(self, uploader):
		"""Report downloading extry"""
		self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))

	def report_title(self, video_title):
		"""Report downloading extry"""
		self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))

	def report_extract_vid_page(self, video_page):
		"""Report information extraction."""
		self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))

	def _real_extract(self, url):
		"""Extract the video embedded in a Google+ post."""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		post_url = mobj.group(0)
		video_id = mobj.group(2)

		video_extension = 'flv'

		# Step 1, Retrieve post webpage to extract further information
		self.report_extract_entry(post_url)
		request = compat_urllib_request.Request(post_url)
		try:
			webpage = compat_urllib_request.urlopen(request).read()
		except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
			return

		# Extract update date
		upload_date = None
		pattern = 'title="Timestamp">(.*?)</a>'
		mobj = re.search(pattern, webpage)
		if mobj:
			upload_date = mobj.group(1)
			# Convert timestring to a format suitable for filename
			upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
			upload_date = upload_date.strftime('%Y%m%d')
		self.report_date(upload_date)

		# Extract uploader
		uploader = None
		pattern = r'rel\="author".*?>(.*?)</a>'
		mobj = re.search(pattern, webpage)
		if mobj:
			uploader = mobj.group(1)
		self.report_uploader(uploader)

		# Extract title
		# Get the first line for title
		video_title = u'NA'
		pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
		mobj = re.search(pattern, webpage)
		if mobj:
			video_title = mobj.group(1)
		self.report_title(video_title)

		# Step 2, Stimulate clicking the image box to launch video
		pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
		mobj = re.search(pattern, webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video page URL')
			# BUGFIX: missing return; used to fall through and crash
			# with AttributeError on mobj.group(1)
			return

		video_page = mobj.group(1)
		request = compat_urllib_request.Request(video_page)
		try:
			webpage = compat_urllib_request.urlopen(request).read()
		except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
			return
		self.report_extract_vid_page(video_page)

		# Extract video links of all sizes from the video page
		pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
		mobj = re.findall(pattern, webpage)
		if len(mobj) == 0:
			self._downloader.trouble(u'ERROR: unable to extract video links')
			# BUGFIX: missing return; used to fall through and crash
			# with IndexError on the empty list
			return

		# Sort in resolution
		links = sorted(mobj)

		# Choose the lowest of the sort, i.e. highest resolution
		video_url = links[-1]
		# Only get the url. The resolution part in the tuple has no use anymore
		video_url = video_url[-1]
		# Treat escaped \u0026 style hex
		video_url = unicode(video_url, "unicode_escape")

		# NOTE(review): uploader/upload_date may still be None here, in
		# which case .decode() below raises -- confirm these fields are
		# always present before tightening.
		return [{
			'id':		video_id.decode('utf-8'),
			'url':		video_url,
			'uploader':	uploader.decode('utf-8'),
			'upload_date':	upload_date.decode('utf-8'),
			'title':	video_title.decode('utf-8'),
			'ext':		video_extension.decode('utf-8'),
		}]