_ Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import
   5
   6 import datetime
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import time
  12 import email.utils
  13 import xml.etree.ElementTree
  14 import random
  15 import math
  16
  17 from .utils import *
  18
  19
  20 class InfoExtractor(object):
  21     """Information Extractor class.
  22
  23     Information extractors are the classes that, given a URL, extract
  24     information about the video (or videos) the URL refers to. This
  25     information includes the real video URL, the video title, author and
  26     others. The information is stored in a dictionary which is then
  27     passed to the FileDownloader. The FileDownloader processes this
  28     information possibly downloading the video to the file system, among
  29     other possible outcomes.
  30
  31     The dictionaries must include the following fields:
  32
  33     id:             Video identifier.
  34     url:            Final video URL.
  35     title:          Video title, unescaped.
  36     ext:            Video filename extension.
  37     uploader:       Full name of the video uploader.
  38     upload_date:    Video upload date (YYYYMMDD).
  39
  40     The following fields are optional:
  41
  42     format:         The video format, defaults to ext (used for --get-format)
  43     thumbnail:      Full URL to a video thumbnail image.
  44     description:    One-line video description.
  45     uploader_id:    Nickname or id of the video uploader.
  46     player_url:     SWF Player URL (used for rtmpdump).
  47     subtitles:      The .srt file contents.
  48     urlhandle:      [internal] The urlHandle to be used to download the file,
  49                     like returned by urllib.request.urlopen
  50
  51     The fields should all be Unicode strings.
  52
  53     Subclasses of this one should re-define the _real_initialize() and
  54     _real_extract() methods and define a _VALID_URL regexp.
  55     Probably, they should also be added to the list of extractors.
  56
  57     _real_extract() must return a *list* of information dictionaries as
  58     described above.
  59
  60     Finally, the _WORKING attribute should be set to False for broken IEs
  61     in order to warn the users and skip the tests.
  62     """
  63
  64     _ready = False
  65     _downloader = None
  66     _WORKING = True
  67
  68     def __init__(self, downloader=None):
  69         """Constructor. Receives an optional downloader."""
  70         self._ready = False
  71         self.set_downloader(downloader)
  72
  73     def suitable(self, url):
  74         """Receives a URL and returns True if suitable for this IE."""
  75         return re.match(self._VALID_URL, url) is not None
  76
  77     def working(self):
  78         """Getter method for _WORKING."""
  79         return self._WORKING
  80
  81     def initialize(self):
  82         """Initializes an instance (authentication, etc)."""
  83         if not self._ready:
  84             self._real_initialize()
  85             self._ready = True
  86
  87     def extract(self, url):
  88         """Extracts URL information and returns it in list of dicts."""
  89         self.initialize()
  90         return self._real_extract(url)
  91
  92     def set_downloader(self, downloader):
  93         """Sets the downloader for this IE."""
  94         self._downloader = downloader
  95
  96     def _real_initialize(self):
  97         """Real initialization process. Redefine in subclasses."""
  98         pass
  99
 100     def _real_extract(self, url):
 101         """Real extraction process. Redefine in subclasses."""
 102         pass
 103
 104     @property
 105     def IE_NAME(self):
 106         return type(self).__name__[:-2]
 107
 108     def _download_webpage(self, url, video_id, note=None, errnote=None):
 109         if note is None:
 110             note = u'Downloading video webpage'
 111         self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
 112         try:
 113             urlh = compat_urllib_request.urlopen(url)
 114             webpage_bytes = urlh.read()
 115             return webpage_bytes.decode('utf-8', 'replace')
 116         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 117             if errnote is None:
 118                 errnote = u'Unable to download webpage'
 119             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)))
 120
 121
 122 class YoutubeIE(InfoExtractor):
 123     """Information extractor for youtube.com."""
 124
 125     _VALID_URL = r"""^
 126                      (
 127                          (?:https?://)?                                       # http(s):// (optional)
 128                          (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 129                             tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
 130                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 131                          (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
 132                          (?:                                                  # the various things that can precede the ID:
 133                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 134                              |(?:                                             # or the v= param in all its forms
 135                                  (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 136                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 137                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
 138                                  v=
 139                              )
 140                          )?                                                   # optional -> youtube.com/xxxx is OK
 141                      )?                                                       # all until now is optional -> you can pass the naked ID
 142                      ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
 143                      (?(1).+)?                                                # if we found the ID, everything can follow
 144                      $"""
 145     _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 146     _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
 147     _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 148     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 149     _NETRC_MACHINE = 'youtube'
 150     # Listed in order of quality
 151     _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 152     _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 153     _video_extensions = {
 154         '13': '3gp',
 155         '17': 'mp4',
 156         '18': 'mp4',
 157         '22': 'mp4',
 158         '37': 'mp4',
 159         '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 160         '43': 'webm',
 161         '44': 'webm',
 162         '45': 'webm',
 163         '46': 'webm',
 164     }
 165     _video_dimensions = {
 166         '5': '240x400',
 167         '6': '???',
 168         '13': '???',
 169         '17': '144x176',
 170         '18': '360x640',
 171         '22': '720x1280',
 172         '34': '360x640',
 173         '35': '480x854',
 174         '37': '1080x1920',
 175         '38': '3072x4096',
 176         '43': '360x640',
 177         '44': '480x854',
 178         '45': '720x1280',
 179         '46': '1080x1920',
 180     }
 181     IE_NAME = u'youtube'
 182
 183     def suitable(self, url):
 184         """Receives a URL and returns True if suitable for this IE."""
 185         return re.match(self._VALID_URL, url, re.VERBOSE) is not None
 186
 187     def report_lang(self):
 188         """Report attempt to set language."""
 189         self._downloader.to_screen(u'[youtube] Setting language')
 190
 191     def report_login(self):
 192         """Report attempt to log in."""
 193         self._downloader.to_screen(u'[youtube] Logging in')
 194
 195     def report_age_confirmation(self):
 196         """Report attempt to confirm age."""
 197         self._downloader.to_screen(u'[youtube] Confirming age')
 198
 199     def report_video_webpage_download(self, video_id):
 200         """Report attempt to download video webpage."""
 201         self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
 202
 203     def report_video_info_webpage_download(self, video_id):
 204         """Report attempt to download video info webpage."""
 205         self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
 206
 207     def report_video_subtitles_download(self, video_id):
 208         """Report attempt to download video info webpage."""
 209         self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
 210
 211     def report_information_extraction(self, video_id):
 212         """Report attempt to extract video information."""
 213         self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
 214
 215     def report_unavailable_format(self, video_id, format):
 216         """Report extracted video URL."""
 217         self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
 218
 219     def report_rtmp_download(self):
 220         """Indicate the download will use the RTMP protocol."""
 221         self._downloader.to_screen(u'[youtube] RTMP download detected')
 222
 223     def _closed_captions_xml_to_srt(self, xml_string):
 224         srt = ''
 225         texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
 226         # TODO parse xml instead of regex
 227         for n, (start, dur_tag, dur, caption) in enumerate(texts):
 228             if not dur: dur = '4'
 229             start = float(start)
 230             end = start + float(dur)
 231             start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
 232             end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
 233             caption = unescapeHTML(caption)
 234             caption = unescapeHTML(caption) # double cycle, intentional
 235             srt += str(n+1) + '\n'
 236             srt += start + ' --> ' + end + '\n'
 237             srt += caption + '\n\n'
 238         return srt
 239
 240     def _extract_subtitles(self, video_id):
 241         self.report_video_subtitles_download(video_id)
 242         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 243         try:
 244             srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
 245         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 246             return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
 247         srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
 248         srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
 249         if not srt_lang_list:
 250             return (u'WARNING: video has no closed captions', None)
 251         if self._downloader.params.get('subtitleslang', False):
 252             srt_lang = self._downloader.params.get('subtitleslang')
 253         elif 'en' in srt_lang_list:
 254             srt_lang = 'en'
 255         else:
 256             srt_lang = list(srt_lang_list.keys())[0]
 257         if not srt_lang in srt_lang_list:
 258             return (u'WARNING: no closed captions found in the specified language', None)
 259         request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
 260         try:
 261             srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
 262         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 263             return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
 264         if not srt_xml:
 265             return (u'WARNING: unable to download video subtitles', None)
 266         return (None, self._closed_captions_xml_to_srt(srt_xml))
 267
 268     def _print_formats(self, formats):
 269         print('Available formats:')
 270         for x in formats:
 271             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
 272
 273     def _real_initialize(self):
 274         if self._downloader is None:
 275             return
 276
 277         username = None
 278         password = None
 279         downloader_params = self._downloader.params
 280
 281         # Attempt to use provided username and password or .netrc data
 282         if downloader_params.get('username', None) is not None:
 283             username = downloader_params['username']
 284             password = downloader_params['password']
 285         elif downloader_params.get('usenetrc', False):
 286             try:
 287                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 288                 if info is not None:
 289                     username = info[0]
 290                     password = info[2]
 291                 else:
 292                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 293             except (IOError, netrc.NetrcParseError) as err:
 294                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
 295                 return
 296
 297         # Set language
 298         request = compat_urllib_request.Request(self._LANG_URL)
 299         try:
 300             self.report_lang()
 301             compat_urllib_request.urlopen(request).read()
 302         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 303             self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
 304             return
 305
 306         # No authentication to be performed
 307         if username is None:
 308             return
 309
 310         # Log in
 311         login_form = {
 312                 'current_form': 'loginForm',
 313                 'next':     '/',
 314                 'action_login': 'Log In',
 315                 'username': username,
 316                 'password': password,
 317                 }
 318         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
 319         try:
 320             self.report_login()
 321             login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 322             if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 323                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 324                 return
 325         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 326             self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
 327             return
 328
 329         # Confirm age
 330         age_form = {
 331                 'next_url':     '/',
 332                 'action_confirm':   'Confirm',
 333                 }
 334         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
 335         try:
 336             self.report_age_confirmation()
 337             age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 338         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 339             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
 340             return
 341
 342     def _extract_id(self, url):
 343         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
 344         if mobj is None:
 345             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 346             return
 347         video_id = mobj.group(2)
 348         return video_id
 349
 350     def _real_extract(self, url):
 351         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 352         mobj = re.search(self._NEXT_URL_RE, url)
 353         if mobj:
 354             url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 355         video_id = self._extract_id(url)
 356
 357         # Get video webpage
 358         self.report_video_webpage_download(video_id)
 359         url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
 360         request = compat_urllib_request.Request(url)
 361         try:
 362             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
 363         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 364             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
 365             return
 366
 367         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
 368
 369         # Attempt to extract SWF player URL
 370         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 371         if mobj is not None:
 372             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 373         else:
 374             player_url = None
 375
 376         # Get video info
 377         self.report_video_info_webpage_download(video_id)
 378         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 379             video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 380                     % (video_id, el_type))
 381             request = compat_urllib_request.Request(video_info_url)
 382             try:
 383                 video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
 384                 video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
 385                 video_info = compat_parse_qs(video_info_webpage)
 386                 if 'token' in video_info:
 387                     break
 388             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 389                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
 390                 return
 391         if 'token' not in video_info:
 392             if 'reason' in video_info:
 393                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
 394             else:
 395                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
 396             return
 397
 398         # Check for "rental" videos
 399         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 400             self._downloader.trouble(u'ERROR: "rental" videos not supported')
 401             return
 402
 403         # Start extracting information
 404         self.report_information_extraction(video_id)
 405
 406         # uploader
 407         if 'author' not in video_info:
 408             self._downloader.trouble(u'ERROR: unable to extract uploader name')
 409             return
 410         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 411
 412         # uploader_id
 413         video_uploader_id = None
 414         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/user/([^"]+)">', video_webpage)
 415         if mobj is not None:
 416             video_uploader_id = mobj.group(1)
 417         else:
 418             self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
 419
 420         # title
 421         if 'title' not in video_info:
 422             self._downloader.trouble(u'ERROR: unable to extract video title')
 423             return
 424         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
 425
 426         # thumbnail image
 427         if 'thumbnail_url' not in video_info:
 428             self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
 429             video_thumbnail = ''
 430         else:   # don't panic if we can't find it
 431             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 432
 433         # upload date
 434         upload_date = None
 435         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 436         if mobj is not None:
 437             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 438             format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
 439             for expression in format_expressions:
 440                 try:
 441                     upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
 442                 except:
 443                     pass
 444
 445         # description
 446         video_description = get_element_by_id("eow-description", video_webpage)
 447         if video_description:
 448             video_description = clean_html(video_description)
 449         else:
 450             video_description = ''
 451
 452         # closed captions
 453         video_subtitles = None
 454         if self._downloader.params.get('writesubtitles', False):
 455             (srt_error, video_subtitles) = self._extract_subtitles(video_id)
 456             if srt_error:
 457                 self._downloader.trouble(srt_error)
 458
 459         if 'length_seconds' not in video_info:
 460             self._downloader.trouble(u'WARNING: unable to extract video duration')
 461             video_duration = ''
 462         else:
 463             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
 464
 465         # token
 466         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
 467
 468         # Decide which formats to download
 469         req_format = self._downloader.params.get('format', None)
 470
 471         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 472             self.report_rtmp_download()
 473             video_url_list = [(None, video_info['conn'][0])]
 474         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 475             url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
 476             url_data = [compat_parse_qs(uds) for uds in url_data_strs]
 477             url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
 478             url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
 479
 480             format_limit = self._downloader.params.get('format_limit', None)
 481             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 482             if format_limit is not None and format_limit in available_formats:
 483                 format_list = available_formats[available_formats.index(format_limit):]
 484             else:
 485                 format_list = available_formats
 486             existing_formats = [x for x in format_list if x in url_map]
 487             if len(existing_formats) == 0:
 488                 self._downloader.trouble(u'ERROR: no known formats available for video')
 489                 return
 490             if self._downloader.params.get('listformats', None):
 491                 self._print_formats(existing_formats)
 492                 return
 493             if req_format is None or req_format == 'best':
 494                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 495             elif req_format == 'worst':
 496                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 497             elif req_format in ('-1', 'all'):
 498                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 499             else:
 500                 # Specific formats. We pick the first in a slash-delimeted sequence.
 501                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 502                 req_formats = req_format.split('/')
 503                 video_url_list = None
 504                 for rf in req_formats:
 505                     if rf in url_map:
 506                         video_url_list = [(rf, url_map[rf])]
 507                         break
 508                 if video_url_list is None:
 509                     self._downloader.trouble(u'ERROR: requested format not available')
 510                     return
 511         else:
 512             self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
 513             return
 514
 515         results = []
 516         for format_param, video_real_url in video_url_list:
 517             # Extension
 518             video_extension = self._video_extensions.get(format_param, 'flv')
 519
 520             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
 521                                               self._video_dimensions.get(format_param, '???'))
 522
 523             results.append({
 524                 'id':       video_id,
 525                 'url':      video_real_url,
 526                 'uploader': video_uploader,
 527                 'uploader_id': video_uploader_id,
 528                 'upload_date':  upload_date,
 529                 'title':    video_title,
 530                 'ext':      video_extension,
 531                 'format':   video_format,
 532                 'thumbnail':    video_thumbnail,
 533                 'description':  video_description,
 534                 'player_url':   player_url,
 535                 'subtitles':    video_subtitles,
 536                 'duration':     video_duration
 537             })
 538         return results
 539
 540
 541 class MetacafeIE(InfoExtractor):
 542     """Information Extractor for metacafe.com."""
 543
 544     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 545     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 546     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 547     IE_NAME = u'metacafe'
 548
 549     def __init__(self, downloader=None):
 550         InfoExtractor.__init__(self, downloader)
 551
 552     def report_disclaimer(self):
 553         """Report disclaimer retrieval."""
 554         self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
 555
 556     def report_age_confirmation(self):
 557         """Report attempt to confirm age."""
 558         self._downloader.to_screen(u'[metacafe] Confirming age')
 559
 560     def report_download_webpage(self, video_id):
 561         """Report webpage download."""
 562         self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
 563
 564     def report_extraction(self, video_id):
 565         """Report information extraction."""
 566         self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
 567
 568     def _real_initialize(self):
 569         # Retrieve disclaimer
 570         request = compat_urllib_request.Request(self._DISCLAIMER)
 571         try:
 572             self.report_disclaimer()
 573             disclaimer = compat_urllib_request.urlopen(request).read()
 574         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 575             self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
 576             return
 577
 578         # Confirm age
 579         disclaimer_form = {
 580             'filters': '0',
 581             'submit': "Continue - I'm over 18",
 582             }
 583         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
 584         try:
 585             self.report_age_confirmation()
 586             disclaimer = compat_urllib_request.urlopen(request).read()
 587         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 588             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
 589             return
 590
 591     def _real_extract(self, url):
 592         # Extract id and simplified title from URL
 593         mobj = re.match(self._VALID_URL, url)
 594         if mobj is None:
 595             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 596             return
 597
 598         video_id = mobj.group(1)
 599
 600         # Check if video comes from YouTube
 601         mobj2 = re.match(r'^yt-(.*)$', video_id)
 602         if mobj2 is not None:
 603             self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
 604             return
 605
 606         # Retrieve video webpage to extract further information
 607         request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
 608         try:
 609             self.report_download_webpage(video_id)
 610             webpage = compat_urllib_request.urlopen(request).read()
 611         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 612             self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
 613             return
 614
 615         # Extract URL, uploader and title from webpage
 616         self.report_extraction(video_id)
 617         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 618         if mobj is not None:
 619             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 620             video_extension = mediaURL[-3:]
 621
 622             # Extract gdaKey if available
 623             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 624             if mobj is None:
 625                 video_url = mediaURL
 626             else:
 627                 gdaKey = mobj.group(1)
 628                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 629         else:
 630             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 631             if mobj is None:
 632                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 633                 return
 634             vardict = compat_parse_qs(mobj.group(1))
 635             if 'mediaData' not in vardict:
 636                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 637                 return
 638             mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
 639             if mobj is None:
 640                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 641                 return
 642             mediaURL = mobj.group(1).replace('\\/', '/')
 643             video_extension = mediaURL[-3:]
 644             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
 645
 646         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 647         if mobj is None:
 648             self._downloader.trouble(u'ERROR: unable to extract title')
 649             return
 650         video_title = mobj.group(1).decode('utf-8')
 651
 652         mobj = re.search(r'submitter=(.*?);', webpage)
 653         if mobj is None:
 654             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 655             return
 656         video_uploader = mobj.group(1)
 657
 658         return [{
 659             'id':       video_id.decode('utf-8'),
 660             'url':      video_url.decode('utf-8'),
 661             'uploader': video_uploader.decode('utf-8'),
 662             'upload_date':  None,
 663             'title':    video_title,
 664             'ext':      video_extension.decode('utf-8'),
 665         }]
 666
 667
 668 class DailymotionIE(InfoExtractor):
 669     """Information Extractor for Dailymotion"""
 670
 671     _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 672     IE_NAME = u'dailymotion'
 673
 674     def __init__(self, downloader=None):
 675         InfoExtractor.__init__(self, downloader)
 676
 677     def report_download_webpage(self, video_id):
 678         """Report webpage download."""
 679         self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
 680
 681     def report_extraction(self, video_id):
 682         """Report information extraction."""
 683         self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
 684
 685     def _real_extract(self, url):
 686         # Extract id and simplified title from URL
 687         mobj = re.match(self._VALID_URL, url)
 688         if mobj is None:
 689             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 690             return
 691
 692         video_id = mobj.group(1).split('_')[0].split('?')[0]
 693
 694         video_extension = 'mp4'
 695
 696         # Retrieve video webpage to extract further information
 697         request = compat_urllib_request.Request(url)
 698         request.add_header('Cookie', 'family_filter=off')
 699         try:
 700             self.report_download_webpage(video_id)
 701             webpage_bytes = compat_urllib_request.urlopen(request).read()
 702             webpage = webpage_bytes.decode('utf-8')
 703         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 704             self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
 705             return
 706
 707         # Extract URL, uploader and title from webpage
 708         self.report_extraction(video_id)
 709         mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 710         if mobj is None:
 711             self._downloader.trouble(u'ERROR: unable to extract media URL')
 712             return
 713         flashvars = compat_urllib_parse.unquote(mobj.group(1))
 714
 715         for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
 716             if key in flashvars:
 717                 max_quality = key
 718                 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
 719                 break
 720         else:
 721             self._downloader.trouble(u'ERROR: unable to extract video URL')
 722             return
 723
 724         mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 725         if mobj is None:
 726             self._downloader.trouble(u'ERROR: unable to extract video URL')
 727             return
 728
 729         video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
 730
 731         # TODO: support choosing qualities
 732
 733         mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 734         if mobj is None:
 735             self._downloader.trouble(u'ERROR: unable to extract title')
 736             return
 737         video_title = unescapeHTML(mobj.group('title'))
 738
 739         video_uploader = None
 740         mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
 741         if mobj is None:
 742             # lookin for official user
 743             mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
 744             if mobj_official is None:
 745                 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
 746             else:
 747                 video_uploader = mobj_official.group(1)
 748         else:
 749             video_uploader = mobj.group(1)
 750
 751         video_upload_date = None
 752         mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 753         if mobj is not None:
 754             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 755
 756         return [{
 757             'id':       video_id,
 758             'url':      video_url,
 759             'uploader': video_uploader,
 760             'upload_date':  video_upload_date,
 761             'title':    video_title,
 762             'ext':      video_extension,
 763         }]
 764
 765
 766 class PhotobucketIE(InfoExtractor):
 767     """Information extractor for photobucket.com."""
 768
 769     _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 770     IE_NAME = u'photobucket'
 771
 772     def __init__(self, downloader=None):
 773         InfoExtractor.__init__(self, downloader)
 774
 775     def report_download_webpage(self, video_id):
 776         """Report webpage download."""
 777         self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
 778
 779     def report_extraction(self, video_id):
 780         """Report information extraction."""
 781         self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
 782
 783     def _real_extract(self, url):
 784         # Extract id from URL
 785         mobj = re.match(self._VALID_URL, url)
 786         if mobj is None:
 787             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 788             return
 789
 790         video_id = mobj.group(1)
 791
 792         video_extension = 'flv'
 793
 794         # Retrieve video webpage to extract further information
 795         request = compat_urllib_request.Request(url)
 796         try:
 797             self.report_download_webpage(video_id)
 798             webpage = compat_urllib_request.urlopen(request).read()
 799         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 800             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 801             return
 802
 803         # Extract URL, uploader, and title from webpage
 804         self.report_extraction(video_id)
 805         mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
 806         if mobj is None:
 807             self._downloader.trouble(u'ERROR: unable to extract media URL')
 808             return
 809         mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 810
 811         video_url = mediaURL
 812
 813         mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
 814         if mobj is None:
 815             self._downloader.trouble(u'ERROR: unable to extract title')
 816             return
 817         video_title = mobj.group(1).decode('utf-8')
 818
 819         video_uploader = mobj.group(2).decode('utf-8')
 820
 821         return [{
 822             'id':       video_id.decode('utf-8'),
 823             'url':      video_url.decode('utf-8'),
 824             'uploader': video_uploader,
 825             'upload_date':  None,
 826             'title':    video_title,
 827             'ext':      video_extension.decode('utf-8'),
 828         }]
 829
 830
 831 class YahooIE(InfoExtractor):
 832     """Information extractor for video.yahoo.com."""
 833
 834     _WORKING = False
 835     # _VALID_URL matches all Yahoo! Video URLs
 836     # _VPAGE_URL matches only the extractable '/watch/' URLs
 837     _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
 838     _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
 839     IE_NAME = u'video.yahoo'
 840
 841     def __init__(self, downloader=None):
 842         InfoExtractor.__init__(self, downloader)
 843
 844     def report_download_webpage(self, video_id):
 845         """Report webpage download."""
 846         self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
 847
 848     def report_extraction(self, video_id):
 849         """Report information extraction."""
 850         self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
 851
 852     def _real_extract(self, url, new_video=True):
 853         # Extract ID from URL
 854         mobj = re.match(self._VALID_URL, url)
 855         if mobj is None:
 856             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 857             return
 858
 859         video_id = mobj.group(2)
 860         video_extension = 'flv'
 861
 862         # Rewrite valid but non-extractable URLs as
 863         # extractable English language /watch/ URLs
 864         if re.match(self._VPAGE_URL, url) is None:
 865             request = compat_urllib_request.Request(url)
 866             try:
 867                 webpage = compat_urllib_request.urlopen(request).read()
 868             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 869                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 870                 return
 871
 872             mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
 873             if mobj is None:
 874                 self._downloader.trouble(u'ERROR: Unable to extract id field')
 875                 return
 876             yahoo_id = mobj.group(1)
 877
 878             mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
 879             if mobj is None:
 880                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
 881                 return
 882             yahoo_vid = mobj.group(1)
 883
 884             url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
 885             return self._real_extract(url, new_video=False)
 886
 887         # Retrieve video webpage to extract further information
 888         request = compat_urllib_request.Request(url)
 889         try:
 890             self.report_download_webpage(video_id)
 891             webpage = compat_urllib_request.urlopen(request).read()
 892         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 893             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 894             return
 895
 896         # Extract uploader and title from webpage
 897         self.report_extraction(video_id)
 898         mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
 899         if mobj is None:
 900             self._downloader.trouble(u'ERROR: unable to extract video title')
 901             return
 902         video_title = mobj.group(1).decode('utf-8')
 903
 904         mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
 905         if mobj is None:
 906             self._downloader.trouble(u'ERROR: unable to extract video uploader')
 907             return
 908         video_uploader = mobj.group(1).decode('utf-8')
 909
 910         # Extract video thumbnail
 911         mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
 912         if mobj is None:
 913             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 914             return
 915         video_thumbnail = mobj.group(1).decode('utf-8')
 916
 917         # Extract video description
 918         mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
 919         if mobj is None:
 920             self._downloader.trouble(u'ERROR: unable to extract video description')
 921             return
 922         video_description = mobj.group(1).decode('utf-8')
 923         if not video_description:
 924             video_description = 'No description available.'
 925
 926         # Extract video height and width
 927         mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
 928         if mobj is None:
 929             self._downloader.trouble(u'ERROR: unable to extract video height')
 930             return
 931         yv_video_height = mobj.group(1)
 932
 933         mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
 934         if mobj is None:
 935             self._downloader.trouble(u'ERROR: unable to extract video width')
 936             return
 937         yv_video_width = mobj.group(1)
 938
 939         # Retrieve video playlist to extract media URL
 940         # I'm not completely sure what all these options are, but we
 941         # seem to need most of them, otherwise the server sends a 401.
 942         yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
 943         yv_bitrate = '700'  # according to Wikipedia this is hard-coded
 944         request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
 945                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
 946                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
 947         try:
 948             self.report_download_webpage(video_id)
 949             webpage = compat_urllib_request.urlopen(request).read()
 950         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 951             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 952             return
 953
 954         # Extract media URL from playlist XML
 955         mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
 956         if mobj is None:
 957             self._downloader.trouble(u'ERROR: Unable to extract media URL')
 958             return
 959         video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
 960         video_url = unescapeHTML(video_url)
 961
 962         return [{
 963             'id':       video_id.decode('utf-8'),
 964             'url':      video_url,
 965             'uploader': video_uploader,
 966             'upload_date':  None,
 967             'title':    video_title,
 968             'ext':      video_extension.decode('utf-8'),
 969             'thumbnail':    video_thumbnail.decode('utf-8'),
 970             'description':  video_description,
 971         }]
 972
 973
 974 class VimeoIE(InfoExtractor):
 975     """Information extractor for vimeo.com."""
 976
 977     # _VALID_URL matches Vimeo URLs
 978     _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
 979     IE_NAME = u'vimeo'
 980
 981     def __init__(self, downloader=None):
 982         InfoExtractor.__init__(self, downloader)
 983
 984     def report_download_webpage(self, video_id):
 985         """Report webpage download."""
 986         self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
 987
 988     def report_extraction(self, video_id):
 989         """Report information extraction."""
 990         self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
 991
 992     def _real_extract(self, url, new_video=True):
 993         # Extract ID from URL
 994         mobj = re.match(self._VALID_URL, url)
 995         if mobj is None:
 996             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 997             return
 998
 999         video_id = mobj.group(1)
1000
1001         # Retrieve video webpage to extract further information
1002         request = compat_urllib_request.Request(url, None, std_headers)
1003         try:
1004             self.report_download_webpage(video_id)
1005             webpage_bytes = compat_urllib_request.urlopen(request).read()
1006             webpage = webpage_bytes.decode('utf-8')
1007         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1008             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1009             return
1010
1011         # Now we begin extracting as much information as we can from what we
1012         # retrieved. First we extract the information common to all extractors,
1013         # and latter we extract those that are Vimeo specific.
1014         self.report_extraction(video_id)
1015
1016         # Extract the config JSON
1017         try:
1018             config = webpage.split(' = {config:')[1].split(',assets:')[0]
1019             config = json.loads(config)
1020         except:
1021             self._downloader.trouble(u'ERROR: unable to extract info section')
1022             return
1023
1024         # Extract title
1025         video_title = config["video"]["title"]
1026
1027         # Extract uploader and uploader_id
1028         video_uploader = config["video"]["owner"]["name"]
1029         video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1030
1031         # Extract video thumbnail
1032         video_thumbnail = config["video"]["thumbnail"]
1033
1034         # Extract video description
1035         video_description = get_element_by_attribute("itemprop", "description", webpage)
1036         if video_description: video_description = clean_html(video_description)
1037         else: video_description = ''
1038
1039         # Extract upload date
1040         video_upload_date = None
1041         mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1042         if mobj is not None:
1043             video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1044
1045         # Vimeo specific: extract request signature and timestamp
1046         sig = config['request']['signature']
1047         timestamp = config['request']['timestamp']
1048
1049         # Vimeo specific: extract video codec and quality information
1050         # First consider quality, then codecs, then take everything
1051         # TODO bind to format param
1052         codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1053         files = { 'hd': [], 'sd': [], 'other': []}
1054         for codec_name, codec_extension in codecs:
1055             if codec_name in config["video"]["files"]:
1056                 if 'hd' in config["video"]["files"][codec_name]:
1057                     files['hd'].append((codec_name, codec_extension, 'hd'))
1058                 elif 'sd' in config["video"]["files"][codec_name]:
1059                     files['sd'].append((codec_name, codec_extension, 'sd'))
1060                 else:
1061                     files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1062
1063         for quality in ('hd', 'sd', 'other'):
1064             if len(files[quality]) > 0:
1065                 video_quality = files[quality][0][2]
1066                 video_codec = files[quality][0][0]
1067                 video_extension = files[quality][0][1]
1068                 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1069                 break
1070         else:
1071             self._downloader.trouble(u'ERROR: no known codec found')
1072             return
1073
1074         video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1075                     %(video_id, sig, timestamp, video_quality, video_codec.upper())
1076
1077         return [{
1078             'id':       video_id,
1079             'url':      video_url,
1080             'uploader': video_uploader,
1081             'uploader_id': video_uploader_id,
1082             'upload_date':  video_upload_date,
1083             'title':    video_title,
1084             'ext':      video_extension,
1085             'thumbnail':    video_thumbnail,
1086             'description':  video_description,
1087         }]
1088
1089
1090 class ArteTvIE(InfoExtractor):
1091     """arte.tv information extractor."""
1092
1093     _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1094     _LIVE_URL = r'index-[0-9]+\.html$'
1095
1096     IE_NAME = u'arte.tv'
1097
1098     def __init__(self, downloader=None):
1099         InfoExtractor.__init__(self, downloader)
1100
1101     def report_download_webpage(self, video_id):
1102         """Report webpage download."""
1103         self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1104
1105     def report_extraction(self, video_id):
1106         """Report information extraction."""
1107         self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
1108
1109     def fetch_webpage(self, url):
1110         request = compat_urllib_request.Request(url)
1111         try:
1112             self.report_download_webpage(url)
1113             webpage = compat_urllib_request.urlopen(request).read()
1114         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1115             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1116             return
1117         except ValueError as err:
1118             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1119             return
1120         return webpage
1121
1122     def grep_webpage(self, url, regex, regexFlags, matchTuples):
1123         page = self.fetch_webpage(url)
1124         mobj = re.search(regex, page, regexFlags)
1125         info = {}
1126
1127         if mobj is None:
1128             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1129             return
1130
1131         for (i, key, err) in matchTuples:
1132             if mobj.group(i) is None:
1133                 self._downloader.trouble(err)
1134                 return
1135             else:
1136                 info[key] = mobj.group(i)
1137
1138         return info
1139
1140     def extractLiveStream(self, url):
1141         video_lang = url.split('/')[-4]
1142         info = self.grep_webpage(
1143             url,
1144             r'src="(.*?/videothek_js.*?\.js)',
1145             0,
1146             [
1147                 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1148             ]
1149         )
1150         http_host = url.split('/')[2]
1151         next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1152         info = self.grep_webpage(
1153             next_url,
1154             r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1155                 '(http://.*?\.swf).*?' +
1156                 '(rtmp://.*?)\'',
1157             re.DOTALL,
1158             [
1159                 (1, 'path',   u'ERROR: could not extract video path: %s' % url),
1160                 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1161                 (3, 'url',    u'ERROR: could not extract video url: %s' % url)
1162             ]
1163         )
1164         video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1165
1166     def extractPlus7Stream(self, url):
1167         video_lang = url.split('/')[-3]
1168         info = self.grep_webpage(
1169             url,
1170             r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1171             0,
1172             [
1173                 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1174             ]
1175         )
1176         next_url = compat_urllib_parse.unquote(info.get('url'))
1177         info = self.grep_webpage(
1178             next_url,
1179             r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1180             0,
1181             [
1182                 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1183             ]
1184         )
1185         next_url = compat_urllib_parse.unquote(info.get('url'))
1186
1187         info = self.grep_webpage(
1188             next_url,
1189             r'<video id="(.*?)".*?>.*?' +
1190                 '<name>(.*?)</name>.*?' +
1191                 '<dateVideo>(.*?)</dateVideo>.*?' +
1192                 '<url quality="hd">(.*?)</url>',
1193             re.DOTALL,
1194             [
1195                 (1, 'id',    u'ERROR: could not extract video id: %s' % url),
1196                 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1197                 (3, 'date',  u'ERROR: could not extract video date: %s' % url),
1198                 (4, 'url',   u'ERROR: could not extract video url: %s' % url)
1199             ]
1200         )
1201
1202         return {
1203             'id':           info.get('id'),
1204             'url':          compat_urllib_parse.unquote(info.get('url')),
1205             'uploader':     u'arte.tv',
1206             'upload_date':  info.get('date'),
1207             'title':        info.get('title').decode('utf-8'),
1208             'ext':          u'mp4',
1209             'format':       u'NA',
1210             'player_url':   None,
1211         }
1212
1213     def _real_extract(self, url):
1214         video_id = url.split('/')[-1]
1215         self.report_extraction(video_id)
1216
1217         if re.search(self._LIVE_URL, video_id) is not None:
1218             self.extractLiveStream(url)
1219             return
1220         else:
1221             info = self.extractPlus7Stream(url)
1222
1223         return [info]
1224
1225
1226 class GenericIE(InfoExtractor):
1227     """Generic last-resort information extractor."""
1228
1229     _VALID_URL = r'.*'
1230     IE_NAME = u'generic'
1231
1232     def __init__(self, downloader=None):
1233         InfoExtractor.__init__(self, downloader)
1234
1235     def report_download_webpage(self, video_id):
1236         """Report webpage download."""
1237         self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1238         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1239
1240     def report_extraction(self, video_id):
1241         """Report information extraction."""
1242         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1243
1244     def report_following_redirect(self, new_url):
1245         """Report information extraction."""
1246         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1247
1248     def _test_redirect(self, url):
1249         """Check if it is a redirect, like url shorteners, in case restart chain."""
1250         class HeadRequest(compat_urllib_request.Request):
1251             def get_method(self):
1252                 return "HEAD"
1253
1254         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1255             """
1256             Subclass the HTTPRedirectHandler to make it use our
1257             HeadRequest also on the redirected URL
1258             """
1259             def redirect_request(self, req, fp, code, msg, headers, newurl):
1260                 if code in (301, 302, 303, 307):
1261                     newurl = newurl.replace(' ', '%20')
1262                     newheaders = dict((k,v) for k,v in req.headers.items()
1263                                       if k.lower() not in ("content-length", "content-type"))
1264                     return HeadRequest(newurl,
1265                                        headers=newheaders,
1266                                        origin_req_host=req.get_origin_req_host(),
1267                                        unverifiable=True)
1268                 else:
1269                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1270
1271         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1272             """
1273             Fallback to GET if HEAD is not allowed (405 HTTP error)
1274             """
1275             def http_error_405(self, req, fp, code, msg, headers):
1276                 fp.read()
1277                 fp.close()
1278
1279                 newheaders = dict((k,v) for k,v in req.headers.items()
1280                                   if k.lower() not in ("content-length", "content-type"))
1281                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1282                                                  headers=newheaders,
1283                                                  origin_req_host=req.get_origin_req_host(),
1284                                                  unverifiable=True))
1285
1286         # Build our opener
1287         opener = compat_urllib_request.OpenerDirector()
1288         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1289                         HTTPMethodFallback, HEADRedirectHandler,
1290                         compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1291             opener.add_handler(handler())
1292
1293         response = opener.open(HeadRequest(url))
1294         new_url = response.geturl()
1295
1296         if url == new_url:
1297             return False
1298
1299         self.report_following_redirect(new_url)
1300         self._downloader.download([new_url])
1301         return True
1302
1303     def _real_extract(self, url):
1304         if self._test_redirect(url): return
1305
1306         video_id = url.split('/')[-1]
1307         request = compat_urllib_request.Request(url)
1308         try:
1309             self.report_download_webpage(video_id)
1310             webpage = compat_urllib_request.urlopen(request).read()
1311         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1312             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1313             return
1314         except ValueError as err:
1315             # since this is the last-resort InfoExtractor, if
1316             # this error is thrown, it'll be thrown here
1317             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1318             return
1319
1320         self.report_extraction(video_id)
1321         # Start with something easy: JW Player in SWFObject
1322         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1323         if mobj is None:
1324             # Broaden the search a little bit
1325             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1326         if mobj is None:
1327             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1328             return
1329
1330         # It's possible that one of the regexes
1331         # matched, but returned an empty group:
1332         if mobj.group(1) is None:
1333             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1334             return
1335
1336         video_url = compat_urllib_parse.unquote(mobj.group(1))
1337         video_id = os.path.basename(video_url)
1338
1339         # here's a fun little line of code for you:
1340         video_extension = os.path.splitext(video_id)[1][1:]
1341         video_id = os.path.splitext(video_id)[0]
1342
1343         # it's tempting to parse this further, but you would
1344         # have to take into account all the variations like
1345         #   Video Title - Site Name
1346         #   Site Name | Video Title
1347         #   Video Title - Tagline | Site Name
1348         # and so on and so forth; it's just not practical
1349         mobj = re.search(r'<title>(.*)</title>', webpage)
1350         if mobj is None:
1351             self._downloader.trouble(u'ERROR: unable to extract title')
1352             return
1353         video_title = mobj.group(1)
1354
1355         # video uploader is domain name
1356         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1357         if mobj is None:
1358             self._downloader.trouble(u'ERROR: unable to extract title')
1359             return
1360         video_uploader = mobj.group(1)
1361
1362         return [{
1363             'id':       video_id,
1364             'url':      video_url,
1365             'uploader': video_uploader,
1366             'upload_date':  None,
1367             'title':    video_title,
1368             'ext':      video_extension,
1369         }]
1370
1371
1372 class YoutubeSearchIE(InfoExtractor):
1373     """Information Extractor for YouTube search queries."""
1374     _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1375     _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1376     _max_youtube_results = 1000
1377     IE_NAME = u'youtube:search'
1378
1379     def __init__(self, downloader=None):
1380         InfoExtractor.__init__(self, downloader)
1381
1382     def report_download_page(self, query, pagenum):
1383         """Report attempt to download search page with given number."""
1384         query = query.decode(preferredencoding())
1385         self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1386
1387     def _real_extract(self, query):
1388         mobj = re.match(self._VALID_URL, query)
1389         if mobj is None:
1390             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1391             return
1392
1393         prefix, query = query.split(':')
1394         prefix = prefix[8:]
1395         query = query.encode('utf-8')
1396         if prefix == '':
1397             self._download_n_results(query, 1)
1398             return
1399         elif prefix == 'all':
1400             self._download_n_results(query, self._max_youtube_results)
1401             return
1402         else:
1403             try:
1404                 n = int(prefix)
1405                 if n <= 0:
1406                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1407                     return
1408                 elif n > self._max_youtube_results:
1409                     self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1410                     n = self._max_youtube_results
1411                 self._download_n_results(query, n)
1412                 return
1413             except ValueError: # parsing prefix as integer fails
1414                 self._download_n_results(query, 1)
1415                 return
1416
1417     def _download_n_results(self, query, n):
1418         """Downloads a specified number of results for a query"""
1419
1420         video_ids = []
1421         pagenum = 0
1422         limit = n
1423
1424         while (50 * pagenum) < limit:
1425             self.report_download_page(query, pagenum+1)
1426             result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1427             request = compat_urllib_request.Request(result_url)
1428             try:
1429                 data = compat_urllib_request.urlopen(request).read()
1430             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1431                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1432                 return
1433             api_response = json.loads(data)['data']
1434
1435             new_ids = list(video['id'] for video in api_response['items'])
1436             video_ids += new_ids
1437
1438             limit = min(n, api_response['totalItems'])
1439             pagenum += 1
1440
1441         if len(video_ids) > n:
1442             video_ids = video_ids[:n]
1443         for id in video_ids:
1444             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1445         return
1446
1447
1448 class GoogleSearchIE(InfoExtractor):
1449     """Information Extractor for Google Video search queries."""
1450     _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1451     _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1452     _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1453     _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1454     _max_google_results = 1000
1455     IE_NAME = u'video.google:search'
1456
1457     def __init__(self, downloader=None):
1458         InfoExtractor.__init__(self, downloader)
1459
1460     def report_download_page(self, query, pagenum):
1461         """Report attempt to download playlist page with given number."""
1462         query = query.decode(preferredencoding())
1463         self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1464
1465     def _real_extract(self, query):
1466         mobj = re.match(self._VALID_URL, query)
1467         if mobj is None:
1468             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1469             return
1470
1471         prefix, query = query.split(':')
1472         prefix = prefix[8:]
1473         query = query.encode('utf-8')
1474         if prefix == '':
1475             self._download_n_results(query, 1)
1476             return
1477         elif prefix == 'all':
1478             self._download_n_results(query, self._max_google_results)
1479             return
1480         else:
1481             try:
1482                 n = int(prefix)
1483                 if n <= 0:
1484                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1485                     return
1486                 elif n > self._max_google_results:
1487                     self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1488                     n = self._max_google_results
1489                 self._download_n_results(query, n)
1490                 return
1491             except ValueError: # parsing prefix as integer fails
1492                 self._download_n_results(query, 1)
1493                 return
1494
1495     def _download_n_results(self, query, n):
1496         """Downloads a specified number of results for a query"""
1497
1498         video_ids = []
1499         pagenum = 0
1500
1501         while True:
1502             self.report_download_page(query, pagenum)
1503             result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1504             request = compat_urllib_request.Request(result_url)
1505             try:
1506                 page = compat_urllib_request.urlopen(request).read()
1507             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1508                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1509                 return
1510
1511             # Extract video identifiers
1512             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1513                 video_id = mobj.group(1)
1514                 if video_id not in video_ids:
1515                     video_ids.append(video_id)
1516                     if len(video_ids) == n:
1517                         # Specified n videos reached
1518                         for id in video_ids:
1519                             self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1520                         return
1521
1522             if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1523                 for id in video_ids:
1524                     self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1525                 return
1526
1527             pagenum = pagenum + 1
1528
1529
1530 class YahooSearchIE(InfoExtractor):
1531     """Information Extractor for Yahoo! Video search queries."""
1532
1533     _WORKING = False
1534     _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1535     _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1536     _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1537     _MORE_PAGES_INDICATOR = r'\s*Next'
1538     _max_yahoo_results = 1000
1539     IE_NAME = u'video.yahoo:search'
1540
1541     def __init__(self, downloader=None):
1542         InfoExtractor.__init__(self, downloader)
1543
1544     def report_download_page(self, query, pagenum):
1545         """Report attempt to download playlist page with given number."""
1546         query = query.decode(preferredencoding())
1547         self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1548
1549     def _real_extract(self, query):
1550         mobj = re.match(self._VALID_URL, query)
1551         if mobj is None:
1552             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1553             return
1554
1555         prefix, query = query.split(':')
1556         prefix = prefix[8:]
1557         query = query.encode('utf-8')
1558         if prefix == '':
1559             self._download_n_results(query, 1)
1560             return
1561         elif prefix == 'all':
1562             self._download_n_results(query, self._max_yahoo_results)
1563             return
1564         else:
1565             try:
1566                 n = int(prefix)
1567                 if n <= 0:
1568                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1569                     return
1570                 elif n > self._max_yahoo_results:
1571                     self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1572                     n = self._max_yahoo_results
1573                 self._download_n_results(query, n)
1574                 return
1575             except ValueError: # parsing prefix as integer fails
1576                 self._download_n_results(query, 1)
1577                 return
1578
1579     def _download_n_results(self, query, n):
1580         """Downloads a specified number of results for a query"""
1581
1582         video_ids = []
1583         already_seen = set()
1584         pagenum = 1
1585
1586         while True:
1587             self.report_download_page(query, pagenum)
1588             result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1589             request = compat_urllib_request.Request(result_url)
1590             try:
1591                 page = compat_urllib_request.urlopen(request).read()
1592             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1593                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1594                 return
1595
1596             # Extract video identifiers
1597             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1598                 video_id = mobj.group(1)
1599                 if video_id not in already_seen:
1600                     video_ids.append(video_id)
1601                     already_seen.add(video_id)
1602                     if len(video_ids) == n:
1603                         # Specified n videos reached
1604                         for id in video_ids:
1605                             self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1606                         return
1607
1608             if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1609                 for id in video_ids:
1610                     self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1611                 return
1612
1613             pagenum = pagenum + 1
1614
1615
1616 class YoutubePlaylistIE(InfoExtractor):
1617     """Information Extractor for YouTube playlists."""
1618
1619     _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
1620     _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1621     _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
1622     _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1623     IE_NAME = u'youtube:playlist'
1624
1625     def __init__(self, downloader=None):
1626         InfoExtractor.__init__(self, downloader)
1627
1628     def report_download_page(self, playlist_id, pagenum):
1629         """Report attempt to download playlist page with given number."""
1630         self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1631
1632     def _real_extract(self, url):
1633         # Extract playlist id
1634         mobj = re.match(self._VALID_URL, url)
1635         if mobj is None:
1636             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1637             return
1638
1639         # Single video case
1640         if mobj.group(3) is not None:
1641             self._downloader.download([mobj.group(3)])
1642             return
1643
1644         # Download playlist pages
1645         # prefix is 'p' as default for playlists but there are other types that need extra care
1646         playlist_prefix = mobj.group(1)
1647         if playlist_prefix == 'a':
1648             playlist_access = 'artist'
1649         else:
1650             playlist_prefix = 'p'
1651             playlist_access = 'view_play_list'
1652         playlist_id = mobj.group(2)
1653         video_ids = []
1654         pagenum = 1
1655
1656         while True:
1657             self.report_download_page(playlist_id, pagenum)
1658             url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1659             request = compat_urllib_request.Request(url)
1660             try:
1661                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1662             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1663                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1664                 return
1665
1666             # Extract video identifiers
1667             ids_in_page = []
1668             for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1669                 if mobj.group(1) not in ids_in_page:
1670                     ids_in_page.append(mobj.group(1))
1671             video_ids.extend(ids_in_page)
1672
1673             if self._MORE_PAGES_INDICATOR not in page:
1674                 break
1675             pagenum = pagenum + 1
1676
1677         total = len(video_ids)
1678
1679         playliststart = self._downloader.params.get('playliststart', 1) - 1
1680         playlistend = self._downloader.params.get('playlistend', -1)
1681         if playlistend == -1:
1682             video_ids = video_ids[playliststart:]
1683         else:
1684             video_ids = video_ids[playliststart:playlistend]
1685
1686         if len(video_ids) == total:
1687             self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1688         else:
1689             self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))
1690
1691         for id in video_ids:
1692             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1693         return
1694
1695
1696 class YoutubeChannelIE(InfoExtractor):
1697     """Information Extractor for YouTube channels."""
1698
1699     _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1700     _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1701     _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1702     IE_NAME = u'youtube:channel'
1703
1704     def report_download_page(self, channel_id, pagenum):
1705         """Report attempt to download channel page with given number."""
1706         self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1707
1708     def _real_extract(self, url):
1709         # Extract channel id
1710         mobj = re.match(self._VALID_URL, url)
1711         if mobj is None:
1712             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1713             return
1714
1715         # Download channel pages
1716         channel_id = mobj.group(1)
1717         video_ids = []
1718         pagenum = 1
1719
1720         while True:
1721             self.report_download_page(channel_id, pagenum)
1722             url = self._TEMPLATE_URL % (channel_id, pagenum)
1723             request = compat_urllib_request.Request(url)
1724             try:
1725                 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1726             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1727                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1728                 return
1729
1730             # Extract video identifiers
1731             ids_in_page = []
1732             for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1733                 if mobj.group(1) not in ids_in_page:
1734                     ids_in_page.append(mobj.group(1))
1735             video_ids.extend(ids_in_page)
1736
1737             if self._MORE_PAGES_INDICATOR not in page:
1738                 break
1739             pagenum = pagenum + 1
1740
1741         self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1742
1743         for id in video_ids:
1744             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1745         return
1746
1747
1748 class YoutubeUserIE(InfoExtractor):
1749     """Information Extractor for YouTube users."""
1750
1751     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1752     _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1753     _GDATA_PAGE_SIZE = 50
1754     _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1755     _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1756     IE_NAME = u'youtube:user'
1757
1758     def __init__(self, downloader=None):
1759         InfoExtractor.__init__(self, downloader)
1760
1761     def report_download_page(self, username, start_index):
1762         """Report attempt to download user page."""
1763         self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1764                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1765
1766     def _real_extract(self, url):
1767         # Extract username
1768         mobj = re.match(self._VALID_URL, url)
1769         if mobj is None:
1770             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1771             return
1772
1773         username = mobj.group(1)
1774
1775         # Download video ids using YouTube Data API. Result size per
1776         # query is limited (currently to 50 videos) so we need to query
1777         # page by page until there are no video ids - it means we got
1778         # all of them.
1779
1780         video_ids = []
1781         pagenum = 0
1782
1783         while True:
1784             start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1785             self.report_download_page(username, start_index)
1786
1787             request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1788
1789             try:
1790                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1791             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1792                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1793                 return
1794
1795             # Extract video identifiers
1796             ids_in_page = []
1797
1798             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1799                 if mobj.group(1) not in ids_in_page:
1800                     ids_in_page.append(mobj.group(1))
1801
1802             video_ids.extend(ids_in_page)
1803
1804             # A little optimization - if current page is not
1805             # "full", ie. does not contain PAGE_SIZE video ids then
1806             # we can assume that this page is the last one - there
1807             # are no more ids on further pages - no need to query
1808             # again.
1809
1810             if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1811                 break
1812
1813             pagenum += 1
1814
1815         all_ids_count = len(video_ids)
1816         playliststart = self._downloader.params.get('playliststart', 1) - 1
1817         playlistend = self._downloader.params.get('playlistend', -1)
1818
1819         if playlistend == -1:
1820             video_ids = video_ids[playliststart:]
1821         else:
1822             video_ids = video_ids[playliststart:playlistend]
1823
1824         self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1825                 (username, all_ids_count, len(video_ids)))
1826
1827         for video_id in video_ids:
1828             self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1829
1830
1831 class BlipTVUserIE(InfoExtractor):
1832     """Information Extractor for blip.tv users."""
1833
1834     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1835     _PAGE_SIZE = 12
1836     IE_NAME = u'blip.tv:user'
1837
1838     def __init__(self, downloader=None):
1839         InfoExtractor.__init__(self, downloader)
1840
1841     def report_download_page(self, username, pagenum):
1842         """Report attempt to download user page."""
1843         self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1844                 (self.IE_NAME, username, pagenum))
1845
1846     def _real_extract(self, url):
1847         # Extract username
1848         mobj = re.match(self._VALID_URL, url)
1849         if mobj is None:
1850             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1851             return
1852
1853         username = mobj.group(1)
1854
1855         page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1856
1857         request = compat_urllib_request.Request(url)
1858
1859         try:
1860             page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1861             mobj = re.search(r'data-users-id="([^"]+)"', page)
1862             page_base = page_base % mobj.group(1)
1863         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1864             self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1865             return
1866
1867
1868         # Download video ids using BlipTV Ajax calls. Result size per
1869         # query is limited (currently to 12 videos) so we need to query
1870         # page by page until there are no video ids - it means we got
1871         # all of them.
1872
1873         video_ids = []
1874         pagenum = 1
1875
1876         while True:
1877             self.report_download_page(username, pagenum)
1878
1879             request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )
1880
1881             try:
1882                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1883             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1884                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1885                 return
1886
1887             # Extract video identifiers
1888             ids_in_page = []
1889
1890             for mobj in re.finditer(r'href="/([^"]+)"', page):
1891                 if mobj.group(1) not in ids_in_page:
1892                     ids_in_page.append(unescapeHTML(mobj.group(1)))
1893
1894             video_ids.extend(ids_in_page)
1895
1896             # A little optimization - if current page is not
1897             # "full", ie. does not contain PAGE_SIZE video ids then
1898             # we can assume that this page is the last one - there
1899             # are no more ids on further pages - no need to query
1900             # again.
1901
1902             if len(ids_in_page) < self._PAGE_SIZE:
1903                 break
1904
1905             pagenum += 1
1906
1907         all_ids_count = len(video_ids)
1908         playliststart = self._downloader.params.get('playliststart', 1) - 1
1909         playlistend = self._downloader.params.get('playlistend', -1)
1910
1911         if playlistend == -1:
1912             video_ids = video_ids[playliststart:]
1913         else:
1914             video_ids = video_ids[playliststart:playlistend]
1915
1916         self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1917                 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1918
1919         for video_id in video_ids:
1920             self._downloader.download([u'http://blip.tv/'+video_id])
1921
1922
1923 class DepositFilesIE(InfoExtractor):
1924     """Information extractor for depositfiles.com"""
1925
1926     _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1927
1928     def report_download_webpage(self, file_id):
1929         """Report webpage download."""
1930         self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1931
1932     def report_extraction(self, file_id):
1933         """Report information extraction."""
1934         self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1935
1936     def _real_extract(self, url):
1937         file_id = url.split('/')[-1]
1938         # Rebuild url in english locale
1939         url = 'http://depositfiles.com/en/files/' + file_id
1940
1941         # Retrieve file webpage with 'Free download' button pressed
1942         free_download_indication = { 'gateway_result' : '1' }
1943         request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1944         try:
1945             self.report_download_webpage(file_id)
1946             webpage = compat_urllib_request.urlopen(request).read()
1947         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1948             self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1949             return
1950
1951         # Search for the real file URL
1952         mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1953         if (mobj is None) or (mobj.group(1) is None):
1954             # Try to figure out reason of the error.
1955             mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1956             if (mobj is not None) and (mobj.group(1) is not None):
1957                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1958                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1959             else:
1960                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1961             return
1962
1963         file_url = mobj.group(1)
1964         file_extension = os.path.splitext(file_url)[1][1:]
1965
1966         # Search for file title
1967         mobj = re.search(r'<b title="(.*?)">', webpage)
1968         if mobj is None:
1969             self._downloader.trouble(u'ERROR: unable to extract title')
1970             return
1971         file_title = mobj.group(1).decode('utf-8')
1972
1973         return [{
1974             'id':       file_id.decode('utf-8'),
1975             'url':      file_url.decode('utf-8'),
1976             'uploader': None,
1977             'upload_date':  None,
1978             'title':    file_title,
1979             'ext':      file_extension.decode('utf-8'),
1980         }]
1981
1982
1983 class FacebookIE(InfoExtractor):
1984     """Information Extractor for Facebook"""
1985
1986     _WORKING = False
1987     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1988     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1989     _NETRC_MACHINE = 'facebook'
1990     _available_formats = ['video', 'highqual', 'lowqual']
1991     _video_extensions = {
1992         'video': 'mp4',
1993         'highqual': 'mp4',
1994         'lowqual': 'mp4',
1995     }
1996     IE_NAME = u'facebook'
1997
1998     def __init__(self, downloader=None):
1999         InfoExtractor.__init__(self, downloader)
2000
2001     def _reporter(self, message):
2002         """Add header and report message."""
2003         self._downloader.to_screen(u'[facebook] %s' % message)
2004
2005     def report_login(self):
2006         """Report attempt to log in."""
2007         self._reporter(u'Logging in')
2008
2009     def report_video_webpage_download(self, video_id):
2010         """Report attempt to download video webpage."""
2011         self._reporter(u'%s: Downloading video webpage' % video_id)
2012
2013     def report_information_extraction(self, video_id):
2014         """Report attempt to extract video information."""
2015         self._reporter(u'%s: Extracting video information' % video_id)
2016
2017     def _parse_page(self, video_webpage):
2018         """Extract video information from page"""
2019         # General data
2020         data = {'title': r'\("video_title", "(.*?)"\)',
2021             'description': r'<div class="datawrap">(.*?)</div>',
2022             'owner': r'\("video_owner_name", "(.*?)"\)',
2023             'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2024             }
2025         video_info = {}
2026         for piece in data.keys():
2027             mobj = re.search(data[piece], video_webpage)
2028             if mobj is not None:
2029                 video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2030
2031         # Video urls
2032         video_urls = {}
2033         for fmt in self._available_formats:
2034             mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2035             if mobj is not None:
2036                 # URL is in a Javascript segment inside an escaped Unicode format within
2037                 # the generally utf-8 page
2038                 video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2039         video_info['video_urls'] = video_urls
2040
2041         return video_info
2042
2043     def _real_initialize(self):
2044         if self._downloader is None:
2045             return
2046
2047         useremail = None
2048         password = None
2049         downloader_params = self._downloader.params
2050
2051         # Attempt to use provided username and password or .netrc data
2052         if downloader_params.get('username', None) is not None:
2053             useremail = downloader_params['username']
2054             password = downloader_params['password']
2055         elif downloader_params.get('usenetrc', False):
2056             try:
2057                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2058                 if info is not None:
2059                     useremail = info[0]
2060                     password = info[2]
2061                 else:
2062                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2063             except (IOError, netrc.NetrcParseError) as err:
2064                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2065                 return
2066
2067         if useremail is None:
2068             return
2069
2070         # Log in
2071         login_form = {
2072             'email': useremail,
2073             'pass': password,
2074             'login': 'Log+In'
2075             }
2076         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2077         try:
2078             self.report_login()
2079             login_results = compat_urllib_request.urlopen(request).read()
2080             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2081                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2082                 return
2083         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2084             self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2085             return
2086
2087     def _real_extract(self, url):
2088         mobj = re.match(self._VALID_URL, url)
2089         if mobj is None:
2090             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2091             return
2092         video_id = mobj.group('ID')
2093
2094         # Get video webpage
2095         self.report_video_webpage_download(video_id)
2096         request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2097         try:
2098             page = compat_urllib_request.urlopen(request)
2099             video_webpage = page.read()
2100         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2101             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2102             return
2103
2104         # Start extracting information
2105         self.report_information_extraction(video_id)
2106
2107         # Extract information
2108         video_info = self._parse_page(video_webpage)
2109
2110         # uploader
2111         if 'owner' not in video_info:
2112             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2113             return
2114         video_uploader = video_info['owner']
2115
2116         # title
2117         if 'title' not in video_info:
2118             self._downloader.trouble(u'ERROR: unable to extract video title')
2119             return
2120         video_title = video_info['title']
2121         video_title = video_title.decode('utf-8')
2122
2123         # thumbnail image
2124         if 'thumbnail' not in video_info:
2125             self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2126             video_thumbnail = ''
2127         else:
2128             video_thumbnail = video_info['thumbnail']
2129
2130         # upload date
2131         upload_date = None
2132         if 'upload_date' in video_info:
2133             upload_time = video_info['upload_date']
2134             timetuple = email.utils.parsedate_tz(upload_time)
2135             if timetuple is not None:
2136                 try:
2137                     upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2138                 except:
2139                     pass
2140
2141         # description
2142         video_description = video_info.get('description', 'No description available.')
2143
2144         url_map = video_info['video_urls']
2145         if url_map:
2146             # Decide which formats to download
2147             req_format = self._downloader.params.get('format', None)
2148             format_limit = self._downloader.params.get('format_limit', None)
2149
2150             if format_limit is not None and format_limit in self._available_formats:
2151                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2152             else:
2153                 format_list = self._available_formats
2154             existing_formats = [x for x in format_list if x in url_map]
2155             if len(existing_formats) == 0:
2156                 self._downloader.trouble(u'ERROR: no known formats available for video')
2157                 return
2158             if req_format is None:
2159                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2160             elif req_format == 'worst':
2161                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2162             elif req_format == '-1':
2163                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2164             else:
2165                 # Specific format
2166                 if req_format not in url_map:
2167                     self._downloader.trouble(u'ERROR: requested format not available')
2168                     return
2169                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2170
2171         results = []
2172         for format_param, video_real_url in video_url_list:
2173             # Extension
2174             video_extension = self._video_extensions.get(format_param, 'mp4')
2175
2176             results.append({
2177                 'id':       video_id.decode('utf-8'),
2178                 'url':      video_real_url.decode('utf-8'),
2179                 'uploader': video_uploader.decode('utf-8'),
2180                 'upload_date':  upload_date,
2181                 'title':    video_title,
2182                 'ext':      video_extension.decode('utf-8'),
2183                 'format':   (format_param is None and u'NA' or format_param.decode('utf-8')),
2184                 'thumbnail':    video_thumbnail.decode('utf-8'),
2185                 'description':  video_description.decode('utf-8'),
2186             })
2187         return results
2188
2189 class BlipTVIE(InfoExtractor):
2190     """Information extractor for blip.tv"""
2191
2192     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2193     _URL_EXT = r'^.*\.([a-z0-9]+)$'
2194     IE_NAME = u'blip.tv'
2195
2196     def report_extraction(self, file_id):
2197         """Report information extraction."""
2198         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2199
2200     def report_direct_download(self, title):
2201         """Report information extraction."""
2202         self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2203
2204     def _real_extract(self, url):
2205         mobj = re.match(self._VALID_URL, url)
2206         if mobj is None:
2207             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2208             return
2209
2210         if '?' in url:
2211             cchar = '&'
2212         else:
2213             cchar = '?'
2214         json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2215         request = compat_urllib_request.Request(json_url)
2216         self.report_extraction(mobj.group(1))
2217         info = None
2218         try:
2219             urlh = compat_urllib_request.urlopen(request)
2220             if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2221                 basename = url.split('/')[-1]
2222                 title,ext = os.path.splitext(basename)
2223                 title = title.decode('UTF-8')
2224                 ext = ext.replace('.', '')
2225                 self.report_direct_download(title)
2226                 info = {
2227                     'id': title,
2228                     'url': url,
2229                     'uploader': None,
2230                     'upload_date': None,
2231                     'title': title,
2232                     'ext': ext,
2233                     'urlhandle': urlh
2234                 }
2235         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2236             self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2237             return
2238         if info is None: # Regular URL
2239             try:
2240                 json_code_bytes = urlh.read()
2241                 json_code = json_code_bytes.decode('utf-8')
2242             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2243                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2244                 return
2245
2246             try:
2247                 json_data = json.loads(json_code)
2248                 if 'Post' in json_data:
2249                     data = json_data['Post']
2250                 else:
2251                     data = json_data
2252
2253                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2254                 video_url = data['media']['url']
2255                 umobj = re.match(self._URL_EXT, video_url)
2256                 if umobj is None:
2257                     raise ValueError('Can not determine filename extension')
2258                 ext = umobj.group(1)
2259
2260                 info = {
2261                     'id': data['item_id'],
2262                     'url': video_url,
2263                     'uploader': data['display_name'],
2264                     'upload_date': upload_date,
2265                     'title': data['title'],
2266                     'ext': ext,
2267                     'format': data['media']['mimeType'],
2268                     'thumbnail': data['thumbnailUrl'],
2269                     'description': data['description'],
2270                     'player_url': data['embedUrl']
2271                 }
2272             except (ValueError,KeyError) as err:
2273                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2274                 return
2275
2276         std_headers['User-Agent'] = 'iTunes/10.6.1'
2277         return [info]
2278
2279
2280 class MyVideoIE(InfoExtractor):
2281     """Information Extractor for myvideo.de."""
2282
2283     _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2284     IE_NAME = u'myvideo'
2285
2286     def __init__(self, downloader=None):
2287         InfoExtractor.__init__(self, downloader)
2288
2289     def report_download_webpage(self, video_id):
2290         """Report webpage download."""
2291         self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2292
2293     def report_extraction(self, video_id):
2294         """Report information extraction."""
2295         self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2296
2297     def _real_extract(self,url):
2298         mobj = re.match(self._VALID_URL, url)
2299         if mobj is None:
2300             self._download.trouble(u'ERROR: invalid URL: %s' % url)
2301             return
2302
2303         video_id = mobj.group(1)
2304
2305         # Get video webpage
2306         request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
2307         try:
2308             self.report_download_webpage(video_id)
2309             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
2310         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2311             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
2312             return
2313
2314         self.report_extraction(video_id)
2315         mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2316                  webpage)
2317         if mobj is None:
2318             self._downloader.trouble(u'ERROR: unable to extract media URL')
2319             return
2320         video_url = mobj.group(1) + ('/%s.flv' % video_id)
2321
2322         mobj = re.search('<title>([^<]+)</title>', webpage)
2323         if mobj is None:
2324             self._downloader.trouble(u'ERROR: unable to extract title')
2325             return
2326
2327         video_title = mobj.group(1)
2328
2329         return [{
2330             'id':       video_id,
2331             'url':      video_url,
2332             'uploader': None,
2333             'upload_date':  None,
2334             'title':    video_title,
2335             'ext':      u'flv',
2336         }]
2337
2338 class ComedyCentralIE(InfoExtractor):
2339     """Information extractor for The Daily Show and Colbert Report """
2340
2341     # urls can be abbreviations like :thedailyshow or :colbert
2342     # urls for episodes like:
2343     # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2344     #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2345     #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2346     _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2347                       |(https?://)?(www\.)?
2348                           (?P<showname>thedailyshow|colbertnation)\.com/
2349                          (full-episodes/(?P<episode>.*)|
2350                           (?P<clip>
2351                               (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2352                               |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2353                      $"""
2354     IE_NAME = u'comedycentral'
2355
2356     _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2357
2358     _video_extensions = {
2359         '3500': 'mp4',
2360         '2200': 'mp4',
2361         '1700': 'mp4',
2362         '1200': 'mp4',
2363         '750': 'mp4',
2364         '400': 'mp4',
2365     }
2366     _video_dimensions = {
2367         '3500': '1280x720',
2368         '2200': '960x540',
2369         '1700': '768x432',
2370         '1200': '640x360',
2371         '750': '512x288',
2372         '400': '384x216',
2373     }
2374
2375     def suitable(self, url):
2376         """Receives a URL and returns True if suitable for this IE."""
2377         return re.match(self._VALID_URL, url, re.VERBOSE) is not None
2378
2379     def report_extraction(self, episode_id):
2380         self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2381
2382     def report_config_download(self, episode_id):
2383         self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2384
2385     def report_index_download(self, episode_id):
2386         self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2387
2388     def report_player_url(self, episode_id):
2389         self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2390
2391
2392     def _print_formats(self, formats):
2393         print('Available formats:')
2394         for x in formats:
2395             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2396
2397
2398     def _real_extract(self, url):
2399         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2400         if mobj is None:
2401             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2402             return
2403
2404         if mobj.group('shortname'):
2405             if mobj.group('shortname') in ('tds', 'thedailyshow'):
2406                 url = u'http://www.thedailyshow.com/full-episodes/'
2407             else:
2408                 url = u'http://www.colbertnation.com/full-episodes/'
2409             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2410             assert mobj is not None
2411
2412         if mobj.group('clip'):
2413             if mobj.group('showname') == 'thedailyshow':
2414                 epTitle = mobj.group('tdstitle')
2415             else:
2416                 epTitle = mobj.group('cntitle')
2417             dlNewest = False
2418         else:
2419             dlNewest = not mobj.group('episode')
2420             if dlNewest:
2421                 epTitle = mobj.group('showname')
2422             else:
2423                 epTitle = mobj.group('episode')
2424
2425         req = compat_urllib_request.Request(url)
2426         self.report_extraction(epTitle)
2427         try:
2428             htmlHandle = compat_urllib_request.urlopen(req)
2429             html = htmlHandle.read()
2430         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2431             self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2432             return
2433         if dlNewest:
2434             url = htmlHandle.geturl()
2435             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2436             if mobj is None:
2437                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2438                 return
2439             if mobj.group('episode') == '':
2440                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2441                 return
2442             epTitle = mobj.group('episode')
2443
2444         mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)
2445
2446         if len(mMovieParams) == 0:
2447             # The Colbert Report embeds the information in a without
2448             # a URL prefix; so extract the alternate reference
2449             # and then add the URL prefix manually.
2450
2451             altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
2452             if len(altMovieParams) == 0:
2453                 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2454                 return
2455             else:
2456                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2457
2458         playerUrl_raw = mMovieParams[0][0]
2459         self.report_player_url(epTitle)
2460         try:
2461             urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
2462             playerUrl = urlHandle.geturl()
2463         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2464             self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
2465             return
2466
2467         uri = mMovieParams[0][1]
2468         indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2469         self.report_index_download(epTitle)
2470         try:
2471             indexXml = compat_urllib_request.urlopen(indexUrl).read()
2472         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2473             self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
2474             return
2475
2476         results = []
2477
2478         idoc = xml.etree.ElementTree.fromstring(indexXml)
2479         itemEls = idoc.findall('.//item')
2480         for itemEl in itemEls:
2481             mediaId = itemEl.findall('./guid')[0].text
2482             shortMediaId = mediaId.split(':')[-1]
2483             showId = mediaId.split(':')[-2].replace('.com', '')
2484             officialTitle = itemEl.findall('./title')[0].text
2485             officialDate = itemEl.findall('./pubDate')[0].text
2486
2487             configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2488                         compat_urllib_parse.urlencode({'uri': mediaId}))
2489             configReq = compat_urllib_request.Request(configUrl)
2490             self.report_config_download(epTitle)
2491             try:
2492                 configXml = compat_urllib_request.urlopen(configReq).read()
2493             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2494                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2495                 return
2496
2497             cdoc = xml.etree.ElementTree.fromstring(configXml)
2498             turls = []
2499             for rendition in cdoc.findall('.//rendition'):
2500                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2501                 turls.append(finfo)
2502
2503             if len(turls) == 0:
2504                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2505                 continue
2506
2507             if self._downloader.params.get('listformats', None):
2508                 self._print_formats([i[0] for i in turls])
2509                 return
2510
2511             # For now, just pick the highest bitrate
2512             format,video_url = turls[-1]
2513
2514             # Get the format arg from the arg stream
2515             req_format = self._downloader.params.get('format', None)
2516
2517             # Select format if we can find one
2518             for f,v in turls:
2519                 if f == req_format:
2520                     format, video_url = f, v
2521                     break
2522
2523             # Patch to download from alternative CDN, which does not
2524             # break on current RTMPDump builds
2525             broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
2526             better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
2527
2528             if video_url.startswith(broken_cdn):
2529                 video_url = video_url.replace(broken_cdn, better_cdn)
2530
2531             effTitle = showId + u'-' + epTitle
2532             info = {
2533                 'id': shortMediaId,
2534                 'url': video_url,
2535                 'uploader': showId,
2536                 'upload_date': officialDate,
2537                 'title': effTitle,
2538                 'ext': 'mp4',
2539                 'format': format,
2540                 'thumbnail': None,
2541                 'description': officialTitle,
2542                 'player_url': None #playerUrl
2543             }
2544
2545             results.append(info)
2546
2547         return results
2548
2549
2550 class EscapistIE(InfoExtractor):
2551     """Information extractor for The Escapist """
2552
2553     _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2554     IE_NAME = u'escapist'
2555
2556     def report_extraction(self, showName):
2557         self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2558
2559     def report_config_download(self, showName):
2560         self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2561
2562     def _real_extract(self, url):
2563         mobj = re.match(self._VALID_URL, url)
2564         if mobj is None:
2565             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2566             return
2567         showName = mobj.group('showname')
2568         videoId = mobj.group('episode')
2569
2570         self.report_extraction(showName)
2571         try:
2572             webPage = compat_urllib_request.urlopen(url)
2573             webPageBytes = webPage.read()
2574             m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2575             webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2576         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2577             self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
2578             return
2579
2580         descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2581         description = unescapeHTML(descMatch.group(1))
2582         imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2583         imgUrl = unescapeHTML(imgMatch.group(1))
2584         playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2585         playerUrl = unescapeHTML(playerUrlMatch.group(1))
2586         configUrlMatch = re.search('config=(.*)$', playerUrl)
2587         configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2588
2589         self.report_config_download(showName)
2590         try:
2591             configJSON = compat_urllib_request.urlopen(configUrl)
2592             m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2593             configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2594         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2595             self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2596             return
2597
2598         # Technically, it's JavaScript, not JSON
2599         configJSON = configJSON.replace("'", '"')
2600
2601         try:
2602             config = json.loads(configJSON)
2603         except (ValueError,) as err:
2604             self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
2605             return
2606
2607         playlist = config['playlist']
2608         videoUrl = playlist[1]['url']
2609
2610         info = {
2611             'id': videoId,
2612             'url': videoUrl,
2613             'uploader': showName,
2614             'upload_date': None,
2615             'title': showName,
2616             'ext': 'flv',
2617             'thumbnail': imgUrl,
2618             'description': description,
2619             'player_url': playerUrl,
2620         }
2621
2622         return [info]
2623
2624
2625 class CollegeHumorIE(InfoExtractor):
2626     """Information extractor for collegehumor.com"""
2627
2628     _WORKING = False
2629     _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2630     IE_NAME = u'collegehumor'
2631
2632     def report_manifest(self, video_id):
2633         """Report information extraction."""
2634         self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2635
2636     def report_extraction(self, video_id):
2637         """Report information extraction."""
2638         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2639
2640     def _real_extract(self, url):
2641         mobj = re.match(self._VALID_URL, url)
2642         if mobj is None:
2643             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2644             return
2645         video_id = mobj.group('videoid')
2646
2647         info = {
2648             'id': video_id,
2649             'uploader': None,
2650             'upload_date': None,
2651         }
2652
2653         self.report_extraction(video_id)
2654         xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2655         try:
2656             metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2657         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2658             self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2659             return
2660
2661         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2662         try:
2663             videoNode = mdoc.findall('./video')[0]
2664             info['description'] = videoNode.findall('./description')[0].text
2665             info['title'] = videoNode.findall('./caption')[0].text
2666             info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2667             manifest_url = videoNode.findall('./file')[0].text
2668         except IndexError:
2669             self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2670             return
2671
2672         manifest_url += '?hdcore=2.10.3'
2673         self.report_manifest(video_id)
2674         try:
2675             manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2676         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2677             self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2678             return
2679
2680         adoc = xml.etree.ElementTree.fromstring(manifestXml)
2681         try:
2682             media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2683             node_id = media_node.attrib['url']
2684             video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2685         except IndexError as err:
2686             self._downloader.trouble(u'\nERROR: Invalid manifest file')
2687             return
2688
2689         url_pr = compat_urllib_parse_urlparse(manifest_url)
2690         url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2691
2692         info['url'] = url
2693         info['ext'] = 'f4f'
2694         return [info]
2695
2696
2697 class XVideosIE(InfoExtractor):
2698     """Information extractor for xvideos.com"""
2699
2700     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2701     IE_NAME = u'xvideos'
2702
2703     def report_webpage(self, video_id):
2704         """Report information extraction."""
2705         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2706
2707     def report_extraction(self, video_id):
2708         """Report information extraction."""
2709         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2710
2711     def _real_extract(self, url):
2712         mobj = re.match(self._VALID_URL, url)
2713         if mobj is None:
2714             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2715             return
2716         video_id = mobj.group(1)
2717
2718         self.report_webpage(video_id)
2719
2720         request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
2721         try:
2722             webpage_bytes = compat_urllib_request.urlopen(request).read()
2723             webpage = webpage_bytes.decode('utf-8', 'replace')
2724         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2725             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2726             return
2727
2728         self.report_extraction(video_id)
2729
2730
2731         # Extract video URL
2732         mobj = re.search(r'flv_url=(.+?)&', webpage)
2733         if mobj is None:
2734             self._downloader.trouble(u'ERROR: unable to extract video url')
2735             return
2736         video_url = compat_urllib_parse.unquote(mobj.group(1))
2737
2738
2739         # Extract title
2740         mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2741         if mobj is None:
2742             self._downloader.trouble(u'ERROR: unable to extract video title')
2743             return
2744         video_title = mobj.group(1)
2745
2746
2747         # Extract video thumbnail
2748         mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2749         if mobj is None:
2750             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2751             return
2752         video_thumbnail = mobj.group(0)
2753
2754         info = {
2755             'id': video_id,
2756             'url': video_url,
2757             'uploader': None,
2758             'upload_date': None,
2759             'title': video_title,
2760             'ext': 'flv',
2761             'thumbnail': video_thumbnail,
2762             'description': None,
2763         }
2764
2765         return [info]
2766
2767
2768 class SoundcloudIE(InfoExtractor):
2769     """Information extractor for soundcloud.com
2770        To access the media, the uid of the song and a stream token
2771        must be extracted from the page source and the script must make
2772        a request to media.soundcloud.com/crossdomain.xml. Then
2773        the media can be grabbed by requesting from an url composed
2774        of the stream token and uid
2775      """
2776
2777     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2778     IE_NAME = u'soundcloud'
2779
2780     def __init__(self, downloader=None):
2781         InfoExtractor.__init__(self, downloader)
2782
2783     def report_resolve(self, video_id):
2784         """Report information extraction."""
2785         self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2786
2787     def report_extraction(self, video_id):
2788         """Report information extraction."""
2789         self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2790
2791     def _real_extract(self, url):
2792         mobj = re.match(self._VALID_URL, url)
2793         if mobj is None:
2794             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2795             return
2796
2797         # extract uploader (which is in the url)
2798         uploader = mobj.group(1)
2799         # extract simple title (uploader + slug of song title)
2800         slug_title =  mobj.group(2)
2801         simple_title = uploader + u'-' + slug_title
2802
2803         self.report_resolve('%s/%s' % (uploader, slug_title))
2804
2805         url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2806         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2807         request = compat_urllib_request.Request(resolv_url)
2808         try:
2809             info_json_bytes = compat_urllib_request.urlopen(request).read()
2810             info_json = info_json_bytes.decode('utf-8')
2811         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2812             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2813             return
2814
2815         info = json.loads(info_json)
2816         video_id = info['id']
2817         self.report_extraction('%s/%s' % (uploader, slug_title))
2818
2819         streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2820         request = compat_urllib_request.Request(streams_url)
2821         try:
2822             stream_json_bytes = compat_urllib_request.urlopen(request).read()
2823             stream_json = stream_json_bytes.decode('utf-8')
2824         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2825             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2826             return
2827
2828         streams = json.loads(stream_json)
2829         mediaURL = streams['http_mp3_128_url']
2830
2831         return [{
2832             'id':       info['id'],
2833             'url':      mediaURL,
2834             'uploader': info['user']['username'],
2835             'upload_date':  info['created_at'],
2836             'title':    info['title'],
2837             'ext':      u'mp3',
2838             'description': info['description'],
2839         }]
2840
2841
2842 class InfoQIE(InfoExtractor):
2843     """Information extractor for infoq.com"""
2844
2845     _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2846     IE_NAME = u'infoq'
2847
2848     def report_webpage(self, video_id):
2849         """Report information extraction."""
2850         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2851
2852     def report_extraction(self, video_id):
2853         """Report information extraction."""
2854         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2855
2856     def _real_extract(self, url):
2857         mobj = re.match(self._VALID_URL, url)
2858         if mobj is None:
2859             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2860             return
2861
2862         self.report_webpage(url)
2863
2864         request = compat_urllib_request.Request(url)
2865         try:
2866             webpage = compat_urllib_request.urlopen(request).read()
2867         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2868             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2869             return
2870
2871         self.report_extraction(url)
2872
2873
2874         # Extract video URL
2875         mobj = re.search(r"jsclassref='([^']*)'", webpage)
2876         if mobj is None:
2877             self._downloader.trouble(u'ERROR: unable to extract video url')
2878             return
2879         video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(mobj.group(1).decode('base64'))
2880
2881
2882         # Extract title
2883         mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2884         if mobj is None:
2885             self._downloader.trouble(u'ERROR: unable to extract video title')
2886             return
2887         video_title = mobj.group(1).decode('utf-8')
2888
2889         # Extract description
2890         video_description = u'No description available.'
2891         mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2892         if mobj is not None:
2893             video_description = mobj.group(1).decode('utf-8')
2894
2895         video_filename = video_url.split('/')[-1]
2896         video_id, extension = video_filename.split('.')
2897
2898         info = {
2899             'id': video_id,
2900             'url': video_url,
2901             'uploader': None,
2902             'upload_date': None,
2903             'title': video_title,
2904             'ext': extension, # Extension is always(?) mp4, but seems to be flv
2905             'thumbnail': None,
2906             'description': video_description,
2907         }
2908
2909         return [info]
2910
2911 class MixcloudIE(InfoExtractor):
2912     """Information extractor for www.mixcloud.com"""
2913
2914     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2915     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2916     IE_NAME = u'mixcloud'
2917
2918     def __init__(self, downloader=None):
2919         InfoExtractor.__init__(self, downloader)
2920
2921     def report_download_json(self, file_id):
2922         """Report JSON download."""
2923         self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2924
2925     def report_extraction(self, file_id):
2926         """Report information extraction."""
2927         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2928
2929     def get_urls(self, jsonData, fmt, bitrate='best'):
2930         """Get urls from 'audio_formats' section in json"""
2931         file_url = None
2932         try:
2933             bitrate_list = jsonData[fmt]
2934             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2935                 bitrate = max(bitrate_list) # select highest
2936
2937             url_list = jsonData[fmt][bitrate]
2938         except TypeError: # we have no bitrate info.
2939             url_list = jsonData[fmt]
2940         return url_list
2941
2942     def check_urls(self, url_list):
2943         """Returns 1st active url from list"""
2944         for url in url_list:
2945             try:
2946                 compat_urllib_request.urlopen(url)
2947                 return url
2948             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2949                 url = None
2950
2951         return None
2952
2953     def _print_formats(self, formats):
2954         print('Available formats:')
2955         for fmt in formats.keys():
2956             for b in formats[fmt]:
2957                 try:
2958                     ext = formats[fmt][b][0]
2959                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2960                 except TypeError: # we have no bitrate info
2961                     ext = formats[fmt][0]
2962                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2963                     break
2964
2965     def _real_extract(self, url):
2966         mobj = re.match(self._VALID_URL, url)
2967         if mobj is None:
2968             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2969             return
2970         # extract uploader & filename from url
2971         uploader = mobj.group(1).decode('utf-8')
2972         file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2973
2974         # construct API request
2975         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2976         # retrieve .json file with links to files
2977         request = compat_urllib_request.Request(file_url)
2978         try:
2979             self.report_download_json(file_url)
2980             jsonData = compat_urllib_request.urlopen(request).read()
2981         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2982             self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2983             return
2984
2985         # parse JSON
2986         json_data = json.loads(jsonData)
2987         player_url = json_data['player_swf_url']
2988         formats = dict(json_data['audio_formats'])
2989
2990         req_format = self._downloader.params.get('format', None)
2991         bitrate = None
2992
2993         if self._downloader.params.get('listformats', None):
2994             self._print_formats(formats)
2995             return
2996
2997         if req_format is None or req_format == 'best':
2998             for format_param in formats.keys():
2999                 url_list = self.get_urls(formats, format_param)
3000                 # check urls
3001                 file_url = self.check_urls(url_list)
3002                 if file_url is not None:
3003                     break # got it!
3004         else:
3005             if req_format not in formats:
3006                 self._downloader.trouble(u'ERROR: format is not available')
3007                 return
3008
3009             url_list = self.get_urls(formats, req_format)
3010             file_url = self.check_urls(url_list)
3011             format_param = req_format
3012
3013         return [{
3014             'id': file_id.decode('utf-8'),
3015             'url': file_url.decode('utf-8'),
3016             'uploader': uploader.decode('utf-8'),
3017             'upload_date': None,
3018             'title': json_data['name'],
3019             'ext': file_url.split('.')[-1].decode('utf-8'),
3020             'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3021             'thumbnail': json_data['thumbnail_url'],
3022             'description': json_data['description'],
3023             'player_url': player_url.decode('utf-8'),
3024         }]
3025
3026 class StanfordOpenClassroomIE(InfoExtractor):
3027     """Information extractor for Stanford's Open ClassRoom"""
3028
3029     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3030     IE_NAME = u'stanfordoc'
3031
3032     def report_download_webpage(self, objid):
3033         """Report information extraction."""
3034         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3035
3036     def report_extraction(self, video_id):
3037         """Report information extraction."""
3038         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3039
3040     def _real_extract(self, url):
3041         mobj = re.match(self._VALID_URL, url)
3042         if mobj is None:
3043             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3044             return
3045
3046         if mobj.group('course') and mobj.group('video'): # A specific video
3047             course = mobj.group('course')
3048             video = mobj.group('video')
3049             info = {
3050                 'id': course + '_' + video,
3051                 'uploader': None,
3052                 'upload_date': None,
3053             }
3054
3055             self.report_extraction(info['id'])
3056             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3057             xmlUrl = baseUrl + video + '.xml'
3058             try:
3059                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3060             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3061                 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
3062                 return
3063             mdoc = xml.etree.ElementTree.fromstring(metaXml)
3064             try:
3065                 info['title'] = mdoc.findall('./title')[0].text
3066                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3067             except IndexError:
3068                 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3069                 return
3070             info['ext'] = info['url'].rpartition('.')[2]
3071             return [info]
3072         elif mobj.group('course'): # A course page
3073             course = mobj.group('course')
3074             info = {
3075                 'id': course,
3076                 'type': 'playlist',
3077                 'uploader': None,
3078                 'upload_date': None,
3079             }
3080
3081             self.report_download_webpage(info['id'])
3082             try:
3083                 coursepage = compat_urllib_request.urlopen(url).read()
3084             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3085                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3086                 return
3087
3088             m = re.search('<h1>([^<]+)</h1>', coursepage)
3089             if m:
3090                 info['title'] = unescapeHTML(m.group(1))
3091             else:
3092                 info['title'] = info['id']
3093
3094             m = re.search('<description>([^<]+)</description>', coursepage)
3095             if m:
3096                 info['description'] = unescapeHTML(m.group(1))
3097
3098             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3099             info['list'] = [
3100                 {
3101                     'type': 'reference',
3102                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3103                 }
3104                     for vpage in links]
3105             results = []
3106             for entry in info['list']:
3107                 assert entry['type'] == 'reference'
3108                 results += self.extract(entry['url'])
3109             return results
3110
3111         else: # Root page
3112             info = {
3113                 'id': 'Stanford OpenClassroom',
3114                 'type': 'playlist',
3115                 'uploader': None,
3116                 'upload_date': None,
3117             }
3118
3119             self.report_download_webpage(info['id'])
3120             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3121             try:
3122                 rootpage = compat_urllib_request.urlopen(rootURL).read()
3123             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3124                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3125                 return
3126
3127             info['title'] = info['id']
3128
3129             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3130             info['list'] = [
3131                 {
3132                     'type': 'reference',
3133                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3134                 }
3135                     for cpage in links]
3136
3137             results = []
3138             for entry in info['list']:
3139                 assert entry['type'] == 'reference'
3140                 results += self.extract(entry['url'])
3141             return results
3142
3143 class MTVIE(InfoExtractor):
3144     """Information extractor for MTV.com"""
3145
3146     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3147     IE_NAME = u'mtv'
3148
3149     def report_webpage(self, video_id):
3150         """Report information extraction."""
3151         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3152
3153     def report_extraction(self, video_id):
3154         """Report information extraction."""
3155         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3156
3157     def _real_extract(self, url):
3158         mobj = re.match(self._VALID_URL, url)
3159         if mobj is None:
3160             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3161             return
3162         if not mobj.group('proto'):
3163             url = 'http://' + url
3164         video_id = mobj.group('videoid')
3165         self.report_webpage(video_id)
3166
3167         request = compat_urllib_request.Request(url)
3168         try:
3169             webpage = compat_urllib_request.urlopen(request).read()
3170         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3171             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
3172             return
3173
3174         mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3175         if mobj is None:
3176             self._downloader.trouble(u'ERROR: unable to extract song name')
3177             return
3178         song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3179         mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3180         if mobj is None:
3181             self._downloader.trouble(u'ERROR: unable to extract performer')
3182             return
3183         performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3184         video_title = performer + ' - ' + song_name
3185
3186         mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3187         if mobj is None:
3188             self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3189             return
3190         mtvn_uri = mobj.group(1)
3191
3192         mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3193         if mobj is None:
3194             self._downloader.trouble(u'ERROR: unable to extract content id')
3195             return
3196         content_id = mobj.group(1)
3197
3198         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3199         self.report_extraction(video_id)
3200         request = compat_urllib_request.Request(videogen_url)
3201         try:
3202             metadataXml = compat_urllib_request.urlopen(request).read()
3203         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3204             self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3205             return
3206
3207         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3208         renditions = mdoc.findall('.//rendition')
3209
3210         # For now, always pick the highest quality.
3211         rendition = renditions[-1]
3212
3213         try:
3214             _,_,ext = rendition.attrib['type'].partition('/')
3215             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3216             video_url = rendition.find('./src').text
3217         except KeyError:
3218             self._downloader.trouble('Invalid rendition field.')
3219             return
3220
3221         info = {
3222             'id': video_id,
3223             'url': video_url,
3224             'uploader': performer,
3225             'upload_date': None,
3226             'title': video_title,
3227             'ext': ext,
3228             'format': format,
3229         }
3230
3231         return [info]
3232
3233
3234 class YoukuIE(InfoExtractor):
3235     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3236
3237     def report_download_webpage(self, file_id):
3238         """Report webpage download."""
3239         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3240
3241     def report_extraction(self, file_id):
3242         """Report information extraction."""
3243         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3244
3245     def _gen_sid(self):
3246         nowTime = int(time.time() * 1000)
3247         random1 = random.randint(1000,1998)
3248         random2 = random.randint(1000,9999)
3249
3250         return "%d%d%d" %(nowTime,random1,random2)
3251
3252     def _get_file_ID_mix_string(self, seed):
3253         mixed = []
3254         source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3255         seed = float(seed)
3256         for i in range(len(source)):
3257             seed  =  (seed * 211 + 30031 ) % 65536
3258             index  =  math.floor(seed / 65536 * len(source) )
3259             mixed.append(source[int(index)])
3260             source.remove(source[int(index)])
3261         #return ''.join(mixed)
3262         return mixed
3263
3264     def _get_file_id(self, fileId, seed):
3265         mixed = self._get_file_ID_mix_string(seed)
3266         ids = fileId.split('*')
3267         realId = []
3268         for ch in ids:
3269             if ch:
3270                 realId.append(mixed[int(ch)])
3271         return ''.join(realId)
3272
3273     def _real_extract(self, url):
3274         mobj = re.match(self._VALID_URL, url)
3275         if mobj is None:
3276             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3277             return
3278         video_id = mobj.group('ID')
3279
3280         info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3281
3282         request = compat_urllib_request.Request(info_url, None, std_headers)
3283         try:
3284             self.report_download_webpage(video_id)
3285             jsondata = compat_urllib_request.urlopen(request).read()
3286         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3287             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3288             return
3289
3290         self.report_extraction(video_id)
3291         try:
3292             jsonstr = jsondata.decode('utf-8')
3293             config = json.loads(jsonstr)
3294
3295             video_title =  config['data'][0]['title']
3296             seed = config['data'][0]['seed']
3297
3298             format = self._downloader.params.get('format', None)
3299             supported_format = list(config['data'][0]['streamfileids'].keys())
3300
3301             if format is None or format == 'best':
3302                 if 'hd2' in supported_format:
3303                     format = 'hd2'
3304                 else:
3305                     format = 'flv'
3306                 ext = u'flv'
3307             elif format == 'worst':
3308                 format = 'mp4'
3309                 ext = u'mp4'
3310             else:
3311                 format = 'flv'
3312                 ext = u'flv'
3313
3314
3315             fileid = config['data'][0]['streamfileids'][format]
3316             keys = [s['k'] for s in config['data'][0]['segs'][format]]
3317         except (UnicodeDecodeError, ValueError, KeyError):
3318             self._downloader.trouble(u'ERROR: unable to extract info section')
3319             return
3320
3321         files_info=[]
3322         sid = self._gen_sid()
3323         fileid = self._get_file_id(fileid, seed)
3324
3325         #column 8,9 of fileid represent the segment number
3326         #fileid[7:9] should be changed
3327         for index, key in enumerate(keys):
3328
3329             temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3330             download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3331
3332             info = {
3333                 'id': '%s_part%02d' % (video_id, index),
3334                 'url': download_url,
3335                 'uploader': None,
3336                 'upload_date': None,
3337                 'title': video_title,
3338                 'ext': ext,
3339             }
3340             files_info.append(info)
3341
3342         return files_info
3343
3344
3345 class XNXXIE(InfoExtractor):
3346     """Information extractor for xnxx.com"""
3347
3348     _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
3349     IE_NAME = u'xnxx'
3350     VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3351     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3352     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3353
3354     def report_webpage(self, video_id):
3355         """Report information extraction"""
3356         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3357
3358     def report_extraction(self, video_id):
3359         """Report information extraction"""
3360         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3361
3362     def _real_extract(self, url):
3363         mobj = re.match(self._VALID_URL, url)
3364         if mobj is None:
3365             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3366             return
3367         video_id = mobj.group(1)
3368
3369         self.report_webpage(video_id)
3370
3371         # Get webpage content
3372         try:
3373             webpage_bytes = compat_urllib_request.urlopen(url).read()
3374             webpage = webpage_bytes.decode('utf-8')
3375         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3376             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3377             return
3378
3379         result = re.search(self.VIDEO_URL_RE, webpage)
3380         if result is None:
3381             self._downloader.trouble(u'ERROR: unable to extract video url')
3382             return
3383         video_url = compat_urllib_parse.unquote(result.group(1))
3384
3385         result = re.search(self.VIDEO_TITLE_RE, webpage)
3386         if result is None:
3387             self._downloader.trouble(u'ERROR: unable to extract video title')
3388             return
3389         video_title = result.group(1)
3390
3391         result = re.search(self.VIDEO_THUMB_RE, webpage)
3392         if result is None:
3393             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3394             return
3395         video_thumbnail = result.group(1)
3396
3397         return [{
3398             'id': video_id,
3399             'url': video_url,
3400             'uploader': None,
3401             'upload_date': None,
3402             'title': video_title,
3403             'ext': 'flv',
3404             'thumbnail': video_thumbnail,
3405             'description': None,
3406         }]
3407
3408
3409 class GooglePlusIE(InfoExtractor):
3410     """Information extractor for plus.google.com."""
3411
3412     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3413     IE_NAME = u'plus.google'
3414
3415     def __init__(self, downloader=None):
3416         InfoExtractor.__init__(self, downloader)
3417
3418     def report_extract_entry(self, url):
3419         """Report downloading extry"""
3420         self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3421
3422     def report_date(self, upload_date):
3423         """Report downloading extry"""
3424         self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3425
3426     def report_uploader(self, uploader):
3427         """Report downloading extry"""
3428         self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3429
3430     def report_title(self, video_title):
3431         """Report downloading extry"""
3432         self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3433
3434     def report_extract_vid_page(self, video_page):
3435         """Report information extraction."""
3436         self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3437
3438     def _real_extract(self, url):
3439         # Extract id from URL
3440         mobj = re.match(self._VALID_URL, url)
3441         if mobj is None:
3442             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3443             return
3444
3445         post_url = mobj.group(0)
3446         video_id = mobj.group(1)
3447
3448         video_extension = 'flv'
3449
3450         # Step 1, Retrieve post webpage to extract further information
3451         self.report_extract_entry(post_url)
3452         request = compat_urllib_request.Request(post_url)
3453         try:
3454             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3455         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3456             self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3457             return
3458
3459         # Extract update date
3460         upload_date = None
3461         pattern = 'title="Timestamp">(.*?)</a>'
3462         mobj = re.search(pattern, webpage)
3463         if mobj:
3464             upload_date = mobj.group(1)
3465             # Convert timestring to a format suitable for filename
3466             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3467             upload_date = upload_date.strftime('%Y%m%d')
3468         self.report_date(upload_date)
3469
3470         # Extract uploader
3471         uploader = None
3472         pattern = r'rel\="author".*?>(.*?)</a>'
3473         mobj = re.search(pattern, webpage)
3474         if mobj:
3475             uploader = mobj.group(1)
3476         self.report_uploader(uploader)
3477
3478         # Extract title
3479         # Get the first line for title
3480         video_title = u'NA'
3481         pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3482         mobj = re.search(pattern, webpage)
3483         if mobj:
3484             video_title = mobj.group(1)
3485         self.report_title(video_title)
3486
3487         # Step 2, Stimulate clicking the image box to launch video
3488         pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3489         mobj = re.search(pattern, webpage)
3490         if mobj is None:
3491             self._downloader.trouble(u'ERROR: unable to extract video page URL')
3492
3493         video_page = mobj.group(1)
3494         request = compat_urllib_request.Request(video_page)
3495         try:
3496             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3497         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3498             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3499             return
3500         self.report_extract_vid_page(video_page)
3501
3502
3503         # Extract video links on video page
3504         """Extract video links of all sizes"""
3505         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3506         mobj = re.findall(pattern, webpage)
3507         if len(mobj) == 0:
3508             self._downloader.trouble(u'ERROR: unable to extract video links')
3509
3510         # Sort in resolution
3511         links = sorted(mobj)
3512
3513         # Choose the lowest of the sort, i.e. highest resolution
3514         video_url = links[-1]
3515         # Only get the url. The resolution part in the tuple has no use anymore
3516         video_url = video_url[-1]
3517         # Treat escaped \u0026 style hex
3518         try:
3519             video_url = video_url.decode("unicode_escape")
3520         except AttributeError: # Python 3
3521             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3522
3523
3524         return [{
3525             'id':       video_id,
3526             'url':      video_url,
3527             'uploader': uploader,
3528             'upload_date':  upload_date,
3529             'title':    video_title,
3530             'ext':      video_extension,
3531         }]
3532
3533 class NBAIE(InfoExtractor):
3534     _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3535     IE_NAME = u'nba'
3536
3537     def report_extraction(self, video_id):
3538         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3539
3540     def _real_extract(self, url):
3541         mobj = re.match(self._VALID_URL, url)
3542         if mobj is None:
3543             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3544             return
3545
3546         video_id = mobj.group(1)
3547         if video_id.endswith('/index.html'):
3548             video_id = video_id[:-len('/index.html')]
3549
3550         self.report_extraction(video_id)
3551         try:
3552             urlh = compat_urllib_request.urlopen(url)
3553             webpage_bytes = urlh.read()
3554             webpage = webpage_bytes.decode('utf-8', 'ignore')
3555         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3556             self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
3557             return
3558
3559         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3560         def _findProp(rexp, default=None):
3561             m = re.search(rexp, webpage)
3562             if m:
3563                 return unescapeHTML(m.group(1))
3564             else:
3565                 return default
3566
3567         shortened_video_id = video_id.rpartition('/')[2]
3568         title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3569         info = {
3570             'id': shortened_video_id,
3571             'url': video_url,
3572             'ext': 'mp4',
3573             'title': title,
3574             'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3575             'description': _findProp(r'<div class="description">(.*?)</h1>'),
3576         }
3577         return [info]
3578
3579 class JustinTVIE(InfoExtractor):
3580     """Information extractor for justin.tv and twitch.tv"""
3581     # TODO: One broadcast may be split into multiple videos. The key
3582     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3583     # starts at 1 and increases. Can we treat all parts as one video?
3584
3585     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3586         ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3587     _JUSTIN_PAGE_LIMIT = 100
3588     IE_NAME = u'justin.tv'
3589
3590     def report_extraction(self, file_id):
3591         """Report information extraction."""
3592         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3593
3594     def report_download_page(self, channel, offset):
3595         """Report attempt to download a single page of videos."""
3596         self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3597                 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3598
3599     # Return count of items, list of *valid* items
3600     def _parse_page(self, url):
3601         try:
3602             urlh = compat_urllib_request.urlopen(url)
3603             webpage_bytes = urlh.read()
3604             webpage = webpage_bytes.decode('utf-8', 'ignore')
3605         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3606             self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
3607             return
3608
3609         response = json.loads(webpage)
3610         info = []
3611         for clip in response:
3612             video_url = clip['video_file_url']
3613             if video_url:
3614                 video_extension = os.path.splitext(video_url)[1][1:]
3615                 video_date = re.sub('-', '', clip['created_on'][:10])
3616                 info.append({
3617                     'id': clip['id'],
3618                     'url': video_url,
3619                     'title': clip['title'],
3620                     'uploader': clip.get('user_id', clip.get('channel_id')),
3621                     'upload_date': video_date,
3622                     'ext': video_extension,
3623                 })
3624         return (len(response), info)
3625
3626     def _real_extract(self, url):
3627         mobj = re.match(self._VALID_URL, url)
3628         if mobj is None:
3629             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3630             return
3631
3632         api = 'http://api.justin.tv'
3633         video_id = mobj.group(mobj.lastindex)
3634         paged = False
3635         if mobj.lastindex == 1:
3636             paged = True
3637             api += '/channel/archives/%s.json'
3638         else:
3639             api += '/clip/show/%s.json'
3640         api = api % (video_id,)
3641
3642         self.report_extraction(video_id)
3643
3644         info = []
3645         offset = 0
3646         limit = self._JUSTIN_PAGE_LIMIT
3647         while True:
3648             if paged:
3649                 self.report_download_page(video_id, offset)
3650             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3651             page_count, page_info = self._parse_page(page_url)
3652             info.extend(page_info)
3653             if not paged or page_count != limit:
3654                 break
3655             offset += limit
3656         return info
3657
3658 class FunnyOrDieIE(InfoExtractor):
3659     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3660
3661     def report_extraction(self, video_id):
3662         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3663
3664     def _real_extract(self, url):
3665         mobj = re.match(self._VALID_URL, url)
3666         if mobj is None:
3667             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3668             return
3669
3670         video_id = mobj.group('id')
3671         self.report_extraction(video_id)
3672         try:
3673             urlh = compat_urllib_request.urlopen(url)
3674             webpage_bytes = urlh.read()
3675             webpage = webpage_bytes.decode('utf-8', 'ignore')
3676         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3677             self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
3678             return
3679
3680         m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3681         if not m:
3682             self._downloader.trouble(u'ERROR: unable to find video information')
3683         video_url = unescapeHTML(m.group('url'))
3684
3685         m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3686         if not m:
3687             self._downloader.trouble(u'Cannot find video title')
3688         title = unescapeHTML(m.group('title'))
3689
3690         m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3691         if m:
3692             desc = unescapeHTML(m.group('desc'))
3693         else:
3694             desc = None
3695
3696         info = {
3697             'id': video_id,
3698             'url': video_url,
3699             'ext': 'mp4',
3700             'title': title,
3701             'description': desc,
3702         }
3703         return [info]
3704
3705 class TweetReelIE(InfoExtractor):
3706     _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
3707
3708     def report_extraction(self, video_id):
3709         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3710
3711     def _real_extract(self, url):
3712         mobj = re.match(self._VALID_URL, url)
3713         if mobj is None:
3714             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3715             return
3716
3717         video_id = mobj.group('id')
3718         self.report_extraction(video_id)
3719         try:
3720             urlh = compat_urllib_request.urlopen(url)
3721             webpage_bytes = urlh.read()
3722             webpage = webpage_bytes.decode('utf-8', 'ignore')
3723         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3724             self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
3725             return
3726
3727         m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
3728         if not m:
3729             self._downloader.trouble(u'ERROR: Cannot find status ID')
3730         status_id = m.group(1)
3731
3732         m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
3733         if not m:
3734             self._downloader.trouble(u'WARNING: Cannot find description')
3735         desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
3736
3737         m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
3738         if not m:
3739             self._downloader.trouble(u'ERROR: Cannot find uploader')
3740         uploader = unescapeHTML(m.group('uploader'))
3741         uploader_id = unescapeHTML(m.group('uploader_id'))
3742
3743         m = re.search(r'<span unixtime="([0-9]+)"', webpage)
3744         if not m:
3745             self._downloader.trouble(u'ERROR: Cannot find upload date')
3746         upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')
3747
3748         title = desc
3749         video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
3750
3751         info = {
3752             'id': video_id,
3753             'url': video_url,
3754             'ext': 'mov',
3755             'title': title,
3756             'description': desc,
3757             'uploader': uploader,
3758             'uploader_id': uploader_id,
3759             'internal_id': status_id,
3760             'upload_date': upload_date
3761         }
3762         return [info]
3763
3764 class SteamIE(InfoExtractor):
3765     _VALID_URL = r"""http://store.steampowered.com/
3766                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3767                 (?P<gameID>\d+)/?
3768                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3769                 """
3770
3771     def suitable(self, url):
3772         """Receives a URL and returns True if suitable for this IE."""
3773         return re.match(self._VALID_URL, url, re.VERBOSE) is not None
3774
3775     def report_download_video_page(self, game_id):
3776         self._downloader.to_screen(u'[%s] %s: Downloading video page' % (self.IE_NAME, game_id))
3777
3778     def _real_extract(self, url):
3779         m = re.match(self._VALID_URL, url, re.VERBOSE)
3780         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3781         gameID = m.group('gameID')
3782         videourl = 'http://store.steampowered.com/video/%s/' % gameID
3783         try:
3784             self.report_download_video_page(gameID)
3785             urlh = compat_urllib_request.urlopen(videourl)
3786             webpage_bytes = urlh.read()
3787             webpage = webpage_bytes.decode('utf-8', 'ignore')
3788         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3789             self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
3790             return
3791         mweb = re.finditer(urlRE, webpage)
3792         namesRE = r'<span class=\"title\">(?P<videoName>[\w:/\.\?=\+\s-]+)</span>'
3793         titles = list(re.finditer(namesRE, webpage))
3794         videos = []
3795         i = 0
3796         for vid in mweb:
3797             video_id = vid.group('videoID')
3798             title = titles[i].group('videoName')
3799             video_url=vid.group('videoURL')
3800             if not video_url:
3801                 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3802             i += 1
3803             info = {
3804                 'id':video_id,
3805                 'url':video_url,
3806                 'ext': 'flv',
3807                 'title': title
3808                   }
3809             videos.append(info)
3810         return videos
3811
3812 class UstreamIE(InfoExtractor):
3813     _VALID_URL = r'http://www.ustream.tv/recorded/(?P<videoID>\d+)'
3814     IE_NAME = u'ustream'
3815
3816     def _real_extract(self, url):
3817         m = re.match(self._VALID_URL, url)
3818         video_id = m.group('videoID')
3819         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3820         webpage = self._download_webpage(url, video_id)
3821         m = re.search(r'data-title="(?P<title>.+)"',webpage)
3822         title = m.group('title')
3823         m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3824         uploader = m.group('uploader')
3825         info = {
3826                 'id':video_id,
3827                 'url':video_url,
3828                 'ext': 'flv',
3829                 'title': title,
3830                 'uploader': uploader
3831                   }
3832         return [info]
3833
3834
3835 def gen_extractors():
3836     """ Return a list of an instance of every supported extractor.
3837     The order does matter; the first extractor matched is the one handling the URL.
3838     """
3839     return [
3840         YoutubePlaylistIE(),
3841         YoutubeChannelIE(),
3842         YoutubeUserIE(),
3843         YoutubeSearchIE(),
3844         YoutubeIE(),
3845         MetacafeIE(),
3846         DailymotionIE(),
3847         GoogleSearchIE(),
3848         PhotobucketIE(),
3849         YahooIE(),
3850         YahooSearchIE(),
3851         DepositFilesIE(),
3852         FacebookIE(),
3853         BlipTVUserIE(),
3854         BlipTVIE(),
3855         VimeoIE(),
3856         MyVideoIE(),
3857         ComedyCentralIE(),
3858         EscapistIE(),
3859         CollegeHumorIE(),
3860         XVideosIE(),
3861         SoundcloudIE(),
3862         InfoQIE(),
3863         MixcloudIE(),
3864         StanfordOpenClassroomIE(),
3865         MTVIE(),
3866         YoukuIE(),
3867         XNXXIE(),
3868         GooglePlusIE(),
3869         ArteTvIE(),
3870         NBAIE(),
3871         JustinTVIE(),
3872         FunnyOrDieIE(),
3873         TweetReelIE(),
3874         SteamIE(),
3875         UstreamIE(),
3876         GenericIE()
3877     ]
3878
3879