Add --no-post-overwrites to README.md; minor style fixes
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import datetime
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import email.utils
13 import xml.etree.ElementTree
14 import random
15 import math
16
17 from .utils import *
18
19
class InfoExtractor(object):
    """Information Extractor base class.

    Given a URL, an information extractor produces a list of dictionaries
    describing the video(s) that URL refers to: the real media URL, the
    title, the uploader and so on. The FileDownloader consumes these
    dictionaries and may, among other outcomes, download the video to the
    file system.

    Each dictionary must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    uploader:       Full name of the video uploader, unescaped.
    upload_date:    Video upload date (YYYYMMDD).
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader_id:    Nickname or id of the video uploader.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses re-define the _real_initialize() and _real_extract()
    methods and define a _VALID_URL regexp; they should probably also be
    added to the list of extractors. _real_extract() must return a *list*
    of information dictionaries as described above.

    Finally, broken IEs set the _WORKING attribute to False in order to
    warn the users and skip the tests.
    """

    # Class-level defaults; _ready flips to True after initialize() runs.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Run the real initialization (authentication, etc.) exactly once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
103
104
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension ('flv' is assumed for any itag not listed)
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> dimensions string (used by --list-formats and the format field)
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with inline comments, so re.VERBOSE is required.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SRT file contents."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default duration when no dur= attribute is present
            start = float(start)
            end = start + float(dur)
            # SRT timestamp format: HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _extract_subtitles(self, video_id):
        """Download the closed captions for video_id.

        Returns a (warning, srt_contents) tuple; on success warning is
        None, on failure srt_contents is None.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)  # lang_code -> track name
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language priority: explicit --sub-lang option > English > first available
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if srt_lang not in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
        try:
            srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print each available format as 'itag : extension [dimensions]'."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language and, when credentials are available, log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':     '/',
                'action_login': 'Log In',
                'username': username,
                'password': password,
                }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the list of information dictionaries for a youtube.com URL."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL (group 2 of _VALID_URL captures the bare ID)
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the backslash-escaped URL found in the page's JS
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try the el= variants in turn until one yields a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (optional; only a warning when missing)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/user/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalize separators, then try each known date format
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                    break  # parsed successfully; don't re-parse the normalized value
                except ValueError:
                    # Try the next format; if none match, upload_date keeps
                    # the raw page text (pre-existing behavior).
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): assumes every stream dict also carries a 'sig'
            # entry; an entry without one raises KeyError here — confirm
            # against the current get_video_info response format.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        # Build one result dictionary per selected format
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
519
520
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and post the form that disables the family filter."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the information dictionary for a metacafe.com URL."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube; if so, hand the URL back to the downloader
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information.
        # Decode to str right away: the regexes below use str patterns, and
        # str results must not be .decode()d again (Python 3).
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        # All values are already str (the webpage was decoded above), so no
        # further .decode() calls — str has no decode() on Python 3.
        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
646
647
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract id, media URL, title, uploader and date from a Dailymotion page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The id is the path component before the "_title" slug or query string
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted videos stay reachable
        request.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best available quality, in descending order of preference
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        # Undo JSON-style escaped slashes ("\/") in the extracted URL
        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # looking for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        # Page shows DD-MM-YYYY; normalize to the YYYYMMDD convention
        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
744
745
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL, title and uploader from a Photobucket page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # Decode once so the text regexes below work on Python 3 too,
            # where urlopen().read() returns bytes (same pattern as the
            # Dailymotion and Vimeo extractors in this file)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # Regex groups on a decoded page are already text; the previous
        # .decode('utf-8') calls failed on Python 3 str objects
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
809
810
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information; recurses at most once.

        Non-/watch/ URLs are first resolved to the canonical
        /watch/<vid>/<id> form and re-extracted with new_video=False.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                # Decode once so the str regexes below also work on Python 3
                webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # group(1) is only the 'people'/'profile' path alternation;
        # the uploader name itself is captured by group(2)
        video_uploader = mobj.group(2)

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1)

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1)
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2))
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
952
953
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from the config JSON embedded in the page."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page markup
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: the config marker was not found in the page;
            # ValueError: the extracted text was not valid JSON.
            # (Previously a bare except, which also swallowed
            # KeyboardInterrupt/SystemExit.)
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date (page shows ISO 8601; normalize to YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1068
1069
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages end in e.g. "index-3.html"
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return its body, or None after reporting the error."""
        self._downloader.increment_downloads()
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, search it with regex, and collect groups into a dict.

        matchTuples is a list of (group_index, key, error_message) triples;
        each matched group is stored under its key.  Returns None (after
        reporting via the downloader) when the pattern or a group is missing.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Locate the RTMP parameters of a live-stream page.

        NOTE(review): video_url is assembled below but never returned, so
        callers receive None and no download happens for live streams —
        confirm whether live streams are intentionally unsupported.
        """
        # The language code ('fr'/'de') sits four path segments from the end
        video_lang = url.split('/')[-4]
        # First hop: find the videothek JS file referenced by the page
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # Second hop: pull stream path, SWF player and RTMP url out of the JS
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Extract a regular (Arte+7) stream by following the videoref chain."""
        # The language code ('fr'/'de') sits three path segments from the end
        video_lang = url.split('/')[-3]
        # First hop: the flash player's videorefFileUrl parameter
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Second hop: pick the <video> reference matching the page language
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Final hop: the video metadata document with the HD stream URL
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            # NOTE(review): .decode assumes a Python 2 byte string here;
            # on Python 3 str has no .decode — confirm against the compat layer
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live streams take a separate path (and currently yield no info dict,
        # see extractLiveStream above)
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1205
1206
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report redirect following."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so only headers are fetched
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop entity headers: a HEAD request has no body
                    newheaders = dict((k, v) for k, v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k, v) for k, v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL back means no redirect: continue with normal extraction
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        # Restart the whole extraction chain with the resolved URL
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        """Best-effort extraction: scan the raw page for a direct media URL."""
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # NOTE(review): webpage stays as bytes here; on Python 3 the
            # str regexes below would need a decoded page, but the site
            # encoding is unknown for arbitrary URLs — confirm before changing
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1351
1352
1353 class YoutubeSearchIE(InfoExtractor):
1354     """Information Extractor for YouTube search queries."""
1355     _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1356     _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1357     _max_youtube_results = 1000
1358     IE_NAME = u'youtube:search'
1359
    def __init__(self, downloader=None):
        # Plain delegation to the InfoExtractor base constructor.
        InfoExtractor.__init__(self, downloader)
1362
1363     def report_download_page(self, query, pagenum):
1364         """Report attempt to download search page with given number."""
1365         query = query.decode(preferredencoding())
1366         self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1367
1368     def _real_extract(self, query):
1369         mobj = re.match(self._VALID_URL, query)
1370         if mobj is None:
1371             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1372             return
1373
1374         prefix, query = query.split(':')
1375         prefix = prefix[8:]
1376         query = query.encode('utf-8')
1377         if prefix == '':
1378             self._download_n_results(query, 1)
1379             return
1380         elif prefix == 'all':
1381             self._download_n_results(query, self._max_youtube_results)
1382             return
1383         else:
1384             try:
1385                 n = int(prefix)
1386                 if n <= 0:
1387                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1388                     return
1389                 elif n > self._max_youtube_results:
1390                     self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1391                     n = self._max_youtube_results
1392                 self._download_n_results(query, n)
1393                 return
1394             except ValueError: # parsing prefix as integer fails
1395                 self._download_n_results(query, 1)
1396                 return
1397
1398     def _download_n_results(self, query, n):
1399         """Downloads a specified number of results for a query"""
1400
1401         video_ids = []
1402         pagenum = 0
1403         limit = n
1404
1405         while (50 * pagenum) < limit:
1406             self.report_download_page(query, pagenum+1)
1407             result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1408             request = compat_urllib_request.Request(result_url)
1409             try:
1410                 data = compat_urllib_request.urlopen(request).read()
1411             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1412                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1413                 return
1414             api_response = json.loads(data)['data']
1415
1416             new_ids = list(video['id'] for video in api_response['items'])
1417             video_ids += new_ids
1418
1419             limit = min(n, api_response['totalItems'])
1420             pagenum += 1
1421
1422         if len(video_ids) > n:
1423             video_ids = video_ids[:n]
1424         for id in video_ids:
1425             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1426         return
1427
1428
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries (gvsearchN:terms)."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearchN: prefix and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # maxsplit=1 keeps colons inside the search terms intact; a plain
        # split() crashed with "too many values to unpack" on such queries.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            # Narrow try: only int() may legitimately raise ValueError here,
            # so errors from _download_n_results are no longer swallowed.
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            elif n > self._max_google_results:
                self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            self._download_n_results(query, n)
            return

    def _download_queue(self, video_ids):
        """Hand every collected docid to the downloader."""
        for video_id in video_ids:
            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % video_id])

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers (page order, no duplicates)
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        self._download_queue(video_ids)
                        return

            # No "next page" link: we have collected everything available.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                self._download_queue(video_ids)
                return

            pagenum = pagenum + 1
1509
1510
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries (yvsearchN:terms)."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearchN: prefix and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # maxsplit=1 keeps colons inside the search terms intact; a plain
        # split() crashed with "too many values to unpack" on such queries.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            # Narrow try: only int() may legitimately raise ValueError here,
            # so errors from _download_n_results are no longer swallowed.
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            elif n > self._max_yahoo_results:
                self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
            self._download_n_results(query, n)
            return

    def _download_queue(self, video_ids):
        """Hand every collected watch id to the downloader."""
        for video_id in video_ids:
            self._downloader.download(['http://video.yahoo.com/watch/%s' % video_id])

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers (page order, set gives O(1) dedup)
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        self._download_queue(video_ids)
                        return

            # No more result pages available.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                self._download_queue(video_ids)
                return

            pagenum = pagenum + 1
1595
1596
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of a playlist and queue each for download."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # A direct single-video reference inside the URL: delegate it as-is.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # 'p' is the default playlist prefix; artist playlists ('a') go
        # through a different access point. Anything else falls back to 'p'.
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)

        video_ids = []
        pagenum = 0

        while True:
            pagenum += 1
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Ids from this page, in order of appearance, without repeats.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if match.group(1) not in page_ids:
                    page_ids.append(match.group(1))
            video_ids.extend(page_ids)

            # The "next" marker vanishes on the last page.
            if self._MORE_PAGES_INDICATOR not in page:
                break

        total = len(video_ids)

        # Honour --playlist-start / --playlist-end (1-based, -1 = open end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1675
1676
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Walk a channel's paginated video list and queue every video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 0

        while True:
            pagenum += 1
            self.report_download_page(channel_id, pagenum)
            page_url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # This page's ids, in order of appearance, without repeats.
            page_ids = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if match.group(1) not in page_ids:
                    page_ids.append(match.group(1))
            video_ids.extend(page_ids)

            # The "next" marker disappears once the last page is reached.
            if self._MORE_PAGES_INDICATOR not in page:
                break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1727
1728
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect a user's uploads via the GData API and queue each video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps each response (currently at 50 entries), so we
        # request consecutive pages until one comes back short of the cap.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # De-duplicated ids from this page, in order of appearance.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            # A page shorter than the cap must be the final one; further
            # queries would only return empty results.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Honour --playlist-start / --playlist-end (1-based, -1 = open end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1810
1811
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Resolve the user's numeric id, then page through their videos."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # The numeric user id is embedded in the profile page and is needed
        # to query the Ajax episode-list endpoint. The old code dereferenced
        # mobj.group(1) unchecked and crashed with an AttributeError when
        # the attribute was missing from the page.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract blip.tv user ID')
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request(page_base + "&page=" + str(pagenum))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # Use compat_str like every other extractor (was plain str).
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers. Unescape before the membership
            # test: the old code checked the raw match but stored the
            # unescaped form, so the duplicate check compared mixed forms.
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A page shorter than _PAGE_SIZE must be the last one; stop
            # querying instead of fetching a guaranteed-empty next page.
            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Honour --playlist-start / --playlist-end (1-based, -1 = open end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1902
1903
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Simulate pressing 'Free download' and scrape the real file URL."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = {'gateway_result': '1'}
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Raw string for the regex: '\s' in a plain literal is an
                # invalid escape sequence on newer Python versions.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
1966
1967
1968 class FacebookIE(InfoExtractor):
1969     """Information Extractor for Facebook"""
1970
1971     _WORKING = False
1972     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1973     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1974     _NETRC_MACHINE = 'facebook'
1975     _available_formats = ['video', 'highqual', 'lowqual']
1976     _video_extensions = {
1977         'video': 'mp4',
1978         'highqual': 'mp4',
1979         'lowqual': 'mp4',
1980     }
1981     IE_NAME = u'facebook'
1982
    def __init__(self, downloader=None):
        """Store the FileDownloader that receives the extracted info."""
        InfoExtractor.__init__(self, downloader)
1985
    def _reporter(self, message):
        """Prefix *message* with the [facebook] header and print it via the downloader."""
        self._downloader.to_screen(u'[facebook] %s' % message)
1989
    def report_login(self):
        """Report that a Facebook login attempt is starting."""
        self._reporter(u'Logging in')
1993
    def report_video_webpage_download(self, video_id):
        """Report that the webpage for *video_id* is being downloaded."""
        self._reporter(u'%s: Downloading video webpage' % video_id)
1997
    def report_information_extraction(self, video_id):
        """Report that information extraction for *video_id* has begun."""
        self._reporter(u'%s: Extracting video information' % video_id)
2001
2002     def _parse_page(self, video_webpage):
2003         """Extract video information from page"""
2004         # General data
2005         data = {'title': r'\("video_title", "(.*?)"\)',
2006             'description': r'<div class="datawrap">(.*?)</div>',
2007             'owner': r'\("video_owner_name", "(.*?)"\)',
2008             'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2009             }
2010         video_info = {}
2011         for piece in data.keys():
2012             mobj = re.search(data[piece], video_webpage)
2013             if mobj is not None:
2014                 video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2015
2016         # Video urls
2017         video_urls = {}
2018         for fmt in self._available_formats:
2019             mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2020             if mobj is not None:
2021                 # URL is in a Javascript segment inside an escaped Unicode format within
2022                 # the generally utf-8 page
2023                 video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2024         video_info['video_urls'] = video_urls
2025
2026         return video_info
2027
    def _real_initialize(self):
        """Log the user in to Facebook before extraction, if credentials exist.

        Credentials come either from the downloader's 'username'/'password'
        params or, when 'usenetrc' is set, from the .netrc entry for the
        _NETRC_MACHINE host. Without credentials this is a silent no-op.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    # A missing entry is deliberately raised so it shares
                    # the parse-error warning path below.
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # No credentials available: proceed without logging in.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # If the response still contains a login <form>, the attempt
            # failed; warn but continue (extraction may work anonymously).
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return
2071
2072     def _real_extract(self, url):
2073         mobj = re.match(self._VALID_URL, url)
2074         if mobj is None:
2075             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2076             return
2077         video_id = mobj.group('ID')
2078
2079         # Get video webpage
2080         self.report_video_webpage_download(video_id)
2081         request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2082         try:
2083             page = compat_urllib_request.urlopen(request)
2084             video_webpage = page.read()
2085         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2086             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2087             return
2088
2089         # Start extracting information
2090         self.report_information_extraction(video_id)
2091
2092         # Extract information
2093         video_info = self._parse_page(video_webpage)
2094
2095         # uploader
2096         if 'owner' not in video_info:
2097             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2098             return
2099         video_uploader = video_info['owner']
2100
2101         # title
2102         if 'title' not in video_info:
2103             self._downloader.trouble(u'ERROR: unable to extract video title')
2104             return
2105         video_title = video_info['title']
2106         video_title = video_title.decode('utf-8')
2107
2108         # thumbnail image
2109         if 'thumbnail' not in video_info:
2110             self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2111             video_thumbnail = ''
2112         else:
2113             video_thumbnail = video_info['thumbnail']
2114
2115         # upload date
2116         upload_date = None
2117         if 'upload_date' in video_info:
2118             upload_time = video_info['upload_date']
2119             timetuple = email.utils.parsedate_tz(upload_time)
2120             if timetuple is not None:
2121                 try:
2122                     upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2123                 except:
2124                     pass
2125
2126         # description
2127         video_description = video_info.get('description', 'No description available.')
2128
2129         url_map = video_info['video_urls']
2130         if url_map:
2131             # Decide which formats to download
2132             req_format = self._downloader.params.get('format', None)
2133             format_limit = self._downloader.params.get('format_limit', None)
2134
2135             if format_limit is not None and format_limit in self._available_formats:
2136                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2137             else:
2138                 format_list = self._available_formats
2139             existing_formats = [x for x in format_list if x in url_map]
2140             if len(existing_formats) == 0:
2141                 self._downloader.trouble(u'ERROR: no known formats available for video')
2142                 return
2143             if req_format is None:
2144                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2145             elif req_format == 'worst':
2146                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2147             elif req_format == '-1':
2148                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2149             else:
2150                 # Specific format
2151                 if req_format not in url_map:
2152                     self._downloader.trouble(u'ERROR: requested format not available')
2153                     return
2154                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2155
2156         results = []
2157         for format_param, video_real_url in video_url_list:
2158             # Extension
2159             video_extension = self._video_extensions.get(format_param, 'mp4')
2160
2161             results.append({
2162                 'id':       video_id.decode('utf-8'),
2163                 'url':      video_real_url.decode('utf-8'),
2164                 'uploader': video_uploader.decode('utf-8'),
2165                 'upload_date':  upload_date,
2166                 'title':    video_title,
2167                 'ext':      video_extension.decode('utf-8'),
2168                 'format':   (format_param is None and u'NA' or format_param.decode('utf-8')),
2169                 'thumbnail':    video_thumbnail.decode('utf-8'),
2170                 'description':  video_description.decode('utf-8'),
2171             })
2172         return results
2173
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv

    Fetches the site's JSON API (skin=json) for metadata; if the URL
    turns out to serve the media file directly (video/* Content-Type),
    the open response handle is passed along via 'urlhandle' instead.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Extracts the filename extension from a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON API query with the correct separator.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                # The URL serves the media itself; derive id/title/ext
                # from the URL's basename and hand over the open handle.
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
            return
        if info is None: # Regular URL
            try:
                # urlh was opened above and is not a direct download,
                # so its body is the JSON metadata.
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # 'datestamp' format, e.g. "12-31-12 11:05PM" -> YYYYMMDD.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # blip.tv serves different content to iTunes clients;
        # presumably this UA is needed for the media download — verify.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
2263
2264
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the FLV URL and title from a myvideo.de watch page.

        Returns a single-entry list of info dictionaries, or None after
        reporting an error through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Bugfix: this previously called self._download.trouble
            # (nonexistent attribute), which raised AttributeError
            # instead of reporting the invalid URL.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        # The thumbnail link exposes the media base URL; the FLV lives
        # next to the thumbnails directory.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2322
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""
    IE_NAME = u'comedycentral'

    # Known bitrates, highest quality first.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Bitrate -> container extension.
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Bitrate -> display resolution (for --list-formats output).
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so re.VERBOSE is required.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)


    def _print_formats(self, formats):
        # Print a bitrate/extension/resolution table for --list-formats.
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Expand :tds / :colbert style shortcuts to the full-episodes
        # index URL of the respective show.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # A bare full-episodes URL means "download the newest episode":
            # the site redirects it to the latest one.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Follow the redirect to find out which episode we landed on.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # Find the mtvnservices Flash URL embedded in the page.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        # Resolve the player URL (follows redirects to the final SWF).
        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
            return

        # Fetch the MRSS index listing the episode's video segments.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        # One <item> per video segment; each needs its own config fetch.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, url) pairs from the segment config.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, video_url = f, v
                    break

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': None #playerUrl
            }

            results.append(info)

        return results
2533
2534
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist

    Scrapes the video page's Open Graph meta tags for description,
    thumbnail and player URL, then fetches the player's configuration
    to obtain the actual media URL.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Decode using the charset from the Content-Type header,
            # defaulting to utf-8 when none is declared.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # NOTE(review): each of these matches is assumed to succeed; a
        # page layout change would raise AttributeError on .group(1).
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries its config URL in a 'config=' query param.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        # Index 1 is assumed to be the actual video entry in the
        # playlist — TODO confirm against a live config response.
        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2608
2609
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com

    Two-stage extraction: the moogaloop XML provides metadata and a
    manifest URL; the Adobe f4m manifest then yields the media node
    used to build the final fragment URL.
    """

    # Marked broken; skipped unless the user opts in.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Partial info dict; filled in from the metadata XML below.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            # Any missing element means the XML layout changed.
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # The manifest is Adobe HDS (f4m); elements are namespaced.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Build the fragment URL from the manifest host and media node.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2680
2681
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Announce the webpage download step."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Announce the information extraction step."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape media URL, title and thumbnail from an xvideos page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        self.report_webpage(video_id)

        # Fetch the watch page, tolerating invalid UTF-8 sequences.
        req = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
        try:
            raw_page = compat_urllib_request.urlopen(req).read()
            webpage = raw_page.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # Media URL is URL-encoded inside the page's flash parameters.
        match = re.search(r'flv_url=(.+?)&', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title comes from the <title> tag, minus the site suffix.
        match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        # Thumbnail: the whole matched image URL is used.
        match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2751
2752
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com.

    The track page URL is first resolved through the public SoundCloud
    API (resolve.json) to obtain the track's numeric id; a second API
    call then lists the available streams, from which the 128 kbit/s
    MP3 stream is returned.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Announce the id-resolution step."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Announce the stream-retrieval step."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The uploader and a slug of the song title are both part of the URL.
        uploader = match.group(1)
        slug_title = match.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the page URL into the track's numeric id via the API.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            info_json_bytes = compat_urllib_request.urlopen(compat_urllib_request.Request(resolv_url)).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Ask the CDN which streams exist for this track.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            stream_json_bytes = compat_urllib_request.urlopen(compat_urllib_request.Request(streams_url)).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        media_url = streams['http_mp3_128_url']

        return [{
            'id':       info['id'],
            'url':      media_url,
            'uploader': info['user']['username'],
            # NOTE(review): the API's created_at value is passed through
            # unchanged; it may not be in the YYYYMMDD form documented for
            # upload_date — confirm against the downloader.
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2825
2826
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com.

    The video location is stored base64-encoded in the page's
    ``jsclassref`` attribute; decoding it yields the path component of
    the rtmpe URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
    IE_NAME = u'infoq'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # Local import keeps the module header untouched; base64.b64decode
        # works on both Python 2 and 3, unlike the removed
        # str.decode('base64') codec this code previously relied on.
        import base64

        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        self.report_webpage(url)

        request = compat_urllib_request.Request(url)
        try:
            # Decode the page so the regexes below operate on text under
            # Python 3 too (read() returns bytes there).
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(url)

        # Extract video URL (base64-encoded in the jsclassref attribute).
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title (match groups are already text after the decode above).
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description; fall back to a placeholder when absent.
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # The URL's last path component is "<id>.<ext>".
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2895
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Return the URL list for *fmt* from the 'audio_formats' JSON.

        When bitrate info exists, 'best' (or an unknown bitrate) selects
        the highest available one; otherwise the format's flat URL list
        is returned as-is.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Return the first URL in *url_list* that answers, else None."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                pass

        return None

    def _print_formats(self, formats):
        """Print one 'format  bitrate  [ext]' line per available stream."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url (match groups are already
        # text; the former .decode('utf-8') calls broke on Python 3)
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (decode the raw bytes first for Python 3)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        file_url = None
        if req_format is None or req_format == 'best':
            # Try every advertised format and keep the first live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        if file_url is None:
            # Previously the stale API URL (or None) leaked into the result.
            self._downloader.trouble(u'ERROR: format is not available')
            return

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': format_param if format_param is not None else u'NA',
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
3010
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three URL shapes:
      * a specific video page -> returns that single video,
      * a course page         -> recursively extracts each video page,
      * the site root         -> recursively extracts each course page.
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            # ElementTree accepts the raw bytes directly.
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                # Decode the page so the regexes below run on text under
                # Python 3 (read() returns bytes there).
                coursepage_bytes = compat_urllib_request.urlopen(url).read()
                coursepage = coursepage_bytes.decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall(r'<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            # Recurse into each video page and flatten the results.
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results

        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                # Decode for the same Python 3 reason as above.
                rootpage_bytes = compat_urllib_request.urlopen(rootURL).read()
                rootpage = rootpage_bytes.decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            links = orderedSet(re.findall(r'<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            # Recurse into each course page and flatten the results.
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3127
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com.

    Reads the mtv_vt / mtv_an / mtvn_uri meta tags plus the default
    playlist id from the video page, then fetches the mediaGen XML to
    obtain the actual stream URL.  The highest-quality rendition is
    always chosen.
    """

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = compat_urllib_request.Request(url)
        try:
            # Decode the page up front so the regexes below work on text
            # under Python 3; iso-8859-1 is the charset the old per-group
            # decodes assumed, and it maps every byte losslessly.
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('iso-8859-1')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # Error message previously read 'unable to mtvn_uri'.
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _, _, ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            # Now u'ERROR: '-prefixed like every other failure message here.
            self._downloader.trouble(u'ERROR: Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3217
3218
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves each video as several flv/mp4 segments.  The playlist
    JSON provides a scrambled file id plus per-segment keys; the file id
    is unscrambled with a seeded shuffle of a fixed alphabet, and one
    download URL is emitted per segment.
    """

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'Youku'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    def _gen_sid(self):
        """Build a pseudo-random session id from the clock and two randints."""
        now_ms = int(time.time() * 1000)
        r1 = random.randint(1000, 1998)
        r2 = random.randint(1000, 9999)
        return "%d%d%d" % (now_ms, r1, r2)

    def _get_file_ID_mix_string(self, seed):
        """Return the seed-dependent permutation of the id alphabet.

        A small linear-congruential generator repeatedly picks (and
        removes) one character from the alphabet until it is exhausted.
        """
        alphabet = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        shuffled = []
        state = float(seed)
        while alphabet:
            state = (state * 211 + 30031) % 65536
            pick = int(math.floor(state / 65536 * len(alphabet)))
            shuffled.append(alphabet.pop(pick))
        return shuffled

    def _get_file_id(self, fileId, seed):
        """Unscramble a '*'-separated index list into the real file id."""
        mixed = self._get_file_ID_mix_string(seed)
        return ''.join(mixed[int(part)] for part in fileId.split('*') if part)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata.decode('utf-8'))
            entry = config['data'][0]

            video_title = entry['title']
            seed = entry['seed']

            requested = self._downloader.params.get('format', None)
            available = list(entry['streamfileids'].keys())

            if requested is None or requested == 'best':
                stream_format = 'hd2' if 'hd2' in available else 'flv'
                ext = u'flv'
            elif requested == 'worst':
                stream_format = 'mp4'
                ext = u'mp4'
            else:
                stream_format = 'flv'
                ext = u'flv'

            fileid = entry['streamfileids'][stream_format]
            keys = [seg['k'] for seg in entry['segs'][stream_format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters 8 and 9 of the file id carry the segment number, so
        # they are replaced per segment below.
        files_info = []
        for index, key in enumerate(keys):
            segment_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, segment_fileid, key)

            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
3333
3334
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Fetch the video page and pull out the URL, title and thumbnail."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Download and decode the page content.
        try:
            page_bytes = compat_urllib_request.urlopen(url).read()
            webpage = page_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        url_match = re.search(self.VIDEO_URL_RE, webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        title_match = re.search(self.VIDEO_TITLE_RE, webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        thumb_match = re.search(self.VIDEO_THUMB_RE, webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3397
3398
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com.

    Extracts metadata from the post page, then follows the embedded
    photos URL (the page shown when clicking the video box) to find the
    actual googlevideo stream links, picking the highest resolution.
    """

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry."""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the entry's date."""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the entry's uploader."""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the entry's title."""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date (optional)
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader (optional)
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title: the first line of the description serves as title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # The missing return here used to cause an AttributeError on
            # the mobj.group(1) call below.
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract the video links of all sizes from the video page
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # The missing return here used to cause an IndexError on
            # links[-1] below.
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3522
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the video info dictionary from an nba.com video page.

        Returns a single-element list with the info dict, or None after
        reporting the problem through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        # Normalize '/video/foo/index.html' to '/video/foo'
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        self.report_extraction(video_id)
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # The CDN URL is derived directly from the page path, not scraped
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # First capture group of rexp in the page (HTML-unescaped), or default
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUGFIX: key was misspelled 'uploader_date'; the downloader
            # contract (see InfoExtractor docstring) expects 'upload_date'
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3568
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one API page and return (total item count, info dicts).

        Clips without a 'video_file_url' are skipped, so the returned list
        may be shorter than the count.  On download failure the error is
        reported and (0, []) is returned.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            # BUGFIX: previously returned None, which made the caller's
            # tuple unpacking raise TypeError and mask the real error
            return (0, [])

        response = json.loads(webpage)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # 'created_on' starts with YYYY-MM-DD; strip dashes -> YYYYMMDD
                video_date = re.sub('-', '', clip['created_on'][:10])
                info.append({
                    'id': clip['id'],
                    'url': video_url,
                    'title': clip['title'],
                    'uploader': clip.get('user_id', clip.get('channel_id')),
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Extract info dicts for a clip or (paged) channel archive URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        # Group 1 only => channel URL: page through the whole archive
        if mobj.lastindex == 1:
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/clip/show/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we have reached the end of the archive
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3647
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
    IE_NAME = u'FunnyOrDie'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the video info dictionary from a funnyordie.com page.

        Returns a single-element list with the info dict, or None after
        reporting the problem through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        self.report_extraction(video_id)
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: unable to find video information')
            # BUGFIX: return after trouble (consistent with other IEs);
            # falling through crashed on m.group() with m == None
            return
        # BUGFIX: removed leftover debug print(video_url) that polluted stdout
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            self._downloader.trouble(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title'))

        # Description is optional; fall back to None if the meta tag is absent
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]