8tracks: Ignore hashes
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18
19 from .utils import *
20
21
class InfoExtractor(object):
    """Base class for all information extractors.

    Given a URL, an information extractor produces a list of dictionaries
    describing the video(s) behind it; those dictionaries are handed to the
    FileDownloader, which may then fetch the media.

    Mandatory dictionary fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional fields:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses must define a _VALID_URL regexp and override
    _real_initialize() and _real_extract(); _real_extract() must return a
    *list* of information dictionaries as described above.  Broken IEs
    should set _WORKING to False to warn users and skip the tests.
    """

    # Class-level defaults; __init__ shadows _ready/_downloader per instance.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Create the extractor; a downloader may be attached now or later."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Return True if this IE can handle the given URL."""
        return re.match(self._VALID_URL, url) is not None

    def working(self):
        """Return whether this IE is believed to be functional."""
        return self._WORKING

    def initialize(self):
        """Run one-time setup (authentication, etc.) at most once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then return the list of info dictionaries."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the FileDownloader this IE reports progress to."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # By convention the class is named "<Name>IE"; strip the suffix.
        return self.__class__.__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # Re-raise as a uniform error, preserving the original traceback.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        handle = self._request_webpage(url_or_request, video_id, note, errnote)
        return handle.read().decode('utf-8', 'replace')
128
129
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HxW" dimensions (for --get-format / --list-formats output)
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the base implementation
        # (which compiles without flags) cannot be reused here.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption duration when none is given
            start = float(start)
            end = start + float(dur)
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _extract_subtitles(self, video_id):
        """Fetch closed captions for video_id.

        Returns a (warning, srt) pair: on success warning is None and srt
        holds the subtitle text; on failure srt is None and warning holds a
        printable message.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language preference: user-requested > English > first available.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if srt_lang not in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
        try:
            srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print the given itags with extension and dimensions (--list-formats)."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, optionally log in, and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':     '/',
                'action_login': 'Log In',
                'username': username,
                'password': password,
                }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The login form being served back means the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID from a URL matching _VALID_URL (None on failure)."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                    # Parsed successfully; stop so the YYYYMMDD result is not
                    # fed back into strptime on the next iteration.
                    break
                except ValueError:
                    # This date format did not match; try the next one.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = {}
            for ud in url_data:
                stream_url = ud['url'][0]
                # Not every stream carries a signature; only append it when
                # present (an unconditional ud['sig'][0] raises KeyError).
                if 'sig' in ud:
                    stream_url += '&signature=' + ud['sig'][0]
                url_map[ud['itag'][0]] = stream_url

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
547
548
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and confirm age to bypass the family filter."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            # Decode immediately: the regexes below use str patterns, which
            # would raise TypeError against a bytes haystack on Python 3.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob when mediaURL is absent.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # All values below are already str; the old .decode('utf-8') calls
        # were Python 2 leftovers and fail on Python 3 (str has no decode).
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
674
675
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Pull the video id out of the URL (drop title slug and query string)
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page with the family filter disabled so that
        # age-restricted videos are served as well.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)

        # The media URLs live in a JS "flashvars" assignment.
        flashvars_match = re.search(r'\s*var flashvars = (.*)', webpage)
        if flashvars_match is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(flashvars_match.group(1))

        # Pick the best available quality, highest first.
        max_quality = None
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        if max_quality is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        url_match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(url_match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        title_match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(title_match.group('title'))

        # Regular users and "official" users use different markup.
        video_uploader = None
        owner_match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_match is not None:
            video_uploader = owner_match.group(1)
        else:
            # lookin for official user
            official_match = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_match is not None:
                video_uploader = official_match.group(1)
            else:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # Page shows DD-MM-YYYY; the output convention is YYYYMMDD.
        video_upload_date = None
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is not None:
            video_upload_date = date_match.group(3) + date_match.group(2) + date_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
762
763
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract media URL, title and uploader from a photobucket page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # Decode immediately: urlopen().read() returns bytes on
            # Python 3, and the str regexes below would fail on it.
            # (Previously str results were also .decode()d, which only
            # worked on Python 2.)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        # Title and uploader come from the <title> tag together.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
827
828
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info.

        Non-'/watch/' URLs are rewritten to their canonical '/watch/'
        form first and this method recurses once on the rewritten URL.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                # Decode: urlopen().read() returns bytes on Python 3.
                webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # group(1) is the 'people|profile' path segment; the uploader
        # name is in group(2).
        video_uploader = mobj.group(2)

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1)

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1)
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (the playlist request below
        # requires them)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2))
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
970
971
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from the page's embedded config JSON."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page's JavaScript.
        # Only catch the errors this parsing can actually raise; a bare
        # except here used to swallow KeyboardInterrupt/SystemExit too.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date (page shows an ISO date; output is YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1086
1087
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return its content.

        On network errors or an invalid URL this reports trouble and
        returns None implicitly; callers do not check for that, so
        grep_webpage would then pass None to re.search.
        """
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url and match regex against it.

        matchTuples is a list of (group_index, key, error_message); each
        named group index is stored under key in the returned dict.  If
        the regex does not match, or any listed group is empty, trouble
        is reported and None is returned implicitly.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream JS indirection to locate the rtmp URL.

        NOTE(review): video_url is computed but never returned or stored,
        and _real_extract discards this method's result — live streams
        appear to be effectively unsupported. Confirm before relying on it.
        """
        # Language code is embedded in the URL path (e.g. /fr/ or /de/).
        video_lang = url.split('/')[-4]
        # First hop: locate the videothek JS file that knows the player.
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # Second hop: pull stream path, SWF player and rtmp base from the JS.
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an Arte+7 page to its video info via two XML hops."""
        # Language code is embedded in the URL path (e.g. /fr/ or /de/).
        video_lang = url.split('/')[-3]
        # First hop: the flash movie param points at a videoref XML file.
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Second hop: pick the <video> entry matching the page language.
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Final hop: the per-video XML carries id, title, date and the
        # HD stream URL.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live-stream pages end in index-<n>.html; their extraction
        # result is discarded (returns None) — see extractLiveStream.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1222
1223
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        # HTTPErrorProcessor lives in the *request* module; referencing it
        # through compat_urllib_error only worked on Python 2, where both
        # names aliased urllib2.
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # Decode to text: urlopen().read() returns bytes on Python 3
            # and the str regexes below would raise a TypeError on it.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # (the previous message here wrongly said "title")
            self._downloader.trouble(u'ERROR: unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1368
1369
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles pseudo-URLs of the form ``ytsearch:<query>`` (first result),
    ``ytsearchN:<query>`` (first N results) and ``ytsearchall:<query>``
    (up to _max_youtube_results), querying the YouTube GData API and
    queueing each matching video for download.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        # NOTE(review): the query was encoded as UTF-8 in _real_extract but
        # is decoded here with the locale's preferred encoding — confirm
        # this matches on non-UTF-8 locales.
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first colon only, so search terms that themselves
        # contain ':' (e.g. "ytsearch5:foo: bar") are preserved intact.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            # Keep the try block minimal: only int() should be treated as a
            # "prefix is not a number" signal; previously a ValueError
            # escaping _download_n_results was silently misinterpreted.
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            elif n > self._max_youtube_results:
                self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            self._download_n_results(query, n)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            # start-index is 1-based in the GData API.
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never ask for more results than the API reports to exist.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1444
1445
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles pseudo-URLs of the form ``gvsearch[N|all]:<query>`` by scraping
    the Google Video result pages and queueing each hit for download.
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first colon only, so search terms that themselves
        # contain ':' are preserved intact.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            # Keep the try block minimal so only int() failures are treated
            # as a non-numeric prefix.
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            elif n > self._max_google_results:
                self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            self._download_n_results(query, n)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        self._queue_downloads(video_ids)
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No further result pages: download what was found.
                self._queue_downloads(video_ids)
                return

            pagenum = pagenum + 1

    def _queue_downloads(self, video_ids):
        """Hand the collected video ids to the downloader."""
        for id in video_ids:
            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1526
1527
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Handles pseudo-URLs of the form ``yvsearch[N|all]:<query>``.
    """

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first colon only, so search terms that themselves
        # contain ':' are preserved intact.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            # Keep the try block minimal so only int() failures are treated
            # as a non-numeric prefix.
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            elif n > self._max_yahoo_results:
                self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
            self._download_n_results(query, n)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        self._queue_downloads(video_ids)
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No further result pages: download what was found.
                self._queue_downloads(video_ids)
                return

            pagenum = pagenum + 1

    def _queue_downloads(self, video_ids):
        """Hand the collected video ids to the downloader."""
        for id in video_ids:
            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1612
1613
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Accepts playlist, course, artist and user-channel playlist URLs as well
    as bare playlist ids; downloads each contained video via the downloader.
    """

    # Group 1: playlist type marker ('p', 'a' or 'list'); group 2: the
    # playlist/artist id; group 3: an optional single video id embedded in
    # the URL (if present, only that video is downloaded).
    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    # Filled with (access point, id parameter name, playlist id, page number).
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    # %s is replaced with the playlist id so only links belonging to this
    # playlist are matched.
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    # Literal marker present on a page whenever another page follows.
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of the playlist and queue them for download."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            # Duplicates are dropped only within a single page.
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        total = len(video_ids)

        # Honour --playlist-start / --playlist-end (1-based options converted
        # to 0-based slice indices; -1 means "until the end").
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1692
1693
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Queue every video listed on the channel's paginated video tab."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = mobj.group(1)

        # Walk the channel's listing pages; the "Next »" marker indicates
        # whether another page follows.
        video_ids = []
        for pagenum in itertools.count(1):
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect ids in page order, dropping duplicates within this page.
            page_ids = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1744
1745
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Queue every upload of the given YouTube user for download."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps results per request at _GDATA_PAGE_SIZE, so the
        # uploads feed is fetched page by page until a short page signals
        # that no further ids remain.
        video_ids = []
        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect ids in page order, dropping duplicates within this page.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            # A page that is not "full" must be the last one — no need to
            # query again.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

        all_ids_count = len(video_ids)

        # Honour --playlist-start / --playlist-end (1-based options converted
        # to 0-based slice indices; -1 means "until the end").
        first = self._downloader.params.get('playliststart', 1) - 1
        last = self._downloader.params.get('playlistend', -1)
        video_ids = video_ids[first:] if last == -1 else video_ids[first:last]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1827
1828
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Queue every video of a blip.tv user for download."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # The numeric user id embedded in the profile page is needed to
        # build the Ajax episode-list URL. Previously a missing id raised an
        # uncaught AttributeError; report it as an extraction error instead.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract user id from webpage')
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # compat_str for consistency with the other extractors (plain
                # str(err) can fail on non-ASCII messages under Python 2).
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers. Unescape before the duplicate test:
            # previously the raw href was compared against already-unescaped
            # stored ids, so entity-containing duplicates slipped through.
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Honour --playlist-start / --playlist-end (1-based options converted
        # to 0-based slice indices; -1 means "until the end").
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1919
1920
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Extract the direct file URL and title for a depositfiles link."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        # NOTE(review): webpage is bytes here; the str patterns below assume
        # Python 2 semantics — confirm before running under Python 3.
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Raw string so '\s' is a regex escape, not a (deprecated)
                # string escape.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
1979
1980
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in with credentials from --username/--password or .netrc, if any."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # No credentials available: anonymous access only.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means the credentials were rejected.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the HD video URL, title and metadata from a Facebook video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The swf parameter list sits between these two literal markers; the
        # captured text is a JSON array of [name, value] pairs.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        video_url = compat_urllib_parse.unquote(data['hd_src'])
        video_duration = int(data['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': data['thumbnail_src'],
        }
        return [info]
2070
2071
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv

    Handles two cases: a URL pointing directly at a media file
    (detected via the Content-Type header) and a regular page URL,
    which is re-requested with skin=json to get the metadata.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL points directly at a media file."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON skin parameters; the iTunes User-Agent makes
        # blip.tv return the full metadata.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                if isinstance(title, bytes):
                    # Python 2 returns byte strings here; Python 3 str has
                    # no .decode, so only decode when actually bytes.
                    title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The payload is sometimes wrapped in a 'Post' envelope.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                # Derive the extension from the media URL's suffix.
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        return [info]
2161
2162
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the flv URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fixed: was self._download.trouble, which raised AttributeError
            # instead of reporting the invalid URL.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link encodes the media server path; the flv lives
        # next to the thumbs directory.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2211
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Bitrates (as strings) the mediaGen config may offer.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # bitrate -> container extension, for --list-formats output.
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # bitrate -> display resolution, for --list-formats output.
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        # Progress message: extraction started.
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        # Progress message: downloading the per-item mediaGen config.
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        # Progress message: downloading the episode's mrss index.
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        # Print each available bitrate with its extension and dimensions.
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Resolve the URL (shortcut, clip or episode), download the show's
        mrss index, then fetch the mediaGen config for every <item> and
        turn the chosen RTMP rendition into a plain HTTP URL."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Shortcut forms (:tds, :colbert, ...) map to the show's
        # full-episodes page, which redirects to the newest episode.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # No explicit episode in the URL means "download the newest".
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Follow the redirect to learn which episode is the newest one.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # mgid URIs referenced by the Flash player embed.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the mgid without a URL prefix;
            # extract the alternate reference and add the prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        # One <item> per act/part of the episode.
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # turls: list of (bitrate, rtmp url), in document order.
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP URL as a plain HTTP download URL.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2405
2406
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report configuration download."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract video info from the page's og: metadata and the
        player's JSON-ish configuration file."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honor the charset declared in the Content-Type header, if any.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # Each match is guarded: previously a missing tag crashed with an
        # opaque AttributeError on .group(1) instead of a useful message.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        if descMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract description')
            return
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        if imgMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract thumbnail')
            return
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract player URL')
            return
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract config URL')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        # NOTE(review): assumes playlist entry 1 is the video — confirm.
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2480
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Announce the XML manifest download."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Announce the start of information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Fetch the moogaloop metadata XML and the Adobe f4m manifest,
        then assemble the final fragment URL from the two."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        self.report_extraction(video_id)
        meta_url = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            meta_xml = compat_urllib_request.urlopen(meta_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        meta_doc = xml.etree.ElementTree.fromstring(meta_xml)
        try:
            video_node = meta_doc.findall('./video')[0]
            description = video_node.findall('./description')[0].text
            title = video_node.findall('./caption')[0].text
            thumbnail = video_node.findall('./thumbnail')[0].text
            manifest_url = video_node.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifest_xml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        manifest_doc = xml.etree.ElementTree.fromstring(manifest_xml)
        try:
            media_node = manifest_doc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            manifest_id = manifest_doc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # The fragment URL reuses scheme/host of the manifest URL; the last
        # two characters of the manifest id are dropped.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        video_url = url_pr.scheme + '://' + url_pr.netloc + '/z' + manifest_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        return [{
            'id': video_id,
            'uploader': None,
            'upload_date': None,
            'description': description,
            'title': title,
            'thumbnail': thumbnail,
            'url': video_url,
            'ext': 'f4f',
        }]
2551
2552
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Announce the start of information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Pull the flv URL, title and thumbnail out of the watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Video URL is URL-encoded in the flv_url query parameter.
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2610
2611
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com.

       Resolves the track page through api.soundcloud.com/resolve.json,
       then queries api.sndcdn.com for the stream definitions and
       returns the 128 kbps mp3 stream.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Announce that the track id is being resolved."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Announce that the stream is being retrieved."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Uploader and the slug of the song title are both part of the URL.
        uploader = mobj.group(1)
        slug_title = mobj.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            info_json = compat_urllib_request.urlopen(
                compat_urllib_request.Request(resolv_url)).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            stream_json = compat_urllib_request.urlopen(
                compat_urllib_request.Request(streams_url)).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2684
2685
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Announce the start of information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Decode the base64 jsclassref attribute into the rtmpe path and
        collect title/description from the page."""
        if re.match(self._VALID_URL, url) is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Video URL is base64-encoded in the jsclassref attribute.
        url_mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if url_mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(url_mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        title_mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if title_mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_mobj.group(1)

        desc_mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if desc_mobj is not None:
            video_description = desc_mobj.group(1)
        else:
            video_description = u'No description available.'

        # The id and extension both come from the final path component.
        video_id, extension = video_url.split('/')[-1].split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2739
2740 class MixcloudIE(InfoExtractor):
2741     """Information extractor for www.mixcloud.com"""
2742
2743     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2744     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2745     IE_NAME = u'mixcloud'
2746
    def __init__(self, downloader=None):
        # Plain delegation to the base class; MixcloudIE keeps no own state.
        InfoExtractor.__init__(self, downloader)
2749
2750     def report_download_json(self, file_id):
2751         """Report JSON download."""
2752         self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2753
2754     def report_extraction(self, file_id):
2755         """Report information extraction."""
2756         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2757
2758     def get_urls(self, jsonData, fmt, bitrate='best'):
2759         """Get urls from 'audio_formats' section in json"""
2760         file_url = None
2761         try:
2762             bitrate_list = jsonData[fmt]
2763             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2764                 bitrate = max(bitrate_list) # select highest
2765
2766             url_list = jsonData[fmt][bitrate]
2767         except TypeError: # we have no bitrate info.
2768             url_list = jsonData[fmt]
2769         return url_list
2770
2771     def check_urls(self, url_list):
2772         """Returns 1st active url from list"""
2773         for url in url_list:
2774             try:
2775                 compat_urllib_request.urlopen(url)
2776                 return url
2777             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2778                 url = None
2779
2780         return None
2781
2782     def _print_formats(self, formats):
2783         print('Available formats:')
2784         for fmt in formats.keys():
2785             for b in formats[fmt]:
2786                 try:
2787                     ext = formats[fmt][b][0]
2788                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2789                 except TypeError: # we have no bitrate info
2790                     ext = formats[fmt][0]
2791                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2792                     break
2793
2794     def _real_extract(self, url):
2795         mobj = re.match(self._VALID_URL, url)
2796         if mobj is None:
2797             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2798             return
2799         # extract uploader & filename from url
2800         uploader = mobj.group(1).decode('utf-8')
2801         file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2802
2803         # construct API request
2804         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2805         # retrieve .json file with links to files
2806         request = compat_urllib_request.Request(file_url)
2807         try:
2808             self.report_download_json(file_url)
2809             jsonData = compat_urllib_request.urlopen(request).read()
2810         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2811             self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2812             return
2813
2814         # parse JSON
2815         json_data = json.loads(jsonData)
2816         player_url = json_data['player_swf_url']
2817         formats = dict(json_data['audio_formats'])
2818
2819         req_format = self._downloader.params.get('format', None)
2820         bitrate = None
2821
2822         if self._downloader.params.get('listformats', None):
2823             self._print_formats(formats)
2824             return
2825
2826         if req_format is None or req_format == 'best':
2827             for format_param in formats.keys():
2828                 url_list = self.get_urls(formats, format_param)
2829                 # check urls
2830                 file_url = self.check_urls(url_list)
2831                 if file_url is not None:
2832                     break # got it!
2833         else:
2834             if req_format not in formats:
2835                 self._downloader.trouble(u'ERROR: format is not available')
2836                 return
2837
2838             url_list = self.get_urls(formats, req_format)
2839             file_url = self.check_urls(url_list)
2840             format_param = req_format
2841
2842         return [{
2843             'id': file_id.decode('utf-8'),
2844             'url': file_url.decode('utf-8'),
2845             'uploader': uploader.decode('utf-8'),
2846             'upload_date': None,
2847             'title': json_data['name'],
2848             'ext': file_url.split('.')[-1].decode('utf-8'),
2849             'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2850             'thumbnail': json_data['thumbnail_url'],
2851             'description': json_data['description'],
2852             'player_url': player_url.decode('utf-8'),
2853         }]
2854
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on the URL shape: a single video, a course page
        (playlist of videos), or the root page (playlist of courses)."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                # fromstring accepts the raw bytes directly
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                # decode to text: the str regexes below would raise
                # TypeError against a bytes page on Python 3
                coursepage = compat_urllib_request.urlopen(url).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall(r'<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results

        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                # decode for the same reason as the course page above
                rootpage = compat_urllib_request.urlopen(rootURL).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            links = orderedSet(re.findall(r'<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
2971
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # _download_webpage returns text, so the meta values can be used
        # directly; the former .decode('iso-8859-1') calls crashed on
        # Python 3 where str has no decode method.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # fixed previously-garbled message ('unable to mtvn_uri')
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3051
3052
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves a video as several segments; one info dict is returned
    per segment.  The segment file ids are obfuscated with a seed-based
    character scramble that is reversed by the helpers below.
    """

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        # Session id: current time in milliseconds followed by two random
        # numbers, concatenated as a single decimal string.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Build the character permutation table used to de-obfuscate file
        # ids.  A linear congruential generator seeded with `seed` removes
        # characters from `source` one at a time; the removal order IS the
        # permutation, so the exact arithmetic below must not be changed.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # `fileId` is a '*'-separated list of indices into the permutation
        # table derived from `seed`; translate it back to the real id.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # JSON playlist endpoint describing all streams/segments of the video
        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the requested format onto Youku's stream names; anything
            # other than best/worst falls back to flv.  Note: ext stays
            # u'flv' even when the 'hd2' stream is selected.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            # one key per segment, consumed below to build segment URLs
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3162
3163
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report that the video webpage is being downloaded."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report that information is being extracted from the page."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        self.report_webpage(video_id)

        # Fetch the page and decode it to text
        try:
            page = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        # Flash video URL (percent-encoded in the page)
        url_match = re.search(self.VIDEO_URL_RE, page)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # Title from the <title> tag, minus the site-name suffix
        title_match = re.search(self.VIDEO_TITLE_RE, page)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        thumb_match = re.search(self.VIDEO_THUMB_RE, page)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3226
3227
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report downloading entry."""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date."""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader."""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report entry title."""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            return # was missing: mobj.group(1) below would raise AttributeError

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)


        # Extract video links of all sizes from the video page
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            return # was missing: links[-1] below would raise IndexError

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3351
class NBAIE(InfoExtractor):
    """Information extractor for NBA.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The video URL follows directly from the page path
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # Return the first (unescaped) group of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # fixed key: was 'uploader_date', which no consumer reads
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3387
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one API page and return (item_count, info_dicts).

        On error the problem is reported and (0, []) is returned so the
        pagination loop in _real_extract terminates cleanly; the old
        bare 'return' yielded None and made the caller's tuple
        unpacking raise TypeError.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            return (0, [])

        response = json.loads(webpage)
        if not isinstance(response, list):
            # the API signals errors with a JSON object instead of a list
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # 'start_time' begins with an ISO date; strip the dashes
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # A channel URL (one group matched) is paged; a single broadcast is not
        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # a short page means we have reached the end of the archive
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3474
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: unable to find video information')
            return # was missing: m.group('url') below would raise AttributeError
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            self._downloader.trouble(u'Cannot find video title')
            return # was missing: m.group('title') below would raise AttributeError
        title = unescapeHTML(m.group('title'))

        # Description is optional
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3511
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        if not m:
            # Raise instead of calling trouble(): with --ignore-errors trouble()
            # returns and the following m.group() would crash with AttributeError.
            raise ExtractorError(u'Cannot find status ID')
        status_id = m.group(1)

        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        if not m:
            # The description doubles as the title below, so it is mandatory.
            raise ExtractorError(u'Cannot find description')
        # Strip the embedded <a> links, then unescape the remaining tweet text.
        desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find uploader')
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        if not m:
            raise ExtractorError(u'Cannot find upload date')
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        title = desc
        # The downloadable .mov file is addressed by the tweet status id.
        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mov',
            'title': title,
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
        }
        return [info]
3560         
class SteamIE(InfoExtractor):
    """Information extractor for game trailers on store.steampowered.com."""
    _VALID_URL = r"""http://store.steampowered.com/ 
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written in verbose mode, which the
        # base-class matching does not enable.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        # Matches the movie entries of the player's JavaScript configuration.
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        # Titles are listed in the same order as the movie entries, so pair
        # them up positionally.
        titles = re.finditer(namesRE, webpage)
        videos = []
        for vid, vtitle in zip(mweb, titles):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            if not video_url:
                # Raise instead of trouble(): previously a broken entry with an
                # empty URL was still appended after reporting the error.
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title)
            }
            videos.append(info)
        return videos
3596
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv broadcasts."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # The flv is served straight from the CDN, keyed by the recording id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'data-title="(?P<title>.+)"', webpage)
        if m is None:
            # Previously an unmatched page crashed with AttributeError on
            # m.group(); fail with a clear extractor error instead.
            raise ExtractorError(u'Unable to extract video title')
        title = m.group('title')
        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract uploader')
        uploader = m.group('uploader')
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }
        return [info]
3618
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        show_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, show_id)

        # The show metadata is embedded as a JSON blob in an inline script.
        metadata_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if metadata_match is None:
            raise ExtractorError(u'Cannot find metadata')

        try:
            show = json.loads(metadata_match.group(1))
        except ValueError as err:
            raise ExtractorError(u'Invalid JSON: ' + str(err))

        # Request the 256 kbit/s stream from the Akamai URL.
        stream_url = show['akamai_url'] + '&cbr=256'
        stream_path = compat_urllib_parse_urlparse(stream_url).path
        host = show.get('host', {})
        image = show.get('image', {})
        return [{
                'id': show_id,
                'url': stream_url,
                'ext': stream_path.rpartition('.')[2],
                'title': show['title'],
                'description': show.get('teaser_text'),
                'location': show.get('country_of_origin'),
                'uploader': host.get('name'),
                'uploader_id': host.get('slug'),
                'thumbnail': image.get('large_url_2x'),
                'duration': show.get('duration'),
        }]
3653
3654
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of formats whose 'format' equals req_format, or None."""
        for x in formats:
            if x['format'] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # An age-verification cookie is required to see the video page.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'videoTitleArea">(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (non-fatal)
        result = re.search(r'Date:</b>(?P<date>.*)</li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'WARNING: unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader (non-fatal)
        result = re.search(r'Submitted:</b>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'ERROR: unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html(video_uploader)

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The 5th path component encodes "<size>_<bitrate>_<id>".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join(format)
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            # BUG FIX: this previously tested the stale `result` variable from
            # the page parsing above, so a missing requested format was never
            # reported and [None] was returned to the downloader.
            if format is None:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            return [format]
3771
3772         
3773
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = match.group('videoid')
        video_title = match.group('title')

        # Fetch the watch page
        webpage = self._download_webpage(url, video_id)

        # The flv location is embedded in the player setup code
        url_result = re.search(r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",', webpage)
        if url_result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_result.group('url'))

        # Upload date shown next to the uploader name
        date_result = re.search(r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by', webpage)
        if date_result is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        upload_date = date_result.group('date')

        return [{'id': video_id,
                 'url': video_url,
                 'uploader': None,
                 'upload_date': upload_date,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv'}]
3815
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = match.group('videoid')

        # Fetch the watch page and take the title from the <title> tag
        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        # The actual stream location lives on a separate embed page
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The file URL is passed to the flash player via addVariable
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{'id': video_id,
                 'url': source_match.group('source'),
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3861
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix description is the JSON argument of the TRAX.Mix constructor
        mix_match = re.search(r"new TRAX.Mix\((.*?)\);\n*\s*TRAX.initSearchAutocomplete\('#search'\);", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        mix = json.loads(mix_match.group(1))

        # A random session id is needed by the play/next API endpoints
        session = str(random.randint(0, 1000000000))
        mix_id = mix['id']
        track_count = mix['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        entries = []
        track_number = 0
        while True:
            track_number += 1
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track = api_data[u'set']['track']
            entries.append({
                'id': track['id'],
                'url': track['track_file_stream_url'],
                'title': track['performer'] + u' - ' + track['name'],
                'raw_title': track['name'],
                'uploader_id': mix['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track['id'])
        return entries
3905
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Keep this order: more specific extractors must come before the
    # catch-all GenericIE at the end.
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        JustinTVIE,
        FunnyOrDieIE,
        TweetReelIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        GenericIE,
    )
    return [klass() for klass in extractor_classes]
3954
3955