_ Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 import base64
   2 import datetime
   3 import itertools
   4 import netrc
   5 import os
   6 import re
   7 import socket
   8 import time
   9 import email.utils
  10 import xml.etree.ElementTree
  11 import random
  12 import math
  13 import operator
  14 import hashlib
  15 import binascii
  16 import urllib
  17
  18 from .utils import *
  19 from .extractor.common import InfoExtractor, SearchInfoExtractor
  20
  21 from .extractor.ard import ARDIE
  22 from .extractor.arte import ArteTvIE
  23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
  24 from .extractor.comedycentral import ComedyCentralIE
  25 from .extractor.collegehumor import CollegeHumorIE
  26 from .extractor.dailymotion import DailymotionIE
  27 from .extractor.depositfiles import DepositFilesIE
  28 from .extractor.escapist import EscapistIE
  29 from .extractor.facebook import FacebookIE
  30 from .extractor.funnyordie import FunnyOrDieIE
  31 from .extractor.gametrailers import GametrailersIE
  32 from .extractor.generic import GenericIE
  33 from .extractor.googleplus import GooglePlusIE
  34 from .extractor.googlesearch import GoogleSearchIE
  35 from .extractor.infoq import InfoQIE
  36 from .extractor.justintv import JustinTVIE
  37 from .extractor.metacafe import MetacafeIE
  38 from .extractor.mixcloud import MixcloudIE
  39 from .extractor.mtv import MTVIE
  40 from .extractor.myvideo import MyVideoIE
  41 from .extractor.nba import NBAIE
  42 from .extractor.statigram import StatigramIE
  43 from .extractor.photobucket import PhotobucketIE
  44 from .extractor.rbmaradio import RBMARadioIE
  45 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
  46 from .extractor.stanfordoc import StanfordOpenClassroomIE
  47 from .extractor.steam import SteamIE
  48 from .extractor.ted import TEDIE
  49 from .extractor.ustream import UstreamIE
  50 from .extractor.vimeo import VimeoIE
  51 from .extractor.worldstarhiphop import WorldStarHipHopIE
  52 from .extractor.xnxx import XNXXIE
  53 from .extractor.xvideos import XVideosIE
  54 from .extractor.yahoo import YahooIE, YahooSearchIE
  55 from .extractor.youku import YoukuIE
  56 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
  57 from .extractor.zdf import ZDFIE
  58
  59
  60
  61
  62
  63
  64
  65
  66
  67
  68
  69
  70
  71
  72
  73
  74
  75
  76
  77
  78 class YouPornIE(InfoExtractor):
  79     """Information extractor for youporn.com."""
  80     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
  81
  82     def _print_formats(self, formats):
  83         """Print all available formats"""
  84         print(u'Available formats:')
  85         print(u'ext\t\tformat')
  86         print(u'---------------------------------')
  87         for format in formats:
  88             print(u'%s\t\t%s'  % (format['ext'], format['format']))
  89
  90     def _specific(self, req_format, formats):
  91         for x in formats:
  92             if(x["format"]==req_format):
  93                 return x
  94         return None
  95
  96     def _real_extract(self, url):
  97         mobj = re.match(self._VALID_URL, url)
  98         if mobj is None:
  99             raise ExtractorError(u'Invalid URL: %s' % url)
 100         video_id = mobj.group('videoid')
 101
 102         req = compat_urllib_request.Request(url)
 103         req.add_header('Cookie', 'age_verified=1')
 104         webpage = self._download_webpage(req, video_id)
 105
 106         # Get JSON parameters
 107         json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
 108         try:
 109             params = json.loads(json_params)
 110         except:
 111             raise ExtractorError(u'Invalid JSON')
 112
 113         self.report_extraction(video_id)
 114         try:
 115             video_title = params['title']
 116             upload_date = unified_strdate(params['release_date_f'])
 117             video_description = params['description']
 118             video_uploader = params['submitted_by']
 119             thumbnail = params['thumbnails'][0]['image']
 120         except KeyError:
 121             raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
 122
 123         # Get all of the formats available
 124         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
 125         download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
 126             webpage, u'download list').strip()
 127
 128         # Get all of the links from the page
 129         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
 130         links = re.findall(LINK_RE, download_list_html)
 131         if(len(links) == 0):
 132             raise ExtractorError(u'ERROR: no known formats available for video')
 133
 134         self.to_screen(u'Links found: %d' % len(links))
 135
 136         formats = []
 137         for link in links:
 138
 139             # A link looks like this:
 140             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
 141             # A path looks like this:
 142             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
 143             video_url = unescapeHTML( link )
 144             path = compat_urllib_parse_urlparse( video_url ).path
 145             extension = os.path.splitext( path )[1][1:]
 146             format = path.split('/')[4].split('_')[:2]
 147             size = format[0]
 148             bitrate = format[1]
 149             format = "-".join( format )
 150             # title = u'%s-%s-%s' % (video_title, size, bitrate)
 151
 152             formats.append({
 153                 'id': video_id,
 154                 'url': video_url,
 155                 'uploader': video_uploader,
 156                 'upload_date': upload_date,
 157                 'title': video_title,
 158                 'ext': extension,
 159                 'format': format,
 160                 'thumbnail': thumbnail,
 161                 'description': video_description
 162             })
 163
 164         if self._downloader.params.get('listformats', None):
 165             self._print_formats(formats)
 166             return
 167
 168         req_format = self._downloader.params.get('format', None)
 169         self.to_screen(u'Format: %s' % req_format)
 170
 171         if req_format is None or req_format == 'best':
 172             return [formats[0]]
 173         elif req_format == 'worst':
 174             return [formats[-1]]
 175         elif req_format in ('-1', 'all'):
 176             return formats
 177         else:
 178             format = self._specific( req_format, formats )
 179             if result is None:
 180                 raise ExtractorError(u'Requested format not available')
 181             return [format]
 182
 183
 184
 185 class PornotubeIE(InfoExtractor):
 186     """Information extractor for pornotube.com."""
 187     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
 188
 189     def _real_extract(self, url):
 190         mobj = re.match(self._VALID_URL, url)
 191         if mobj is None:
 192             raise ExtractorError(u'Invalid URL: %s' % url)
 193
 194         video_id = mobj.group('videoid')
 195         video_title = mobj.group('title')
 196
 197         # Get webpage content
 198         webpage = self._download_webpage(url, video_id)
 199
 200         # Get the video URL
 201         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
 202         video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
 203         video_url = compat_urllib_parse.unquote(video_url)
 204
 205         #Get the uploaded date
 206         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
 207         upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
 208         if upload_date: upload_date = unified_strdate(upload_date)
 209
 210         info = {'id': video_id,
 211                 'url': video_url,
 212                 'uploader': None,
 213                 'upload_date': upload_date,
 214                 'title': video_title,
 215                 'ext': 'flv',
 216                 'format': 'flv'}
 217
 218         return [info]
 219
 220 class YouJizzIE(InfoExtractor):
 221     """Information extractor for youjizz.com."""
 222     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
 223
 224     def _real_extract(self, url):
 225         mobj = re.match(self._VALID_URL, url)
 226         if mobj is None:
 227             raise ExtractorError(u'Invalid URL: %s' % url)
 228
 229         video_id = mobj.group('videoid')
 230
 231         # Get webpage content
 232         webpage = self._download_webpage(url, video_id)
 233
 234         # Get the video title
 235         video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
 236             webpage, u'title').strip()
 237
 238         # Get the embed page
 239         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
 240         if result is None:
 241             raise ExtractorError(u'ERROR: unable to extract embed page')
 242
 243         embed_page_url = result.group(0).strip()
 244         video_id = result.group('videoid')
 245
 246         webpage = self._download_webpage(embed_page_url, video_id)
 247
 248         # Get the video URL
 249         video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
 250             webpage, u'video URL')
 251
 252         info = {'id': video_id,
 253                 'url': video_url,
 254                 'title': video_title,
 255                 'ext': 'flv',
 256                 'format': 'flv',
 257                 'player_url': embed_page_url}
 258
 259         return [info]
 260
 261 class EightTracksIE(InfoExtractor):
 262     IE_NAME = '8tracks'
 263     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
 264
 265     def _real_extract(self, url):
 266         mobj = re.match(self._VALID_URL, url)
 267         if mobj is None:
 268             raise ExtractorError(u'Invalid URL: %s' % url)
 269         playlist_id = mobj.group('id')
 270
 271         webpage = self._download_webpage(url, playlist_id)
 272
 273         json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
 274         data = json.loads(json_like)
 275
 276         session = str(random.randint(0, 1000000000))
 277         mix_id = data['id']
 278         track_count = data['tracks_count']
 279         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
 280         next_url = first_url
 281         res = []
 282         for i in itertools.count():
 283             api_json = self._download_webpage(next_url, playlist_id,
 284                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
 285                 errnote=u'Failed to download song information')
 286             api_data = json.loads(api_json)
 287             track_data = api_data[u'set']['track']
 288             info = {
 289                 'id': track_data['id'],
 290                 'url': track_data['track_file_stream_url'],
 291                 'title': track_data['performer'] + u' - ' + track_data['name'],
 292                 'raw_title': track_data['name'],
 293                 'uploader_id': data['user']['login'],
 294                 'ext': 'm4a',
 295             }
 296             res.append(info)
 297             if api_data['set']['at_last_track']:
 298                 break
 299             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
 300         return res
 301
 302 class KeekIE(InfoExtractor):
 303     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
 304     IE_NAME = u'keek'
 305
 306     def _real_extract(self, url):
 307         m = re.match(self._VALID_URL, url)
 308         video_id = m.group('videoID')
 309
 310         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
 311         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
 312         webpage = self._download_webpage(url, video_id)
 313
 314         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
 315             webpage, u'title')
 316
 317         uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
 318             webpage, u'uploader', fatal=False)
 319
 320         info = {
 321                 'id': video_id,
 322                 'url': video_url,
 323                 'ext': 'mp4',
 324                 'title': video_title,
 325                 'thumbnail': thumbnail,
 326                 'uploader': uploader
 327         }
 328         return [info]
 329
 330
 331 class MySpassIE(InfoExtractor):
 332     _VALID_URL = r'http://www.myspass.de/.*'
 333
 334     def _real_extract(self, url):
 335         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
 336
 337         # video id is the last path element of the URL
 338         # usually there is a trailing slash, so also try the second but last
 339         url_path = compat_urllib_parse_urlparse(url).path
 340         url_parent_path, video_id = os.path.split(url_path)
 341         if not video_id:
 342             _, video_id = os.path.split(url_parent_path)
 343
 344         # get metadata
 345         metadata_url = META_DATA_URL_TEMPLATE % video_id
 346         metadata_text = self._download_webpage(metadata_url, video_id)
 347         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
 348
 349         # extract values from metadata
 350         url_flv_el = metadata.find('url_flv')
 351         if url_flv_el is None:
 352             raise ExtractorError(u'Unable to extract download url')
 353         video_url = url_flv_el.text
 354         extension = os.path.splitext(video_url)[1][1:]
 355         title_el = metadata.find('title')
 356         if title_el is None:
 357             raise ExtractorError(u'Unable to extract title')
 358         title = title_el.text
 359         format_id_el = metadata.find('format_id')
 360         if format_id_el is None:
 361             format = ext
 362         else:
 363             format = format_id_el.text
 364         description_el = metadata.find('description')
 365         if description_el is not None:
 366             description = description_el.text
 367         else:
 368             description = None
 369         imagePreview_el = metadata.find('imagePreview')
 370         if imagePreview_el is not None:
 371             thumbnail = imagePreview_el.text
 372         else:
 373             thumbnail = None
 374         info = {
 375             'id': video_id,
 376             'url': video_url,
 377             'title': title,
 378             'ext': extension,
 379             'format': format,
 380             'thumbnail': thumbnail,
 381             'description': description
 382         }
 383         return [info]
 384
 385 class SpiegelIE(InfoExtractor):
 386     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
 387
 388     def _real_extract(self, url):
 389         m = re.match(self._VALID_URL, url)
 390         video_id = m.group('videoID')
 391
 392         webpage = self._download_webpage(url, video_id)
 393
 394         video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
 395             webpage, u'title')
 396
 397         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
 398         xml_code = self._download_webpage(xml_url, video_id,
 399                     note=u'Downloading XML', errnote=u'Failed to download XML')
 400
 401         idoc = xml.etree.ElementTree.fromstring(xml_code)
 402         last_type = idoc[-1]
 403         filename = last_type.findall('./filename')[0].text
 404         duration = float(last_type.findall('./duration')[0].text)
 405
 406         video_url = 'http://video2.spiegel.de/flash/' + filename
 407         video_ext = filename.rpartition('.')[2]
 408         info = {
 409             'id': video_id,
 410             'url': video_url,
 411             'ext': video_ext,
 412             'title': video_title,
 413             'duration': duration,
 414         }
 415         return [info]
 416
 417 class LiveLeakIE(InfoExtractor):
 418
 419     _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
 420     IE_NAME = u'liveleak'
 421
 422     def _real_extract(self, url):
 423         mobj = re.match(self._VALID_URL, url)
 424         if mobj is None:
 425             raise ExtractorError(u'Invalid URL: %s' % url)
 426
 427         video_id = mobj.group('video_id')
 428
 429         webpage = self._download_webpage(url, video_id)
 430
 431         video_url = self._search_regex(r'file: "(.*?)",',
 432             webpage, u'video URL')
 433
 434         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
 435             webpage, u'title').replace('LiveLeak.com -', '').strip()
 436
 437         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
 438             webpage, u'description', fatal=False)
 439
 440         video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
 441             webpage, u'uploader', fatal=False)
 442
 443         info = {
 444             'id':  video_id,
 445             'url': video_url,
 446             'ext': 'mp4',
 447             'title': video_title,
 448             'description': video_description,
 449             'uploader': video_uploader
 450         }
 451
 452         return [info]
 453
 454
 455
 456 class TumblrIE(InfoExtractor):
 457     _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
 458
 459     def _real_extract(self, url):
 460         m_url = re.match(self._VALID_URL, url)
 461         video_id = m_url.group('id')
 462         blog = m_url.group('blog_name')
 463
 464         url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
 465         webpage = self._download_webpage(url, video_id)
 466
 467         re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
 468         video = re.search(re_video, webpage)
 469         if video is None:
 470            raise ExtractorError(u'Unable to extract video')
 471         video_url = video.group('video_url')
 472         ext = video.group('ext')
 473
 474         video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
 475             webpage, u'thumbnail', fatal=False)  # We pick the first poster
 476         if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
 477
 478         # The only place where you can get a title, it's not complete,
 479         # but searching in other places doesn't work for all videos
 480         video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
 481             webpage, u'title', flags=re.DOTALL)
 482
 483         return [{'id': video_id,
 484                  'url': video_url,
 485                  'title': video_title,
 486                  'thumbnail': video_thumbnail,
 487                  'ext': ext
 488                  }]
 489
 490 class BandcampIE(InfoExtractor):
 491     _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
 492
 493     def _real_extract(self, url):
 494         mobj = re.match(self._VALID_URL, url)
 495         title = mobj.group('title')
 496         webpage = self._download_webpage(url, title)
 497         # We get the link to the free download page
 498         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
 499         if m_download is None:
 500             raise ExtractorError(u'No free songs found')
 501
 502         download_link = m_download.group(1)
 503         id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
 504                        webpage, re.MULTILINE|re.DOTALL).group('id')
 505
 506         download_webpage = self._download_webpage(download_link, id,
 507                                                   'Downloading free downloads page')
 508         # We get the dictionary of the track from some javascrip code
 509         info = re.search(r'items: (.*?),$',
 510                          download_webpage, re.MULTILINE).group(1)
 511         info = json.loads(info)[0]
 512         # We pick mp3-320 for now, until format selection can be easily implemented.
 513         mp3_info = info[u'downloads'][u'mp3-320']
 514         # If we try to use this url it says the link has expired
 515         initial_url = mp3_info[u'url']
 516         re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
 517         m_url = re.match(re_url, initial_url)
 518         #We build the url we will use to get the final track url
 519         # This url is build in Bandcamp in the script download_bunde_*.js
 520         request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
 521         final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
 522         # If we could correctly generate the .rand field the url would be
 523         #in the "download_url" key
 524         final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
 525
 526         track_info = {'id':id,
 527                       'title' : info[u'title'],
 528                       'ext' :   'mp3',
 529                       'url' :   final_url,
 530                       'thumbnail' : info[u'thumb_url'],
 531                       'uploader' :  info[u'artist']
 532                       }
 533
 534         return [track_info]
 535
 536 class RedTubeIE(InfoExtractor):
 537     """Information Extractor for redtube"""
 538     _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
 539
 540     def _real_extract(self,url):
 541         mobj = re.match(self._VALID_URL, url)
 542         if mobj is None:
 543             raise ExtractorError(u'Invalid URL: %s' % url)
 544
 545         video_id = mobj.group('id')
 546         video_extension = 'mp4'
 547         webpage = self._download_webpage(url, video_id)
 548
 549         self.report_extraction(video_id)
 550
 551         video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
 552             webpage, u'video URL')
 553
 554         video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
 555             webpage, u'title')
 556
 557         return [{
 558             'id':       video_id,
 559             'url':      video_url,
 560             'ext':      video_extension,
 561             'title':    video_title,
 562         }]
 563
 564 class InaIE(InfoExtractor):
 565     """Information Extractor for Ina.fr"""
 566     _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
 567
 568     def _real_extract(self,url):
 569         mobj = re.match(self._VALID_URL, url)
 570
 571         video_id = mobj.group('id')
 572         mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
 573         video_extension = 'mp4'
 574         webpage = self._download_webpage(mrss_url, video_id)
 575
 576         self.report_extraction(video_id)
 577
 578         video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
 579             webpage, u'video URL')
 580
 581         video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
 582             webpage, u'title')
 583
 584         return [{
 585             'id':       video_id,
 586             'url':      video_url,
 587             'ext':      video_extension,
 588             'title':    video_title,
 589         }]
 590
 591 class HowcastIE(InfoExtractor):
 592     """Information Extractor for Howcast.com"""
 593     _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
 594
 595     def _real_extract(self, url):
 596         mobj = re.match(self._VALID_URL, url)
 597
 598         video_id = mobj.group('id')
 599         webpage_url = 'http://www.howcast.com/videos/' + video_id
 600         webpage = self._download_webpage(webpage_url, video_id)
 601
 602         self.report_extraction(video_id)
 603
 604         video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
 605             webpage, u'video URL')
 606
 607         video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
 608             webpage, u'title')
 609
 610         video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
 611             webpage, u'description', fatal=False)
 612
 613         thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
 614             webpage, u'thumbnail', fatal=False)
 615
 616         return [{
 617             'id':       video_id,
 618             'url':      video_url,
 619             'ext':      'mp4',
 620             'title':    video_title,
 621             'description': video_description,
 622             'thumbnail': thumbnail,
 623         }]
 624
 625 class VineIE(InfoExtractor):
 626     """Information Extractor for Vine.co"""
 627     _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
 628
 629     def _real_extract(self, url):
 630         mobj = re.match(self._VALID_URL, url)
 631
 632         video_id = mobj.group('id')
 633         webpage_url = 'https://vine.co/v/' + video_id
 634         webpage = self._download_webpage(webpage_url, video_id)
 635
 636         self.report_extraction(video_id)
 637
 638         video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
 639             webpage, u'video URL')
 640
 641         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
 642             webpage, u'title')
 643
 644         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
 645             webpage, u'thumbnail', fatal=False)
 646
 647         uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
 648             webpage, u'uploader', fatal=False, flags=re.DOTALL)
 649
 650         return [{
 651             'id':        video_id,
 652             'url':       video_url,
 653             'ext':       'mp4',
 654             'title':     video_title,
 655             'thumbnail': thumbnail,
 656             'uploader':  uploader,
 657         }]
 658
 659 class FlickrIE(InfoExtractor):
 660     """Information Extractor for Flickr videos"""
 661     _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
 662
 663     def _real_extract(self, url):
 664         mobj = re.match(self._VALID_URL, url)
 665
 666         video_id = mobj.group('id')
 667         video_uploader_id = mobj.group('uploader_id')
 668         webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
 669         webpage = self._download_webpage(webpage_url, video_id)
 670
 671         secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
 672
 673         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
 674         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
 675
 676         node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
 677             first_xml, u'node_id')
 678
 679         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
 680         second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
 681
 682         self.report_extraction(video_id)
 683
 684         mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
 685         if mobj is None:
 686             raise ExtractorError(u'Unable to extract video url')
 687         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
 688
 689         video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
 690             webpage, u'video title')
 691
 692         video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
 693             webpage, u'description', fatal=False)
 694
 695         thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
 696             webpage, u'thumbnail', fatal=False)
 697
 698         return [{
 699             'id':          video_id,
 700             'url':         video_url,
 701             'ext':         'mp4',
 702             'title':       video_title,
 703             'description': video_description,
 704             'thumbnail':   thumbnail,
 705             'uploader_id': video_uploader_id,
 706         }]
 707
 708 class TeamcocoIE(InfoExtractor):
 709     _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
 710
 711     def _real_extract(self, url):
 712         mobj = re.match(self._VALID_URL, url)
 713         if mobj is None:
 714             raise ExtractorError(u'Invalid URL: %s' % url)
 715         url_title = mobj.group('url_title')
 716         webpage = self._download_webpage(url, url_title)
 717
 718         video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
 719             webpage, u'video id')
 720
 721         self.report_extraction(video_id)
 722
 723         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
 724             webpage, u'title')
 725
 726         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
 727             webpage, u'thumbnail', fatal=False)
 728
 729         video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
 730             webpage, u'description', fatal=False)
 731
 732         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
 733         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
 734
 735         video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
 736             data, u'video URL')
 737
 738         return [{
 739             'id':          video_id,
 740             'url':         video_url,
 741             'ext':         'mp4',
 742             'title':       video_title,
 743             'thumbnail':   thumbnail,
 744             'description': video_description,
 745         }]
 746
 747 class XHamsterIE(InfoExtractor):
 748     """Information Extractor for xHamster"""
 749     _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
 750
 751     def _real_extract(self,url):
 752         mobj = re.match(self._VALID_URL, url)
 753
 754         video_id = mobj.group('id')
 755         mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
 756         webpage = self._download_webpage(mrss_url, video_id)
 757
 758         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
 759         if mobj is None:
 760             raise ExtractorError(u'Unable to extract media URL')
 761         if len(mobj.group('server')) == 0:
 762             video_url = compat_urllib_parse.unquote(mobj.group('file'))
 763         else:
 764             video_url = mobj.group('server')+'/key='+mobj.group('file')
 765         video_extension = video_url.split('.')[-1]
 766
 767         video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
 768             webpage, u'title')
 769
 770         # Can't see the description anywhere in the UI
 771         # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
 772         #     webpage, u'description', fatal=False)
 773         # if video_description: video_description = unescapeHTML(video_description)
 774
 775         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
 776         if mobj:
 777             video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
 778         else:
 779             video_upload_date = None
 780             self._downloader.report_warning(u'Unable to extract upload date')
 781
 782         video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
 783             webpage, u'uploader id', default=u'anonymous')
 784
 785         video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
 786             webpage, u'thumbnail', fatal=False)
 787
 788         return [{
 789             'id':       video_id,
 790             'url':      video_url,
 791             'ext':      video_extension,
 792             'title':    video_title,
 793             # 'description': video_description,
 794             'upload_date': video_upload_date,
 795             'uploader_id': video_uploader_id,
 796             'thumbnail': video_thumbnail
 797         }]
 798
 799 class HypemIE(InfoExtractor):
 800     """Information Extractor for hypem"""
 801     _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
 802
 803     def _real_extract(self, url):
 804         mobj = re.match(self._VALID_URL, url)
 805         if mobj is None:
 806             raise ExtractorError(u'Invalid URL: %s' % url)
 807         track_id = mobj.group(1)
 808
 809         data = { 'ax': 1, 'ts': time.time() }
 810         data_encoded = compat_urllib_parse.urlencode(data)
 811         complete_url = url + "?" + data_encoded
 812         request = compat_urllib_request.Request(complete_url)
 813         response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
 814         cookie = urlh.headers.get('Set-Cookie', '')
 815
 816         self.report_extraction(track_id)
 817
 818         html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
 819             response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
 820         try:
 821             track_list = json.loads(html_tracks)
 822             track = track_list[u'tracks'][0]
 823         except ValueError:
 824             raise ExtractorError(u'Hypemachine contained invalid JSON.')
 825
 826         key = track[u"key"]
 827         track_id = track[u"id"]
 828         artist = track[u"artist"]
 829         title = track[u"song"]
 830
 831         serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
 832         request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
 833         request.add_header('cookie', cookie)
 834         song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
 835         try:
 836             song_data = json.loads(song_data_json)
 837         except ValueError:
 838             raise ExtractorError(u'Hypemachine contained invalid JSON.')
 839         final_url = song_data[u"url"]
 840
 841         return [{
 842             'id':       track_id,
 843             'url':      final_url,
 844             'ext':      "mp3",
 845             'title':    title,
 846             'artist':   artist,
 847         }]
 848
 849 class Vbox7IE(InfoExtractor):
 850     """Information Extractor for Vbox7"""
 851     _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
 852
 853     def _real_extract(self,url):
 854         mobj = re.match(self._VALID_URL, url)
 855         if mobj is None:
 856             raise ExtractorError(u'Invalid URL: %s' % url)
 857         video_id = mobj.group(1)
 858
 859         redirect_page, urlh = self._download_webpage_handle(url, video_id)
 860         new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
 861         redirect_url = urlh.geturl() + new_location
 862         webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
 863
 864         title = self._html_search_regex(r'<title>(.*)</title>',
 865             webpage, u'title').split('/')[0].strip()
 866
 867         ext = "flv"
 868         info_url = "http://vbox7.com/play/magare.do"
 869         data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
 870         info_request = compat_urllib_request.Request(info_url, data)
 871         info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
 872         info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
 873         if info_response is None:
 874             raise ExtractorError(u'Unable to extract the media url')
 875         (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
 876
 877         return [{
 878             'id':        video_id,
 879             'url':       final_url,
 880             'ext':       ext,
 881             'title':     title,
 882             'thumbnail': thumbnail_url,
 883         }]
 884
 885
 886 def gen_extractors():
 887     """ Return a list of an instance of every supported extractor.
 888     The order does matter; the first extractor matched is the one handling the URL.
 889     """
 890     return [
 891         YoutubePlaylistIE(),
 892         YoutubeChannelIE(),
 893         YoutubeUserIE(),
 894         YoutubeSearchIE(),
 895         YoutubeIE(),
 896         MetacafeIE(),
 897         DailymotionIE(),
 898         GoogleSearchIE(),
 899         PhotobucketIE(),
 900         YahooIE(),
 901         YahooSearchIE(),
 902         DepositFilesIE(),
 903         FacebookIE(),
 904         BlipTVIE(),
 905         BlipTVUserIE(),
 906         VimeoIE(),
 907         MyVideoIE(),
 908         ComedyCentralIE(),
 909         EscapistIE(),
 910         CollegeHumorIE(),
 911         XVideosIE(),
 912         SoundcloudSetIE(),
 913         SoundcloudIE(),
 914         InfoQIE(),
 915         MixcloudIE(),
 916         StanfordOpenClassroomIE(),
 917         MTVIE(),
 918         YoukuIE(),
 919         XNXXIE(),
 920         YouJizzIE(),
 921         PornotubeIE(),
 922         YouPornIE(),
 923         GooglePlusIE(),
 924         ArteTvIE(),
 925         NBAIE(),
 926         WorldStarHipHopIE(),
 927         JustinTVIE(),
 928         FunnyOrDieIE(),
 929         SteamIE(),
 930         UstreamIE(),
 931         RBMARadioIE(),
 932         EightTracksIE(),
 933         KeekIE(),
 934         TEDIE(),
 935         MySpassIE(),
 936         SpiegelIE(),
 937         LiveLeakIE(),
 938         ARDIE(),
 939         ZDFIE(),
 940         TumblrIE(),
 941         BandcampIE(),
 942         RedTubeIE(),
 943         InaIE(),
 944         HowcastIE(),
 945         VineIE(),
 946         FlickrIE(),
 947         TeamcocoIE(),
 948         XHamsterIE(),
 949         HypemIE(),
 950         Vbox7IE(),
 951         GametrailersIE(),
 952         StatigramIE(),
 953         GenericIE()
 954     ]
 955
 956 def get_info_extractor(ie_name):
 957     """Returns the info extractor class with the given ie_name"""
 958     return globals()[ie_name+'IE']