10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.dailymotion import DailymotionIE
26 from .extractor.depositfiles import DepositFilesIE
27 from .extractor.facebook import FacebookIE
28 from .extractor.gametrailers import GametrailersIE
29 from .extractor.generic import GenericIE
30 from .extractor.googleplus import GooglePlusIE
31 from .extractor.googlesearch import GoogleSearchIE
32 from .extractor.metacafe import MetacafeIE
33 from .extractor.myvideo import MyVideoIE
34 from .extractor.statigram import StatigramIE
35 from .extractor.photobucket import PhotobucketIE
36 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
37 from .extractor.vimeo import VimeoIE
38 from .extractor.yahoo import YahooIE, YahooSearchIE
39 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
40 from .extractor.zdf import ZDFIE
class EscapistIE(InfoExtractor):
    """Information extractor for escapistmagazine.com video pages."""

    # Named groups: `showname` is reported as the uploader and `episode`
    # is used as the video id.
    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'

    def _real_extract(self, url):
        """Resolve an Escapist video page to a single downloadable entry.

        The page's og:video player URL carries a `config=` parameter that
        points at a JavaScript configuration file; the media URL is taken
        from the second item of that file's playlist.

        Returns a one-element list holding the info dictionary.
        Raises ExtractorError for an invalid URL, a missing mandatory page
        field, or an unusable configuration file.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(videoId)
        webpage = self._download_webpage(url, videoId)

        # Description and thumbnail are optional; a miss is not fatal.
        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)

        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)

        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
            webpage, u'player url')

        # The meta title is matched against "<prefix> : <title>"; keep only
        # the last part. The field label used to be u'player url', which
        # made a missing title report the wrong field.
        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
            webpage, u'title').split(' : ')[-1]

        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
        configUrl = compat_urllib_parse.unquote(configUrl)

        configJSON = self._download_webpage(configUrl, videoId,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        # The media URL is read from the second playlist entry; report a
        # malformed playlist as an extraction error rather than letting a
        # bare KeyError/IndexError escape.
        try:
            playlist = config['playlist']
            videoUrl = playlist[1]['url']
        except (KeyError, IndexError, TypeError) as err:
            raise ExtractorError(u'Unexpected playlist layout in configuration file: ' + compat_str(err))

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': title,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': videoDesc,
            'player_url': playerUrl,
        }

        return [info]
# CollegeHumorIE: resolves a collegehumor.com video URL in two steps -- the
# per-video "moogaloop" metadata XML, then the Adobe f4m manifest that XML
# points to.
# NOTE(review): the embedded line numbers skip, so guards, `try:` lines, the
# `info` initialisation and the final return appear to be elided from this
# view; the comments below describe only the statements that are shown.
126 class CollegeHumorIE(InfoExtractor):
127 """Information extractor for collegehumor.com"""
# Named groups: `videoid` (digits) and `shorttitle` (remainder of the path).
130 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
131 IE_NAME = u'collegehumor'
# NOTE(review): the docstring below says "information extraction", but the
# message printed is about downloading the XML manifest.
133 def report_manifest(self, video_id):
134 """Report information extraction."""
135 self.to_screen(u'%s: Downloading XML manifest' % video_id)
137 def _real_extract(self, url):
138 mobj = re.match(self._VALID_URL, url)
140 raise ExtractorError(u'Invalid URL: %s' % url)
141 video_id = mobj.group('videoid')
149 self.report_extraction(video_id)
# Step 1: fetch the metadata XML for this video id.
150 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
152 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
153 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
154 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
156 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# Description, caption (stored as the title), thumbnail and manifest URL are
# all read from the first <video> element; `[0]` on an empty findall result
# raises IndexError.
158 videoNode = mdoc.findall('./video')[0]
159 info['description'] = videoNode.findall('./description')[0].text
160 info['title'] = videoNode.findall('./caption')[0].text
161 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
162 manifest_url = videoNode.findall('./file')[0].text
164 raise ExtractorError(u'Invalid metadata XML file')
# Step 2: fetch the f4m manifest, with a fixed hdcore query string appended.
166 manifest_url += '?hdcore=2.10.3'
167 self.report_manifest(video_id)
169 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
170 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# NOTE(review): this message still says "video info XML" although the request
# that failed is the manifest download.
171 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
173 adoc = xml.etree.ElementTree.fromstring(manifestXml)
# Elements are looked up with the f4m 1.0 namespace; from here on `video_id`
# holds the manifest's <id> text rather than the id parsed from the URL.
175 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
176 node_id = media_node.attrib['url']
177 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
# NOTE(review): `err` is bound but not used in the handler below.
178 except IndexError as err:
179 raise ExtractorError(u'Invalid manifest file')
# Final URL: manifest scheme and host, then '/z' + manifest id without its
# last two characters + '/' + media url attribute + 'Seg1-Frag1'.
181 url_pr = compat_urllib_parse_urlparse(manifest_url)
182 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    # Group 1 is the numeric video id.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Extract the flv URL, title and thumbnail from an xvideos page.

        Returns a one-element list holding the info dictionary.
        Raises ExtractorError for an invalid URL or when the video URL or
        the title cannot be found; a missing thumbnail is not fatal.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL (percent-encoded inside the flash variables)
        video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
            webpage, u'video URL'))

        # Extract title
        video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
            webpage, u'title')

        # Extract video thumbnail.
        # The search helper hands back the captured group, so the group has
        # to span the whole URL; capturing only the trailing file name left
        # `thumbnail` holding something like "abc.jpg" instead of a URL.
        video_thumbnail = self._search_regex(
            r'(http://(?:img.*?\.)xvideos\.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }

        return [info]
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Extract the rtmpe stream, title and description of an InfoQ page.

        The stream path is stored in the page as a base64-encoded,
        percent-quoted `jsclassref` value.

        Returns a one-element list holding the info dictionary.
        Raises ExtractorError for an invalid URL or when the stream
        reference or the title cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # No id is known yet, so the page URL doubles as the label for the
        # progress messages.
        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        video_title = self._search_regex(r'contentTitle = "(.*?)";',
            webpage, u'title')

        # Extract description
        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        # Split on the LAST dot only: unpacking `split('.')` into two names
        # raised ValueError for file names with no dot or more than one.
        video_filename = video_url.split('/')[-1]
        video_id, extension = os.path.splitext(video_filename)
        extension = extension[1:]

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
# MixcloudIE: looks up a cloudcast through the mixcloud.com JSON API and
# picks one reachable audio URL from its 'audio_formats' table.
# The class is flagged `_WORKING = False` below.
# NOTE(review): the embedded line numbers skip, so `try:` lines, loop
# headers, returns and dict openers appear to be elided from this view; the
# comments describe only the statements that are shown.
276 class MixcloudIE(InfoExtractor):
277 """Information extractor for www.mixcloud.com"""
279 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
# Group 1 is the uploader, group 2 the cloudcast name.
280 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
281 IE_NAME = u'mixcloud'
# NOTE(review): `file_id` is accepted but not used in the message.
283 def report_download_json(self, file_id):
284 """Report JSON download."""
285 self.to_screen(u'Downloading json')
# get_urls: `jsonData[fmt]` is either indexable by bitrate or a plain URL
# list. With no explicit/known bitrate the maximum key is chosen; a TypeError
# while indexing by bitrate is treated as "no bitrate info".
287 def get_urls(self, jsonData, fmt, bitrate='best'):
288 """Get urls from 'audio_formats' section in json"""
291 bitrate_list = jsonData[fmt]
292 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
293 bitrate = max(bitrate_list) # select highest
295 url_list = jsonData[fmt][bitrate]
296 except TypeError: # we have no bitrate info.
297 url_list = jsonData[fmt]
# check_urls: probes candidates by opening them; the network errors listed in
# the except clause are caught per candidate.
300 def check_urls(self, url_list):
301 """Returns 1st active url from list"""
304 compat_urllib_request.urlopen(url)
306 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# _print_formats: prints one line per format/bitrate; the bracketed value is
# the text after the last '.' of the first URL. '??' is printed when the
# format has no bitrate level.
311 def _print_formats(self, formats):
312 print('Available formats:')
313 for fmt in formats.keys():
314 for b in formats[fmt]:
316 ext = formats[fmt][b][0]
317 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
318 except TypeError: # we have no bitrate info
319 ext = formats[fmt][0]
320 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
323 def _real_extract(self, url):
324 mobj = re.match(self._VALID_URL, url)
326 raise ExtractorError(u'Invalid URL: %s' % url)
# NOTE(review): `.decode('utf-8')` is called on regex groups here and on
# several values in the result dict further down. That method exists on
# Python 2 byte strings only; on Python 3 text strings these calls raise
# AttributeError -- confirm which interpreter this path is meant for.
327 # extract uploader & filename from url
328 uploader = mobj.group(1).decode('utf-8')
329 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
# The API path reuses the second- and third-from-last '/'-separated pieces of
# the input URL, so it depends on how the URL ends -- presumably a trailing
# slash is expected; verify.
331 # construct API request
332 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
333 # retrieve .json file with links to files
334 request = compat_urllib_request.Request(file_url)
336 self.report_download_json(file_url)
337 jsonData = compat_urllib_request.urlopen(request).read()
338 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
339 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
342 json_data = json.loads(jsonData)
343 player_url = json_data['player_swf_url']
344 formats = dict(json_data['audio_formats'])
346 req_format = self._downloader.params.get('format', None)
349 if self._downloader.params.get('listformats', None):
350 self._print_formats(formats)
# No explicit format: walk the formats in dict order and probe each one's
# URL list.
353 if req_format is None or req_format == 'best':
354 for format_param in formats.keys():
355 url_list = self.get_urls(formats, format_param)
357 file_url = self.check_urls(url_list)
358 if file_url is not None:
361 if req_format not in formats:
362 raise ExtractorError(u'Format is not available')
364 url_list = self.get_urls(formats, req_format)
365 file_url = self.check_urls(url_list)
366 format_param = req_format
369 'id': file_id.decode('utf-8'),
370 'url': file_url.decode('utf-8'),
371 'uploader': uploader.decode('utf-8'),
373 'title': json_data['name'],
374 'ext': file_url.split('.')[-1].decode('utf-8'),
# NOTE(review): `cond and a or b` idiom; a conditional expression states the
# same intent more directly.
375 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
376 'thumbnail': json_data['thumbnail_url'],
377 'description': json_data['description'],
378 'player_url': player_url.decode('utf-8'),
# StanfordOpenClassroomIE: handles three URL shapes -- a single video page, a
# course page (expanded into its video pages) and the site root (expanded
# into its course pages).
# NOTE(review): the embedded line numbers skip, so dict literals, `try:` /
# `except` lines, loop headers and returns appear to be elided from this
# view; the comments describe only the statements that are shown.
381 class StanfordOpenClassroomIE(InfoExtractor):
382 """Information extractor for Stanford's Open ClassRoom"""
# Named groups: `path`, `course`, `video`.
# NOTE(review): the dots in "openclassroom.stanford.edu" are unescaped and
# therefore match any character.
384 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
385 IE_NAME = u'stanfordoc'
387 def _real_extract(self, url):
388 mobj = re.match(self._VALID_URL, url)
390 raise ExtractorError(u'Invalid URL: %s' % url)
# Branch 1: both course and video given. The id is "<course>_<video>" and
# the media URL comes from a per-video XML file under the course's videos/
# directory.
392 if mobj.group('course') and mobj.group('video'): # A specific video
393 course = mobj.group('course')
394 video = mobj.group('video')
396 'id': course + '_' + video,
401 self.report_extraction(info['id'])
402 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
403 xmlUrl = baseUrl + video + '.xml'
405 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
406 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
407 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
408 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# <title> and <videoFile> are read from the XML; the file name is relative to
# baseUrl.
410 info['title'] = mdoc.findall('./title')[0].text
411 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
413 raise ExtractorError(u'Invalid metadata XML file')
# Extension is whatever follows the last '.' of the media URL.
414 info['ext'] = info['url'].rpartition('.')[2]
# Branch 2: only a course given. The course page is scraped for VideoPage.php
# links and each link is extracted in turn.
416 elif mobj.group('course'): # A course page
417 course = mobj.group('course')
425 coursepage = self._download_webpage(url, info['id'],
426 note='Downloading course info page',
427 errnote='Unable to download course info page')
# The title falls back to the id when no <h1> is found (`default=`).
429 info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
431 info['description'] = self._html_search_regex('<description>([^<]+)</description>',
432 coursepage, u'description', fatal=False)
434 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
438 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# NOTE(review): `assert` is used as a runtime check here and in the root-page
# loop further down; assertions are removed when Python runs with -O.
442 for entry in info['list']:
443 assert entry['type'] == 'reference'
444 results += self.extract(entry['url'])
# Branch 3 (visible from here on): the site root. The fixed HomePage.php is
# scraped for CoursePage.php links and each link is extracted in turn.
448 'id': 'Stanford OpenClassroom',
454 self.report_download_webpage(info['id'])
455 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
457 rootpage = compat_urllib_request.urlopen(rootURL).read()
458 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
459 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
461 info['title'] = info['id']
# NOTE(review): `rootpage` is the raw result of urlopen().read() and is
# matched against a text pattern -- confirm it is decoded first on Python 3.
463 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
467 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
472 for entry in info['list']:
473 assert entry['type'] == 'reference'
474 results += self.extract(entry['url'])
# MTVIE: reads three <meta> values and a playlist id from an mtv.com video
# page, requests the mediaGen XML built from them, and uses the last
# <rendition> element as the download.
# NOTE(review): the embedded line numbers skip, so guards, `try:` / `except`
# lines, the result dict opener and the return appear to be elided from this
# view; the comments describe only the statements that are shown.
477 class MTVIE(InfoExtractor):
478 """Information extractor for MTV.com"""
# Named groups: `proto` (optional scheme) and `videoid` (digits).
480 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
483 def _real_extract(self, url):
484 mobj = re.match(self._VALID_URL, url)
486 raise ExtractorError(u'Invalid URL: %s' % url)
# A scheme-less URL is completed with http:// before downloading.
487 if not mobj.group('proto'):
488 url = 'http://' + url
489 video_id = mobj.group('videoid')
491 webpage = self._download_webpage(url, video_id)
# NOTE(review): `song_name` is not referenced again in the visible lines.
493 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
494 webpage, u'song name', fatal=False)
# The "mtv_an" meta value is stored as the video title.
496 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
499 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
500 webpage, u'mtvn_uri', fatal=False)
502 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
503 webpage, u'content id', fatal=False)
# NOTE(review): `mtvn_uri` and `content_id` are looked up with fatal=False
# but concatenated unconditionally below. Presumably a non-fatal miss yields
# None, which would raise TypeError here -- verify against the helper.
505 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
506 self.report_extraction(video_id)
507 request = compat_urllib_request.Request(videogen_url)
509 metadataXml = compat_urllib_request.urlopen(request).read()
510 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
511 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
513 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
514 renditions = mdoc.findall('.//rendition')
# The last element is assumed to be the best one; `[-1]` raises IndexError on
# an empty list unless an elided line guards it.
516 # For now, always pick the highest quality.
517 rendition = renditions[-1]
# `ext` is the part of the rendition's `type` attribute after '/'; the format
# label is "<ext>-<width>x<height>_<bitrate>".
# NOTE(review): `format` shadows the builtin of the same name.
520 _,_,ext = rendition.attrib['type'].partition('/')
521 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
522 video_url = rendition.find('./src').text
524 raise ExtractorError('Invalid rendition field.')
# NOTE(review): `performer` has no assignment among the visible lines of this
# method; if none exists on an elided line either, this is a NameError.
529 'uploader': performer,
531 'title': video_title,
# YoukuIE: fetches the getPlayList JSON for a v.youku.com video, decodes the
# obfuscated file id with a seed-driven character table, and emits one entry
# per stream segment.
# NOTE(review): the embedded line numbers skip, so several `def` lines, list
# initialisations, loop headers, the format-selection branch bodies and the
# returns appear to be elided from this view; the comments describe only the
# statements that are shown.
539 class YoukuIE(InfoExtractor):
# Named group `ID` is the alphanumeric video id.
540 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Session id: current time in milliseconds followed by two random 4-digit
# numbers, concatenated as decimal text.
543 nowTime = int(time.time() * 1000)
544 random1 = random.randint(1000,1998)
545 random2 = random.randint(1000,9999)
547 return "%d%d%d" %(nowTime,random1,random2)
# _get_file_ID_mix_string: each round advances `seed` with a linear
# congruential step, scales it to an index into the remaining alphabet, moves
# that character to `mixed` and removes it from `source`.
549 def _get_file_ID_mix_string(self, seed):
# NOTE(review): "\:" is not a recognised escape in a non-raw literal; Python
# keeps the backslash, and newer interpreters warn about it. Spelling it
# "\\:" yields the same characters.
551 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
553 for i in range(len(source)):
554 seed = (seed * 211 + 30031 ) % 65536
# NOTE(review): `seed / 65536` must be a true division for the index to be
# anything but 0 -- confirm `seed` is a float by this point (any conversion
# would be on an elided line).
555 index = math.floor(seed / 65536 * len(source) )
556 mixed.append(source[int(index)])
557 source.remove(source[int(index)])
558 #return ''.join(mixed)
# _get_file_id: `fileId` is a '*'-separated list of decimal indices into the
# mixed table; the looked-up characters are joined into the real id.
561 def _get_file_id(self, fileId, seed):
562 mixed = self._get_file_ID_mix_string(seed)
563 ids = fileId.split('*')
567 realId.append(mixed[int(ch)])
568 return ''.join(realId)
570 def _real_extract(self, url):
571 mobj = re.match(self._VALID_URL, url)
573 raise ExtractorError(u'Invalid URL: %s' % url)
574 video_id = mobj.group('ID')
576 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
578 jsondata = self._download_webpage(info_url, video_id)
580 self.report_extraction(video_id)
582 config = json.loads(jsondata)
# Everything is read from the first element of config['data'].
584 video_title = config['data'][0]['title']
585 seed = config['data'][0]['seed']
# NOTE(review): `format` shadows the builtin of the same name.
587 format = self._downloader.params.get('format', None)
588 supported_format = list(config['data'][0]['streamfileids'].keys())
590 if format is None or format == 'best':
591 if 'hd2' in supported_format:
596 elif format == 'worst':
# Obfuscated file id and per-segment keys for the chosen format.
604 fileid = config['data'][0]['streamfileids'][format]
605 keys = [s['k'] for s in config['data'][0]['segs'][format]]
606 except (UnicodeDecodeError, ValueError, KeyError):
607 raise ExtractorError(u'Unable to extract info section')
610 sid = self._gen_sid()
611 fileid = self._get_file_id(fileid, seed)
# NOTE(review): the two comments below disagree with the code -- the slices
# used are fileid[0:8] and fileid[10:], i.e. the characters at 0-based
# positions 8 and 9 are replaced by the two-digit hex segment index.
613 #column 8,9 of fileid represent the segment number
614 #fileid[7:9] should be changed
615 for index, key in enumerate(keys):
617 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
618 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
# Each segment gets its own id: "<video_id>_partNN" (decimal, zero padded).
621 'id': '%s_part%02d' % (video_id, index),
625 'title': video_title,
628 files_info.append(info)
# XNXXIE: pulls the flv URL, title and thumbnail out of a video.xnxx.com page
# using the three class-level patterns below.
# NOTE(review): the embedded line numbers skip, so the URL guard, the result
# dict opener and the return appear to be elided from this view.
633 class XNXXIE(InfoExtractor):
634 """Information extractor for xnxx.com"""
# Group 1 is the numeric video id, group 2 the rest of the path.
636 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping patterns; each captures the wanted value in group 1.
638 VIDEO_URL_RE = r'flv_url=(.*?)&'
639 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
640 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
642 def _real_extract(self, url):
643 mobj = re.match(self._VALID_URL, url)
645 raise ExtractorError(u'Invalid URL: %s' % url)
646 video_id = mobj.group(1)
648 # Get webpage content
649 webpage = self._download_webpage(url, video_id)
# The video URL is percent-encoded in the page and is unquoted before use.
651 video_url = self._search_regex(self.VIDEO_URL_RE,
652 webpage, u'video URL')
653 video_url = compat_urllib_parse.unquote(video_url)
655 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
# A missing thumbnail is not fatal. Unlike the video URL, the value is used
# as found (no unquote).
658 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
659 webpage, u'thumbnail', fatal=False)
666 'title': video_title,
668 'thumbnail': video_thumbnail,
# NBAIE: derives the download URL directly from the path of an nba.com video
# URL and scrapes only the title and description from the page.
# NOTE(review): the embedded line numbers skip, so the URL guard, the result
# dict opener and the return appear to be elided from this view.
674 class NBAIE(InfoExtractor):
# Group 1 is the path after "video", starting with '/', without a trailing
# "/index.html" or query string.
675 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
678 def _real_extract(self, url):
679 mobj = re.match(self._VALID_URL, url)
681 raise ExtractorError(u'Invalid URL: %s' % url)
683 video_id = mobj.group(1)
685 webpage = self._download_webpage(url, video_id)
# The media URL is built from a fixed CDN prefix, the matched path and a
# fixed 1280x720 mp4 suffix; it is not read from the page.
687 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# The reported id is the last path component. The title falls back to it when
# no og:title is found, and a "NBA.com: " prefix is stripped from the result.
689 shortened_video_id = video_id.rpartition('/')[2]
690 title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
691 webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
693 # It isn't there in the HTML it returns to us
694 # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
# The description meta tag may carry its text in `content` or in `value`.
696 description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
699 'id': shortened_video_id,
703 # 'uploader_date': uploader_date,
704 'description': description,
# JustinTVIE: handles three URL shapes on justin.tv / twitch.tv, selected by
# which named group of _VALID_URL matched:
#   - `channelid`: the channel archive listing, fetched from api.justin.tv
#     page by page using offset/limit (_JUSTIN_PAGE_LIMIT items per page);
#   - `chapterid`: the chapter page is scraped for PP.archive_id, the
#     by_chapter XML is searched for the matching <archive>, its
#     video_file_url becomes the download, and title/thumbnail/description/
#     uploader come from the kraken videos JSON;
#   - `videoid`: a single broadcast, via broadcast/by_archive/<id>.json.
# _parse_page downloads one JSON page, raises ExtractorError when the decoded
# value is not a list (using its 'error' text), and returns
# (number of items in the response, list of built entries).
# For chapters a warning is emitted that the whole file is downloaded; the
# bracket_start / bracket_end values are only reported, not applied.
# NOTE(review): `_parse_page` compares `type(response) != list`; isinstance
# is the usual spelling. Its `video_id` parameter is also reassigned inside
# the clip loop.
# NOTE(review): the embedded line numbers skip, so guards, loop headers, dict
# openers, returns and the closing of the verbose _VALID_URL literal appear
# to be elided from this view. Comments are kept above the class for that
# reason.
708 class JustinTVIE(InfoExtractor):
709 """Information extractor for justin.tv and twitch.tv"""
710 # TODO: One broadcast may be split into multiple videos. The key
711 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
712 # starts at 1 and increases. Can we treat all parts as one video?
714 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
716 (?P<channelid>[^/]+)|
717 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
718 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
722 _JUSTIN_PAGE_LIMIT = 100
723 IE_NAME = u'justin.tv'
725 def report_download_page(self, channel, offset):
726 """Report attempt to download a single page of videos."""
727 self.to_screen(u'%s: Downloading video information from %d to %d' %
728 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
730 # Return count of items, list of *valid* items
731 def _parse_page(self, url, video_id):
732 webpage = self._download_webpage(url, video_id,
733 u'Downloading video info JSON',
734 u'unable to download video info JSON')
736 response = json.loads(webpage)
737 if type(response) != list:
738 error_text = response.get('error', 'unknown error')
739 raise ExtractorError(u'Justin.tv API: %s' % error_text)
741 for clip in response:
742 video_url = clip['video_file_url']
744 video_extension = os.path.splitext(video_url)[1][1:]
745 video_date = re.sub('-', '', clip['start_time'][:10])
746 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
747 video_id = clip['id']
748 video_title = clip.get('title', video_id)
752 'title': video_title,
753 'uploader': clip.get('channel_name', video_uploader_id),
754 'uploader_id': video_uploader_id,
755 'upload_date': video_date,
756 'ext': video_extension,
758 return (len(response), info)
760 def _real_extract(self, url):
761 mobj = re.match(self._VALID_URL, url)
763 raise ExtractorError(u'invalid URL: %s' % url)
765 api_base = 'http://api.justin.tv'
767 if mobj.group('channelid'):
769 video_id = mobj.group('channelid')
770 api = api_base + '/channel/archives/%s.json' % video_id
771 elif mobj.group('chapterid'):
772 chapter_id = mobj.group('chapterid')
774 webpage = self._download_webpage(url, chapter_id)
775 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
777 raise ExtractorError(u'Cannot find archive of a chapter')
778 archive_id = m.group(1)
780 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
781 chapter_info_xml = self._download_webpage(api, chapter_id,
782 note=u'Downloading chapter information',
783 errnote=u'Chapter information download failed')
784 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
785 for a in doc.findall('.//archive'):
786 if archive_id == a.find('./id').text:
789 raise ExtractorError(u'Could not find chapter in chapter information')
791 video_url = a.find('./video_file_url').text
792 video_ext = video_url.rpartition('.')[2] or u'flv'
794 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
795 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
796 note='Downloading chapter metadata',
797 errnote='Download of chapter metadata failed')
798 chapter_info = json.loads(chapter_info_json)
800 bracket_start = int(doc.find('.//bracket_start').text)
801 bracket_end = int(doc.find('.//bracket_end').text)
803 # TODO determine start (and probably fix up file)
804 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
805 #video_url += u'?start=' + TODO:start_timestamp
806 # bracket_start is 13290, but we want 51670615
807 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
808 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
811 'id': u'c' + chapter_id,
814 'title': chapter_info['title'],
815 'thumbnail': chapter_info['preview'],
816 'description': chapter_info['description'],
817 'uploader': chapter_info['channel']['display_name'],
818 'uploader_id': chapter_info['channel']['name'],
822 video_id = mobj.group('videoid')
823 api = api_base + '/broadcast/by_archive/%s.json' % video_id
825 self.report_extraction(video_id)
829 limit = self._JUSTIN_PAGE_LIMIT
832 self.report_download_page(video_id, offset)
833 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
834 page_count, page_info = self._parse_page(page_url, video_id)
835 info.extend(page_info)
836 if not paged or page_count != limit:
# FunnyOrDieIE: scrapes the media URL, title and description from a
# funnyordie.com video page.
# NOTE(review): the embedded line numbers skip, so the URL guard, the result
# dict opener and the return appear to be elided from this view.
841 class FunnyOrDieIE(InfoExtractor):
# Named group `id` is the hexadecimal video id.
842 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
844 def _real_extract(self, url):
845 mobj = re.match(self._VALID_URL, url)
847 raise ExtractorError(u'invalid URL: %s' % url)
849 video_id = mobj.group('id')
850 webpage = self._download_webpage(url, video_id)
# The pattern skips the first <source> inside <video> and captures the `src`
# of the second one.
852 video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
853 webpage, u'video URL', flags=re.DOTALL)
# Two title patterns are supplied as a tuple: the player page <h1> and the
# document <title>.
855 title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
856 r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
# The description is optional (fatal=False).
858 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
859 webpage, u'description', fatal=False, flags=re.DOTALL)
866 'description': video_description,
# SteamIE: turns a store.steampowered.com video/app URL into a playlist of
# the game's trailers.
#   - _VALID_URL is a verbose-mode pattern, so `suitable` and `_real_extract`
#     both pass re.VERBOSE explicitly when matching.
#   - The video page for the game id is downloaded; if it contains the
#     birth-date prompt, the age-check URL (fixed date 1 January 1970) is
#     requested instead.
#   - Three independent scans of the page (movie entries, <span class="title">
#     names, movie_thumb images) are combined position by position with zip.
#   - The result is a single playlist built from the collected videos, the
#     game id and the page header as title.
# NOTE(review): zip stops at the shortest of the three scans, so a page with
# a missing title or thumbnail silently drops or misaligns entries.
# NOTE(review): `m` is used without a None check in the visible lines.
# NOTE(review): the embedded line numbers skip, so parts of the verbose
# _VALID_URL literal (including its closing quotes and the `gameID` group),
# a decorator, guards and dict openers appear to be elided from this view.
# Comments are kept above the class for that reason.
870 class SteamIE(InfoExtractor):
871 _VALID_URL = r"""http://store\.steampowered\.com/
873 (?P<urltype>video|app)/ #If the page is only for videos or for a game
875 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
877 _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
878 _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
881 def suitable(cls, url):
882 """Receives a URL and returns True if suitable for this IE."""
883 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
885 def _real_extract(self, url):
886 m = re.match(self._VALID_URL, url, re.VERBOSE)
887 gameID = m.group('gameID')
889 videourl = self._VIDEO_PAGE_TEMPLATE % gameID
890 webpage = self._download_webpage(videourl, gameID)
892 if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
893 videourl = self._AGECHECK_TEMPLATE % gameID
894 self.report_age_confirmation()
895 webpage = self._download_webpage(videourl, gameID)
897 self.report_extraction(gameID)
898 game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
899 webpage, 'game title')
901 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
902 mweb = re.finditer(urlRE, webpage)
903 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
904 titles = re.finditer(namesRE, webpage)
905 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
906 thumbs = re.finditer(thumbsRE, webpage)
908 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
909 video_id = vid.group('videoID')
910 title = vtitle.group('videoName')
911 video_url = vid.group('videoURL')
912 video_thumb = thumb.group('thumbnail')
914 raise ExtractorError(u'Cannot find video url for %s' % video_id)
919 'title': unescapeHTML(title),
920 'thumbnail': video_thumb
923 return [self.playlist_result(videos, gameID, game_title)]
# UstreamIE: builds the download URL for a recorded ustream.tv video from its
# numeric id and scrapes title, uploader and thumbnail from the page.
# NOTE(review): the embedded line numbers skip, so the result dict opener,
# the return and the continuation of the title lookup appear to be elided
# from this view.
925 class UstreamIE(InfoExtractor):
# Named group `videoID` is the numeric recording id.
926 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
929 def _real_extract(self, url):
# NOTE(review): `m` is used without a None check in the visible lines.
930 m = re.match(self._VALID_URL, url)
931 video_id = m.group('videoID')
# The media URL is a fixed CDN prefix plus the id; it is not read from the
# page.
933 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
934 webpage = self._download_webpage(url, video_id)
936 self.report_extraction(video_id)
# NOTE(review): `.+` is greedy, so the capture runs to the last '"' the
# pattern can reach on the matched line.
938 video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
# Uploader and thumbnail are optional (fatal=False).
941 uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
942 webpage, u'uploader', fatal=False, flags=re.DOTALL)
944 thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
945 webpage, u'thumbnail', fatal=False)
951 'title': video_title,
952 'uploader': uploader,
953 'thumbnail': thumbnail,
# WorldStarHipHopIE: scrapes the media URL, title and thumbnail from a
# worldstarhiphop.com / worldstarcandy.com video page.
# NOTE(review): the embedded line numbers skip, so the bodies of the
# extension branch, the conditions around the "candy" title fallback, the
# result dict opener and the return appear to be elided from this view.
957 class WorldStarHipHopIE(InfoExtractor):
# Named group `id` is everything after "v=" in the query string.
958 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
959 IE_NAME = u'WorldStarHipHop'
961 def _real_extract(self, url):
# NOTE(review): `m` is used without a None check in the visible lines.
962 m = re.match(self._VALID_URL, url)
963 video_id = m.group('id')
965 webpage_src = self._download_webpage(url, video_id)
# The media URL is the "file" variable passed to the flash player setup.
967 video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
968 webpage_src, u'video URL')
# Substring test on the whole URL; the branch bodies are not visible here.
970 if 'mp4' in video_url:
975 video_title = self._html_search_regex(r"<title>(.*)</title>",
976 webpage_src, u'title')
978 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
979 thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
980 webpage_src, u'thumbnail', fatal=False)
# Alternative title source: the text before </span> on a "candytitles" line.
983 _title = r"""candytitles.*>(.*)</span>"""
984 mobj = re.search(_title, webpage_src)
986 video_title = mobj.group(1)
991 'title' : video_title,
992 'thumbnail' : thumbnail,
# RBMARadioIE: reads the `gon.show` JSON object embedded in an rbmaradio.com
# show page and maps its fields onto the result dictionary.
# NOTE(review): the embedded line numbers skip, so the `try:` line, the
# result dict opener and the return appear to be elided from this view.
997 class RBMARadioIE(InfoExtractor):
# Named group `videoID` is the show slug (last path component).
998 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
1000 def _real_extract(self, url):
# NOTE(review): `m` is used without a None check in the visible lines.
1001 m = re.match(self._VALID_URL, url)
1002 video_id = m.group('videoID')
1004 webpage = self._download_webpage(url, video_id)
# With re.MULTILINE, `$` anchors at a line end, so the capture is the rest of
# the line after "gon.show=" up to the final ';'.
1006 json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
1007 webpage, u'json data', flags=re.MULTILINE)
1010 data = json.loads(json_data)
1011 except ValueError as e:
# NOTE(review): other extractors in this file format exceptions with
# compat_str; plain str() is used here.
1012 raise ExtractorError(u'Invalid JSON: ' + str(e))
# A fixed "&cbr=256" parameter is appended to the stream URL (presumably a
# bitrate selector -- confirm). The extension is taken from the URL path so
# the query string does not interfere.
1014 video_url = data['akamai_url'] + '&cbr=256'
1015 url_parts = compat_urllib_parse_urlparse(video_url)
1016 video_ext = url_parts.path.rpartition('.')[2]
# Only 'title' (and 'akamai_url' above) are required keys; the rest use
# .get() and nested lookups default to an empty dict.
1021 'title': data['title'],
1022 'description': data.get('teaser_text'),
1023 'location': data.get('country_of_origin'),
1024 'uploader': data.get('host', {}).get('name'),
1025 'uploader_id': data.get('host', {}).get('slug'),
1026 'thumbnail': data.get('image', {}).get('large_url_2x'),
1027 'duration': data.get('duration'),
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    # Named groups: `videoid` (digits) and `title` (URL slug).
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for entry in formats:
            print(u'%s\t\t%s' % (entry['ext'], entry['format']))

    def _specific(self, req_format, formats):
        """Return the first entry whose 'format' equals req_format, or None."""
        for entry in formats:
            if entry['format'] == req_format:
                return entry
        return None

    def _real_extract(self, url):
        """Extract every download link offered on a youporn watch page.

        Metadata comes from the `new Video(...)` JSON literal in the page;
        the links come from the "downloadList" element. Which entries are
        returned depends on the downloader's 'format' parameter ('best' or
        unset: first link, 'worst': last link, '-1'/'all': every link,
        anything else: exact match on the "<size>-<bitrate>" label).

        Returns a list of info dictionaries, or None after printing the
        format table when 'listformats' is set.
        Raises ExtractorError for an invalid URL, unusable page data or an
        unavailable requested format.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The age gate is bypassed with a cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError as err:
            # The exception has to be converted to text first: adding the
            # exception object itself to a string raised TypeError and hid
            # the real problem.
            raise ExtractorError(u'Missing JSON parameter: ' + compat_str(err))

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The fifth path component is "<size>_<bitrate>_<id>"; the label
            # is "<size>-<bitrate>".
            format_label = "-".join(path.split('/')[4].split('_')[:2])

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format_label,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            chosen = self._specific(req_format, formats)
            if chosen is None:
                raise ExtractorError(u'Requested format not available')
            return [chosen]
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`.

        Raises ExtractorError if `url` does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        # The title comes from the URL itself, not from the page.
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        # Get the upload date (optional: a miss is not fatal)
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': upload_date,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
        }
        return [info]
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`.

        The title is read from the video page; the media URL is read from
        the embed page that the video page links to.

        Raises ExtractorError if `url` does not match _VALID_URL or the
        embed page link cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        # From here on the numeric id found in the embed URL is used.
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # Get the video URL
        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        info = {
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }
        return [info]
class EightTracksIE(InfoExtractor):
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Return a list with one info dict per track of the mix at `url`.

        Tracks are fetched one at a time from the play/next API, using a
        random session number, until the API reports the last track.

        Raises ExtractorError if `url` does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            # The API flags the final track; stop before requesting another.
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
class KeekIE(InfoExtractor):
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`.

        The media and thumbnail URLs are built directly from the video id;
        only the title and uploader are read from the page.

        Raises ExtractorError if `url` does not match _VALID_URL.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoID')

        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }
        return [info]
class TEDIE(InfoExtractor):
    # Verbose pattern: must always be matched with re.VERBOSE.
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to talk or playlist extraction depending on `url`.

        Returns a one-element list in both cases.
        Raises ExtractorError if `url` does not match _VALID_URL.
        """
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        playlist_id = m.group('playlist_id')
        name = m.group('name')
        self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
        return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        playlist_entries = []
        # The video matches are zipped in only to bound the number of
        # entries; each entry itself is built from the talk link alone.
        for _m_video, m_name in zip(m_videos,m_names):
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
                                        webpage, 'title')
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                    webpage, 'json data')
        talk_details = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                                       webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                                       webpage, 'thumbnail')
        info = {
            'id': talk_details['id'],
            # The last entry of htmlStreams is the one that gets used.
            'url': talk_details['htmlStreams'][-1]['file'],
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'description': desc,
        }
        return info
class MySpassIE(InfoExtractor):
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`.

        The video id is taken from the URL path and used to fetch an XML
        metadata document, from which all other fields are read.

        Raises ExtractorError if the metadata has no `url_flv` or `title`.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text

        # Without an explicit format id, fall back to the file extension.
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            video_format = extension
        else:
            video_format = format_id_el.text

        # Description and thumbnail are optional.
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None

        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
class SpiegelIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`.

        The title is read from the page; file name and duration are read
        from the last child of the per-video XML document.

        Raises ExtractorError if `url` does not match _VALID_URL.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            webpage, u'title')

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Only the last child element of the document is used.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }
        return [info]
class LiveLeakIE(InfoExtractor):

    # The scheme was written 'http?://', which makes the 'p' optional and
    # can never match 'https://'; 'https?://' accepts both schemes.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`.

        Raises ExtractorError if `url` does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # Drop the site prefix from the og:title value.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }

        return [info]
class TumblrIE(InfoExtractor):
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`.

        The URL is first rewritten to the blog's /post/ form, and the media
        URL is then searched for in that page.

        Raises ExtractorError if `url` does not match _VALID_URL or no
        video source is found in the page.
        """
        m_url = re.match(self._VALID_URL, url)
        if m_url is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The blog name is free text from the URL; escape it so that it is
        # matched literally inside the pattern.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (re.escape(blog), video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'ext': ext
        }]
class BandcampIE(InfoExtractor):
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for a free track.

        Raises ExtractorError if `url` does not match _VALID_URL, if the
        page offers no free download, or if any of the intermediate pages
        lacks the data this extractor looks for.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)

        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')
        download_link = m_download.group(1)

        m_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                         webpage, re.MULTILINE|re.DOTALL)
        if m_id is None:
            raise ExtractorError(u'Unable to extract track id')
        track_id = m_id.group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        m_items = re.search(r'items: (.*?),$',
                            download_webpage, re.MULTILINE)
        if m_items is None:
            raise ExtractorError(u'Unable to extract track information')
        info = json.loads(m_items.group(1))[0]

        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        if m_url is None:
            raise ExtractorError(u'Unable to parse download url')

        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        m_final = re.search(r'"retry_url":"(.*?)"', final_url_webpage)
        if m_final is None:
            raise ExtractorError(u'Unable to extract final download url')
        final_url = m_final.group(1)

        track_info = {
            'id': track_id,
            'title': info[u'title'],
            'ext': 'mp3',
            'url': final_url,
            'thumbnail': info[u'thumb_url'],
            'uploader': info[u'artist']
        }

        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self,url):
        """Return a one-element list holding the info dict for `url`.

        Raises ExtractorError if `url` does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')

        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        """Return a one-element list holding the info dict for `url`.

        Both the media URL and the title are read from the MRSS notice of
        the video rather than from the page at `url`.

        Raises ExtractorError if `url` does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`.

        The page is always fetched from the canonical
        http://www.howcast.com/videos/<id> address.

        Raises ExtractorError if `url` does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')

        # Description and thumbnail are optional.
        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`.

        The page is always fetched from the canonical
        https://vine.co/v/<id> address.

        Raises ExtractorError if `url` does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        # Thumbnail and uploader are optional.
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)

        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`.

        Three requests are chained: the photo page gives a secret, the
        first XML document gives a node id, and the second XML document
        gives the stream location.

        Raises ExtractorError if `url` does not match _VALID_URL or no
        stream is found in the second XML document.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        # Only the FULLPATH part is HTML-unescaped before joining.
        video_url = stream.group(1) + unescapeHTML(stream.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
        }]
class TeamcocoIE(InfoExtractor):
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`.

        The numeric video id is read from the page and then used to fetch
        an XML document that holds the media URL.

        Raises ExtractorError if `url` does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        # Thumbnail and description are optional.
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,
        }]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        """Return a one-element list holding the info dict for `url`.

        Raises ExtractorError if `url` does not match _VALID_URL or the
        media server/file pair cannot be found in the page. A missing
        upload date only produces a warning.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if not mobj.group('server'):
            # With an empty server, 'file' is itself a percent-encoded URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            # Concatenate into YYYYMMDD.
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`.

        The track page is requested with extra 'ax' and 'ts' query
        parameters; the cookie it sets is sent back when asking the serve
        endpoint for the final media URL.

        Raises ExtractorError if `url` does not match _VALID_URL or either
        response is not valid JSON.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        # From here on the id reported in the JSON replaces the URL one.
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id': track_id,
            'url': final_url,
            'ext': "mp3",
            'title': title,
            'artist': artist,
        }]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self,url):
        """Return a one-element list holding the info dict for `url`.

        The first page only holds a JavaScript redirect, which is followed
        by hand; the media and thumbnail URLs come from a separate POST
        request.

        Raises ExtractorError if `url` does not match _VALID_URL or the
        info request yields nothing.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        # The new location is appended to the URL that was actually served.
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # The response must hold exactly two '&'-separated key=value pairs:
        # first the media URL, then the thumbnail URL.
        final_url, thumbnail_url = [pair.split('=')[1] for pair in info_response.split('&')]

        return [{
            'id': video_id,
            'url': final_url,
            'ext': ext,
            'title': title,
            'thumbnail': thumbnail_url,
        }]
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # NOTE(review): only YoutubePlaylistIE, StanfordOpenClassroomIE and
    # WorldStarHipHopIE are visible in the reviewed listing; the other
    # entries and their order are reconstructed -- confirm against the
    # original file before merging.
    return [
        YoutubePlaylistIE(),
        YoutubeChannelIE(),
        YoutubeUserIE(),
        YoutubeSearchIE(),
        YoutubeIE(),
        MetacafeIE(),
        DailymotionIE(),
        GoogleSearchIE(),
        PhotobucketIE(),
        YahooIE(),
        YahooSearchIE(),
        DepositFilesIE(),
        FacebookIE(),
        BlipTVIE(),
        BlipTVUserIE(),
        VimeoIE(),
        MyVideoIE(),
        ComedyCentralIE(),
        EscapistIE(),
        CollegeHumorIE(),
        XVideosIE(),
        SoundcloudSetIE(),
        SoundcloudIE(),
        InfoQIE(),
        MixcloudIE(),
        StanfordOpenClassroomIE(),
        MTVIE(),
        YoukuIE(),
        XNXXIE(),
        YouJizzIE(),
        PornotubeIE(),
        YouPornIE(),
        GooglePlusIE(),
        ArteTvIE(),
        NBAIE(),
        WorldStarHipHopIE(),
        JustinTVIE(),
        FunnyOrDieIE(),
        SteamIE(),
        UstreamIE(),
        RBMARadioIE(),
        EightTracksIE(),
        KeekIE(),
        TEDIE(),
        MySpassIE(),
        SpiegelIE(),
        LiveLeakIE(),
        ARDIE(),
        ZDFIE(),
        TumblrIE(),
        BandcampIE(),
        RedTubeIE(),
        InaIE(),
        HowcastIE(),
        VineIE(),
        FlickrIE(),
        TeamcocoIE(),
        XHamsterIE(),
        HypemIE(),
        Vbox7IE(),
        GametrailersIE(),
        StatigramIE(),
        # GenericIE goes last: the first matching extractor handles the URL.
        GenericIE()
    ]
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name

    The class is looked up in this module's globals under the name
    ie_name + 'IE'; an unknown name raises KeyError.
    """
    return globals()[ie_name+'IE']