cc685e274cc5b9b15783cda6966ae66b0eb3f216
[youtube-dl] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bandcamp import BandcampIE
24 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
25 from .extractor.comedycentral import ComedyCentralIE
26 from .extractor.collegehumor import CollegeHumorIE
27 from .extractor.dailymotion import DailymotionIE
28 from .extractor.depositfiles import DepositFilesIE
29 from .extractor.eighttracks import EightTracksIE
30 from .extractor.escapist import EscapistIE
31 from .extractor.facebook import FacebookIE
32 from .extractor.funnyordie import FunnyOrDieIE
33 from .extractor.gametrailers import GametrailersIE
34 from .extractor.generic import GenericIE
35 from .extractor.googleplus import GooglePlusIE
36 from .extractor.googlesearch import GoogleSearchIE
37 from .extractor.howcast import HowcastIE
38 from .extractor.hypem import HypemIE
39 from .extractor.ina import InaIE
40 from .extractor.infoq import InfoQIE
41 from .extractor.justintv import JustinTVIE
42 from .extractor.keek import KeekIE
43 from .extractor.liveleak import LiveLeakIE
44 from .extractor.metacafe import MetacafeIE
45 from .extractor.mixcloud import MixcloudIE
46 from .extractor.mtv import MTVIE
47 from .extractor.myspass import MySpassIE
48 from .extractor.myvideo import MyVideoIE
49 from .extractor.nba import NBAIE
50 from .extractor.statigram import StatigramIE
51 from .extractor.photobucket import PhotobucketIE
52 from .extractor.pornotube import PornotubeIE
53 from .extractor.rbmaradio import RBMARadioIE
54 from .extractor.redtube import RedTubeIE
55 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
56 from .extractor.spiegel import SpiegelIE
57 from .extractor.stanfordoc import StanfordOpenClassroomIE
58 from .extractor.steam import SteamIE
59 from .extractor.ted import TEDIE
60 from .extractor.tumblr import TumblrIE
61 from .extractor.ustream import UstreamIE
62 from .extractor.vbox7 import Vbox7IE
63 from .extractor.vimeo import VimeoIE
64 from .extractor.vine import VineIE
65 from .extractor.worldstarhiphop import WorldStarHipHopIE
66 from .extractor.xnxx import XNXXIE
67 from .extractor.xvideos import XVideosIE
68 from .extractor.yahoo import YahooIE, YahooSearchIE
69 from .extractor.youjizz import YouJizzIE
70 from .extractor.youku import YoukuIE
71 from .extractor.youporn import YouPornIE
72 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
73 from .extractor.zdf import ZDFIE
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    # Matches flickr.com/photos/<uploader_id>/<id>; the scheme and the "www."
    # prefix are optional and anything after the numeric id is ignored.
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Extract the stream URL and metadata of a Flickr video.

        The photo page is downloaded to read its ``photo_secret``; that secret
        is used to request a first XML document carrying a node id, which in
        turn is used to request the playlist XML that names the stream.

        Returns a one-element list holding a dict with the keys ``id``,
        ``url``, ``ext``, ``title``, ``description``, ``thumbnail`` and
        ``uploader_id``.

        Raises ExtractorError when ``url`` does not match ``_VALID_URL`` or
        when no ``<STREAM>`` entry is found in the playlist XML.
        """
        mobj = re.match(self._VALID_URL, url)
        # re.match returns None on a non-matching URL; fail with the same
        # explicit error TeamcocoIE uses instead of an AttributeError below.
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        # Rebuild a canonical page URL from the captured parts rather than
        # downloading whatever variant of the URL was passed in.
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        # The final URL is the APP attribute followed by the HTML-unescaped
        # FULLPATH attribute.
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        # The og: meta patterns accept a double- or single-quoted content
        # attribute, hence the two alternative capture groups.
        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        # NOTE(review): fatal=False presumably makes a missing description or
        # thumbnail non-fatal -- confirm against InfoExtractor.
        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
160
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com video pages"""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Extract the video URL and metadata of a teamcoco.com video page.

        The page is downloaded to find the numeric video id and the og:
        metadata; the id is then used to fetch an XML document from which
        the "high" file entry is taken as the video URL.

        Returns a one-element list holding a dict with the keys ``id``,
        ``url``, ``ext``, ``title``, ``thumbnail`` and ``description``.

        Raises ExtractorError when ``url`` does not match ``_VALID_URL``.
        """
        url_match = re.match(self._VALID_URL, url)
        if not url_match:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The part of the URL after /video/ labels the page download, since
        # the numeric id is only known once the page has been fetched.
        page_slug = url_match.group('url_title')
        page = self._download_webpage(url, page_slug)

        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"',
            page, u'video id')

        self.report_extraction(video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')

        thumbnail_url = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)

        description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        # Second request: the per-video XML document that lists the files.
        xml_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        xml_doc = self._download_webpage(xml_url, video_id, 'Downloading data webpage')

        media_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>',
            xml_doc, u'video URL')

        info = {
            'id':          video_id,
            'url':         media_url,
            'ext':         'mp4',
            'title':       title,
            'thumbnail':   thumbnail_url,
            'description': description,
        }
        return [info]
199
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # Matches xhamster.com/movies/<id>/<anything>.html; the scheme and the
    # "www." prefix are optional. The dot after "www" is escaped so that it
    # only matches a literal dot.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Extract the media URL and metadata of an xHamster movie page.

        Returns a one-element list holding a dict with the keys ``id``,
        ``url``, ``ext``, ``title``, ``upload_date``, ``uploader_id`` and
        ``thumbnail``.

        Raises ExtractorError when ``url`` does not match ``_VALID_URL`` or
        when the 'srv'/'file' pair cannot be found in the page.
        """
        mobj = re.match(self._VALID_URL, url)
        # re.match returns None on a non-matching URL; fail with the same
        # explicit error TeamcocoIE uses instead of an AttributeError below.
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Only the numeric id is kept; the title part of the path is dropped.
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if not mobj.group('server'):
            # Empty server: 'file' itself is the (URL-quoted) media URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        # The extension is whatever follows the last dot of the media URL.
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        # The upload date is read from a hint='YYYY-MM-DD HH:MM:SS TZ'
        # attribute and stored as YYYYMMDD; a missing date only warns.
        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        # NOTE(review): default=u'anonymous' is presumably returned when no
        # uploader link is present -- confirm against InfoExtractor.
        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
251
252
253
254
255
def gen_extractors():
    """Build and return one fresh instance of each supported extractor.

    The sequence below is significant: the first extractor that matches a
    URL is the one that handles it, so GenericIE stays last.
    """
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    # Instantiate in declaration order so the priority order is preserved.
    return [extractor_class() for extractor_class in extractor_classes]
325
def get_info_extractor(ie_name):
    """Look up an info extractor class by its name without the 'IE' suffix.

    The name is suffixed with 'IE' and resolved among this module's
    globals; an unknown name raises KeyError.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]