Merge remote-tracking branch 'dstftw/generic-webpage-unescape'
[youtube-dl] / youtube_dl / extractor / ninegag.py
1 from __future__ import unicode_literals
2
3 import re
4
5 from .common import InfoExtractor
6
7
class NineGagIE(InfoExtractor):
    """Information extractor for 9gag.tv video pages.

    The page itself is only scraped for metadata; the actual media is
    resolved by handing the embedded external id over to the Youtube
    extractor through a ``url_transparent`` result.
    """

    IE_NAME = '9gag'
    _VALID_URL = r'^https?://(?:www\.)?9gag\.tv/v/(?P<id>[0-9]+)'

    _TEST = {
        "url": "http://9gag.tv/v/1912",
        "info_dict": {
            "id": "1912",
            "ext": "mp4",
            "description": "This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. (Thanks, Dino!)",
            "title": "\"People Are Awesome 2013\" Is Absolutely Awesome",
            "view_count": int,
            "thumbnail": "re:^https?://",
        },
        'add_ie': ['Youtube']
    }

    def _real_extract(self, url):
        """Scrape a 9gag.tv video page and return its info dictionary.

        The numeric id is taken from the ``id`` group of ``_VALID_URL``.
        The returned dictionary is of type ``url_transparent`` and points
        at the Youtube extractor, with id, title, description, view count
        and thumbnail supplied from the 9gag page.
        """
        video_id = re.match(self._VALID_URL, url).group('id')
        page = self._download_webpage(url, video_id)

        # Mandatory: the external id stored on the video post container.
        external_id = self._html_search_regex(
            r'(?s)id="jsid-video-post-container".*?data-external-id="([^"]+)"',
            page, 'video ID')

        # Optional metadata: both lookups are non-fatal.
        caption = self._html_search_regex(
            r'(?s)<div class="video-caption">.*?<p>(.*?)</p>', page,
            'description', fatal=False)
        raw_views = self._html_search_regex(
            r'<p><b>([0-9][0-9,]*)</b> views</p>', page, 'view count',
            fatal=False)

        # The counter is rendered with thousands separators ("1,234").
        if raw_views is None:
            views = None
        else:
            views = int(raw_views.replace(',', ''))

        info = {
            '_type': 'url_transparent',
            'url': external_id,
            'ie_key': 'Youtube',
            'id': video_id,
            'title': self._og_search_title(page),
            'description': caption,
            'view_count': views,
            'thumbnail': self._og_search_thumbnail(page),
        }
        return info