[youtube] Fix extraction.
[youtube-dl] / youtube_dl / extractor / radiojavan.py
index de90f92706b7eb2c443f6ba2272a630a445bf497..3f74f0c01ffd570dcf996d9c004070d80d50731e 100644 (file)
@@ -1,12 +1,17 @@
-# coding: utf-8
 from __future__ import unicode_literals
 
+import re
+
 from .common import InfoExtractor
-from ..utils import(
-    parse_duration,
-    str_to_int
+from ..utils import (
+    parse_resolution,
+    str_to_int,
+    unified_strdate,
+    urlencode_postdata,
+    urljoin,
 )
 
+
 class RadioJavanIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?radiojavan\.com/videos/video/(?P<id>[^/]+)/?'
     _TEST = {
@@ -16,56 +21,63 @@ class RadioJavanIE(InfoExtractor):
             'id': 'chaartaar-ashoobam',
             'ext': 'mp4',
             'title': 'Chaartaar - Ashoobam',
-            'description': 'Chaartaar - Ashoobam',
-            'thumbnail': 're:^https?://.*\.jpe?g$',
+            'thumbnail': r're:^https?://.*\.jpe?g$',
+            'upload_date': '20150215',
+            'view_count': int,
+            'like_count': int,
+            'dislike_count': int,
         }
     }
 
     def _real_extract(self, url):
-        display_id = self._match_id(url)
-        webpage = self._download_webpage(url, display_id)
-        urls = list()
-        prefix = 'https://media.rdjavan.com/media/music_video/'
+        video_id = self._match_id(url)
+
+        download_host = self._download_json(
+            'https://www.radiojavan.com/videos/video_host', video_id,
+            data=urlencode_postdata({'id': video_id}),
+            headers={
+                'Content-Type': 'application/x-www-form-urlencoded',
+                'Referer': url,
+            }).get('host', 'https://host1.rjmusicmedia.com')
 
-        video_url_480 = self._search_regex(
-            r'RJ\.video480p = \'([^\']+)\'', webpage, '480 video url', fatal= False)
-        video_url_720 = self._search_regex(
-            r'RJ\.video720p = \'([^\']+)\'', webpage, '720 video url', fatal= False)
-        video_url_1080 = self._search_regex(
-            r'RJ\.video1080p = \'([^\']+)\'', webpage, '1080 video url', fatal= False)
+        webpage = self._download_webpage(url, video_id)
 
-        if video_url_480:
-            urls.append({'url': prefix + video_url_480, 'format': '480p'})
-        if video_url_720:
-            urls.append({'url': prefix + video_url_720, 'format': '720p'})
-        if video_url_1080:
-            urls.append({'url': prefix + video_url_1080, 'format': '1080p'})
+        formats = []
+        for format_id, _, video_path in re.findall(
+                r'RJ\.video(?P<format_id>\d+[pPkK])\s*=\s*(["\'])(?P<url>(?:(?!\2).)+)\2',
+                webpage):
+            f = parse_resolution(format_id)
+            f.update({
+                'url': urljoin(download_host, video_path),
+                'format_id': format_id,
+            })
+            formats.append(f)
+        self._sort_formats(formats)
 
         title = self._og_search_title(webpage)
         thumbnail = self._og_search_thumbnail(webpage)
-        formats = [{
-            'url': url['url'],
-            'format': url['format']
-        } for url in urls]
 
-        likes = self._search_regex(
-            r'<span class="rating">([\d,]+)\s*likes</span>', webpage, 'Likes Count', fatal=False )
-        likes = likes.replace(',', '')
-        dislikes = self._search_regex(
-            r'<span class="rating">([\d,]+)\s*dislikes</span>', webpage, 'Dislikes Count', fatal=False )
-        dislikes = dislikes.replace(',', '')
+        upload_date = unified_strdate(self._search_regex(
+            r'class="date_added">Date added: ([^<]+)<',
+            webpage, 'upload date', fatal=False))
 
-        plays = self._search_regex(
-            r'views_publish[">\s]*<span[^>]+class="views">Plays: ([\d,]+)</span>', webpage, 'Play Count', fatal=False )
-        plays = plays.replace(',', '')
+        view_count = str_to_int(self._search_regex(
+            r'class="views">Plays: ([\d,]+)',
+            webpage, 'view count', fatal=False))
+        like_count = str_to_int(self._search_regex(
+            r'class="rating">([\d,]+) likes',
+            webpage, 'like count', fatal=False))
+        dislike_count = str_to_int(self._search_regex(
+            r'class="rating">([\d,]+) dislikes',
+            webpage, 'dislike count', fatal=False))
 
         return {
-            'formats': formats,
-            'id': display_id,
+            'id': video_id,
             'title': title,
-            'description': title, # no description provided in RadioJavan
             'thumbnail': thumbnail,
-            'like_count': str_to_int(likes),
-            'dislike_count': str_to_int(dislikes),
-            'viewCount': str_to_int(plays)
-        }
\ No newline at end of file
+            'upload_date': upload_date,
+            'view_count': view_count,
+            'like_count': like_count,
+            'dislike_count': dislike_count,
+            'formats': formats,
+        }