Merge pull request #12906 from Tithen-Firion/clean-html-fix
author Yen Chi Hsuan <yan12125@gmail.com>
Sat, 29 Apr 2017 07:58:45 +0000 (15:58 +0800)
committer GitHub <noreply@github.com>
Sat, 29 Apr 2017 07:58:45 +0000 (15:58 +0800)
[utils] Fix inconsistent output of clean_html

ChangeLog
youtube_dl/downloader/external.py
youtube_dl/extractor/tvplayer.py
youtube_dl/extractor/xtube.py

index cd49ac42d4e8725340bddc679ebe73f4dfbb7e4b..aa75433252516c3831dac39868593a3e36e5b6a5 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+version <unreleased>
+
+Extractors
+* [xtube] Fix extraction for older FLV videos (#12734)
+
+
 version 2017.04.28
 
 Core
index e13cf547d10cbf472440c9f23d010a586b2c453c..e78169a0dbeb2ab5613db5a55cb6ad25cd14555a 100644 (file)
--- a/youtube_dl/downloader/external.py
+++ b/youtube_dl/downloader/external.py
@@ -29,7 +29,17 @@ class ExternalFD(FileDownloader):
         self.report_destination(filename)
         tmpfilename = self.temp_name(filename)
 
-        retval = self._call_downloader(tmpfilename, info_dict)
+        try:
+            retval = self._call_downloader(tmpfilename, info_dict)
+        except KeyboardInterrupt:
+            if not info_dict.get('is_live'):
+                raise
+            # Live stream downloading cancellation should be considered as
+            # correct and expected termination thus all postprocessing
+            # should take place
+            retval = 0
+            self.to_screen('[%s] Interrupted by user' % self.get_basename())
+
         if retval == 0:
             fsize = os.path.getsize(encodeFilename(tmpfilename))
             self.to_screen('\r[%s] Downloaded %s bytes' % (self.get_basename(), fsize))
index b6537141ae7c998267fa4dc7c9d72440c14d4590..ebde6053f16be20096bb1eed812ffb20f3c52dc2 100644 (file)
--- a/youtube_dl/extractor/tvplayer.py
+++ b/youtube_dl/extractor/tvplayer.py
@@ -2,9 +2,13 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..compat import (
+    compat_HTTPError,
+    compat_str,
+)
 from ..utils import (
     extract_attributes,
+    try_get,
     urlencode_postdata,
     ExtractorError,
 )
@@ -34,25 +38,32 @@ class TVPlayerIE(InfoExtractor):
             webpage, 'channel element'))
         title = current_channel['data-name']
 
-        resource_id = self._search_regex(
-            r'resourceId\s*=\s*"(\d+)"', webpage, 'resource id')
-        platform = self._search_regex(
-            r'platform\s*=\s*"([^"]+)"', webpage, 'platform')
+        resource_id = current_channel['data-id']
+
         token = self._search_regex(
-            r'token\s*=\s*"([^"]+)"', webpage, 'token', default='null')
-        validate = self._search_regex(
-            r'validate\s*=\s*"([^"]+)"', webpage, 'validate', default='null')
+            r'data-token=(["\'])(?P<token>(?!\1).+)\1', webpage,
+            'token', group='token')
+
+        context = self._download_json(
+            'https://tvplayer.com/watch/context', display_id,
+            'Downloading JSON context', query={
+                'resource': resource_id,
+                'nonce': token,
+            })
+
+        validate = context['validate']
+        platform = try_get(
+            context, lambda x: x['platform']['key'], compat_str) or 'firefox'
 
         try:
             response = self._download_json(
                 'http://api.tvplayer.com/api/v2/stream/live',
-                resource_id, headers={
+                display_id, 'Downloading JSON stream', headers={
                     'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                 }, data=urlencode_postdata({
+                    'id': resource_id,
                     'service': 1,
                     'platform': platform,
-                    'id': resource_id,
-                    'token': token,
                     'validate': validate,
                 }))['tvplayer']['response']
         except ExtractorError as e:
@@ -63,7 +74,7 @@ class TVPlayerIE(InfoExtractor):
                     '%s said: %s' % (self.IE_NAME, response['error']), expected=True)
             raise
 
-        formats = self._extract_m3u8_formats(response['stream'], resource_id, 'mp4')
+        formats = self._extract_m3u8_formats(response['stream'], display_id, 'mp4')
         self._sort_formats(formats)
 
         return {
index 5584674a061fc5a67bbb65bc0b58fc96e96eae3b..bea9b87ad4123f90bcf554d7115cfd35d431afe6 100644 (file)
--- a/youtube_dl/extractor/xtube.py
+++ b/youtube_dl/extractor/xtube.py
@@ -6,6 +6,7 @@ import re
 from .common import InfoExtractor
 from ..utils import (
     int_or_none,
+    js_to_json,
     orderedSet,
     parse_duration,
     sanitized_Request,
@@ -37,6 +38,22 @@ class XTubeIE(InfoExtractor):
             'comment_count': int,
             'age_limit': 18,
         }
+    }, {
+        # FLV videos with duplicated formats
+        'url': 'http://www.xtube.com/video-watch/A-Super-Run-Part-1-YT-9299752',
+        'md5': 'a406963eb349dd43692ec54631efd88b',
+        'info_dict': {
+            'id': '9299752',
+            'display_id': 'A-Super-Run-Part-1-YT',
+            'ext': 'flv',
+            'title': 'A Super Run - Part 1 (YT)',
+            'description': 'md5:ca0d47afff4a9b2942e4b41aa970fd93',
+            'uploader': 'tshirtguy59',
+            'duration': 579,
+            'view_count': int,
+            'comment_count': int,
+            'age_limit': 18,
+        },
     }, {
         # new URL schema
         'url': 'http://www.xtube.com/video-watch/strange-erotica-625837',
@@ -68,8 +85,9 @@ class XTubeIE(InfoExtractor):
             })
 
         sources = self._parse_json(self._search_regex(
-            r'(["\'])sources\1\s*:\s*(?P<sources>{.+?}),',
-            webpage, 'sources', group='sources'), video_id)
+            r'(["\'])?sources\1?\s*:\s*(?P<sources>{.+?}),',
+            webpage, 'sources', group='sources'), video_id,
+            transform_source=js_to_json)
 
         formats = []
         for format_id, format_url in sources.items():
@@ -78,6 +96,7 @@ class XTubeIE(InfoExtractor):
                 'format_id': format_id,
                 'height': int_or_none(format_id),
             })
+        self._remove_duplicate_formats(formats)
         self._sort_formats(formats)
 
         title = self._search_regex(