[xminus] Simplify and extend (#4302)

2014-11-25 09:54:54 +01:00
parent c3e74731c2
commit be64b5b098
3 changed files with 95 additions and 20 deletions
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -47,6 +47,7 @@ from youtube_dl.utils import (
    js_to_json,
    intlist_to_bytes,
    args_to_str,
+    parse_filesize,
 )


@@ -367,5 +368,14 @@ class TestUtil(unittest.TestCase):
            'foo ba/r -baz \'2 be\' \'\''
        )

+    def test_parse_filesize(self):
+        self.assertEqual(parse_filesize(None), None)
+        self.assertEqual(parse_filesize(''), None)
+        self.assertEqual(parse_filesize('91 B'), 91)
+        self.assertEqual(parse_filesize('foobar'), None)
+        self.assertEqual(parse_filesize('2 MiB'), 2097152)
+        self.assertEqual(parse_filesize('5 GB'), 5000000000)
+        self.assertEqual(parse_filesize('1.2Tb'), 1200000000000)
+
 if __name__ == '__main__':
    unittest.main()
--- a/youtube_dl/extractor/xminus.py
+++ b/youtube_dl/extractor/xminus.py
@@ -2,7 +2,14 @@
 from __future__ import unicode_literals

 from .common import InfoExtractor
-from ..utils import int_or_none
+from ..compat import (
+    compat_chr,
+    compat_ord,
+)
+from ..utils import (
+    int_or_none,
+    parse_filesize,
+)


 class XMinusIE(InfoExtractor):
@@ -15,39 +22,46 @@ class XMinusIE(InfoExtractor):
            'ext': 'mp3',
            'title': 'Леонид Агутин-Песенка шофера',
            'duration': 156,
+            'tbr': 320,
+            'filesize_approx': 5900000,
+            'view_count': int,
        }
    }

    def _real_extract(self, url):
        video_id = self._match_id(url)
-
-        # TODO more code goes here, for example ...
        webpage = self._download_webpage(url, video_id)
+
        artist = self._html_search_regex(
-            r'minus_track.artist="(.+?)"', webpage, 'artist')
+            r'minus_track\.artist="(.+?)"', webpage, 'artist')
        title = artist + '-' + self._html_search_regex(
-            r'minus_track.title="(.+?)"', webpage, 'title')
+            r'minus_track\.title="(.+?)"', webpage, 'title')
        duration = int_or_none(self._html_search_regex(
-            r'minus_track.dur_sec=\'([0-9]+?)\'', webpage, 'duration'))
+            r'minus_track\.dur_sec=\'([0-9]*?)\'',
+            webpage, 'duration', fatal=False))
+        filesize_approx = parse_filesize(self._html_search_regex(
+            r'<div class="filesize[^"]*"></div>\s*([0-9.]+\s*[a-zA-Z][bB])',
+            webpage, 'approximate filesize', fatal=False))
+        tbr = int_or_none(self._html_search_regex(
+            r'<div class="quality[^"]*"></div>\s*([0-9]+)\s*kbps',
+            webpage, 'bitrate', fatal=False))
+        view_count = int_or_none(self._html_search_regex(
+            r'<div class="quality.*?► ([0-9]+)',
+            webpage, 'view count', fatal=False))
+
        enc_token = self._html_search_regex(
            r'data-mt="(.*?)"', webpage, 'enc_token')
-        token = self._decode_token(enc_token)
-        url = 'http://x-minus.org/dwlf/{}/{}.mp3'.format(video_id, token)
+        token = ''.join(
+            c if pos == 3 else compat_chr(compat_ord(c) - 1)
+            for pos, c in enumerate(reversed(enc_token)))
+        video_url = 'http://x-minus.org/dwlf/%s/%s.mp3' % (video_id, token)

        return {
            'id': video_id,
            'title': title,
-            'url': url,
+            'url': video_url,
            'duration': duration,
+            'filesize_approx': filesize_approx,
+            'tbr': tbr,
+            'view_count': view_count,
        }
-
-    def _decode_token(self, enc_token):
-        token = ''
-        pos = 0
-        for c in reversed(enc_token):
-            if pos != 3:
-                token += chr(ord(c) - 1)
-            else:
-                token += c
-            pos += 1
-        return token
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1046,6 +1046,57 @@ def format_bytes(bytes):
    return '%.2f%s' % (converted, suffix)


+def parse_filesize(s):
+    if s is None:
+        return None
+
+    # The lower-case forms are of course incorrect and unofficial,
+    # but we support those too
+    _UNIT_TABLE = {
+        'B': 1,
+        'b': 1,
+        'KiB': 1024,
+        'KB': 1000,
+        'kB': 1024,
+        'Kb': 1000,
+        'MiB': 1024 ** 2,
+        'MB': 1000 ** 2,
+        'mB': 1024 ** 2,
+        'Mb': 1000 ** 2,
+        'GiB': 1024 ** 3,
+        'GB': 1000 ** 3,
+        'gB': 1024 ** 3,
+        'Gb': 1000 ** 3,
+        'TiB': 1024 ** 4,
+        'TB': 1000 ** 4,
+        'tB': 1024 ** 4,
+        'Tb': 1000 ** 4,
+        'PiB': 1024 ** 5,
+        'PB': 1000 ** 5,
+        'pB': 1024 ** 5,
+        'Pb': 1000 ** 5,
+        'EiB': 1024 ** 6,
+        'EB': 1000 ** 6,
+        'eB': 1024 ** 6,
+        'Eb': 1000 ** 6,
+        'ZiB': 1024 ** 7,
+        'ZB': 1000 ** 7,
+        'zB': 1024 ** 7,
+        'Zb': 1000 ** 7,
+        'YiB': 1024 ** 8,
+        'YB': 1000 ** 8,
+        'yB': 1024 ** 8,
+        'Yb': 1000 ** 8,
+    }
+
+    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
+    m = re.match(r'(?P<num>[0-9]+(?:\.[0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
+    if not m:
+        return None
+
+    return int(float(m.group('num')) * _UNIT_TABLE[m.group('unit')])
+
+
 def get_term_width():
    columns = compat_getenv('COLUMNS', None)
    if columns: