nas-tools/app/utils/string_utils.py
HateBaozi 2ac0945cbd 百2
2023-02-16 16:58:17 +08:00

441 lines
13 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import bisect
import datetime
import hashlib
import random
import re
from urllib import parse
import dateparser
import dateutil.parser
import cn2an
from app.utils.exception_utils import ExceptionUtils
from app.utils.types import MediaType
class StringUtils:
    """
    Stateless string helpers: size/duration formatting, language detection,
    URL normalisation, search-keyword parsing and small type conversions.

    Every method is a ``@staticmethod``; the class is used as a namespace.
    """

    # Compiled once and shared by the language-detection helpers.
    _CHINESE_RE = re.compile(r'[\u4e00-\u9fff]')
    _JAPANESE_RE = re.compile(r'[\u3040-\u309F\u30A0-\u30FF]')
    _KOREAN_RE = re.compile(r'[\uAC00-\uD7FF]')

    @staticmethod
    def _contains(pattern, word):
        """
        Return True if ``pattern`` matches anywhere in ``word``.

        ``word`` may be a string or a list of strings (joined with spaces);
        empty/None input yields False.
        """
        if not word:
            return False
        if isinstance(word, list):
            word = " ".join(word)
        return bool(pattern.search(word))

    @staticmethod
    def num_filesize(text):
        """
        Convert a human-readable file size into a number of bytes.

        Accepts plain digits, or a number followed by a binary unit with or
        without the trailing "B"/"iB" (``"1.5GB"``, ``"2 MiB"``, ``"1.5G"``).
        Commas and spaces are ignored. All units are treated as powers of 1024.

        :param text: size text (non-strings are converted with ``str``)
        :return: size in bytes as int; 0 for empty or unparsable input
        """
        if not text:
            return 0
        if not isinstance(text, str):
            text = str(text)
        if text.isdigit():
            return int(text)
        text = text.replace(",", "").replace(" ", "").upper()
        size = re.sub(r"[KMGTPI]*B?", "", text, flags=re.IGNORECASE)
        try:
            size = float(size)
            # Largest unit first; a bare prefix such as "1.5G" counts too.
            for power, prefix in ((5, "P"), (4, "T"), (3, "G"), (2, "M"), (1, "K")):
                if prefix in text:
                    size *= 1024 ** power
                    break
            # round() raises on nan/inf, so it stays inside the try.
            return round(size)
        except Exception as e:
            ExceptionUtils.exception_traceback(e)
            return 0

    @staticmethod
    def str_timelong(time_sec):
        """
        Convert a number of seconds into a short duration description.

        The value is expressed in the largest fitting unit (seconds, minutes,
        hours or days) and rounded to an integer.

        :param time_sec: seconds as int/float, or anything ``float()`` accepts
        :return: description such as ``"2分"``; ``""`` when not convertible
        """
        if not isinstance(time_sec, (int, float)):
            try:
                time_sec = float(time_sec)
            except Exception as e:
                ExceptionUtils.exception_traceback(e)
                return ""
        # (threshold - 1, unit label); the divisor is threshold, i.e. b + 1.
        d = [(0, '秒'), (60 - 1, '分'), (3600 - 1, '小时'), (86400 - 1, '天')]
        s = [x[0] for x in d]
        index = bisect.bisect_left(s, time_sec) - 1
        if index == -1:
            # Zero or negative: no unit applies, return the raw value.
            return str(time_sec)
        b, u = d[index]
        return str(round(time_sec / (b + 1))) + u

    @staticmethod
    def is_chinese(word):
        """
        Return True if ``word`` (string or list of strings) contains at least
        one character in the CJK range U+4E00..U+9FFF.
        """
        return StringUtils._contains(StringUtils._CHINESE_RE, word)

    @staticmethod
    def is_japanese(word):
        """
        Return True if ``word`` (string or list of strings) contains at least
        one Hiragana or Katakana character.
        """
        return StringUtils._contains(StringUtils._JAPANESE_RE, word)

    @staticmethod
    def is_korean(word):
        """
        Return True if ``word`` (string or list of strings) contains at least
        one character in the range U+AC00..U+D7FF.
        """
        return StringUtils._contains(StringUtils._KOREAN_RE, word)

    @staticmethod
    def is_all_chinese(word):
        """
        Return True if every character of ``word`` is either a space or in
        the CJK range U+4E00..U+9FFF. An empty string yields True.
        """
        return all(ch == ' ' or '\u4e00' <= ch <= '\u9fff' for ch in word)

    @staticmethod
    def xstr(s):
        """
        Return ``s`` unchanged when it is truthy, otherwise an empty string.
        """
        return s if s else ''

    @staticmethod
    def str_sql(in_str):
        """
        Return ``str(in_str)``, or an empty string for falsy input.
        """
        return "" if not in_str else str(in_str)

    @staticmethod
    def str_int(text):
        """
        Convert web text such as ``"1,234"`` to int.

        Surrounding whitespace and thousands separators are ignored.

        :param text: text (or number) to convert
        :return: the int value, or 0 when the input is None or not convertible
        """
        if text is None:
            return 0
        try:
            # str() lets numeric input through instead of failing on .strip().
            return int(str(text).strip().replace(',', ''))
        except Exception as e:
            ExceptionUtils.exception_traceback(e)
            return 0

    @staticmethod
    def str_float(text):
        """
        Convert web text such as ``"1,234.5"`` to float.

        Surrounding whitespace and thousands separators are ignored.

        :param text: text (or number) to convert
        :return: the float value, or 0.0 when the input is None or not
                 convertible
        """
        if text is None:
            return 0.0
        try:
            return float(str(text).strip().replace(',', ''))
        except Exception as e:
            ExceptionUtils.exception_traceback(e)
            return 0.0

    @staticmethod
    def handler_special_chars(text, replace_word="", allow_space=False):
        """
        Strip punctuation and zero-width characters from ``text``.

        :param text: a string, or a list of strings processed element-wise
        :param replace_word: replacement for each punctuation character
        :param allow_space: when True, whitespace runs collapse to a single
                            space and the result is trimmed; when False, all
                            whitespace is removed
        :return: cleaned string (or list of cleaned strings); falsy input is
                 returned unchanged
        """
        # Punctuation characters to replace.
        CONVERT_EMPTY_CHARS = r"[、.。,,·:;!'\"“”()\[\]【】「」\-——\+\|\\_/&#~]"
        if not text:
            return text
        if isinstance(text, list):
            # Forward the caller's options to every element.
            return [StringUtils.handler_special_chars(x, replace_word, allow_space)
                    for x in text]
        text = re.sub(r"[\u200B-\u200D\uFEFF]",
                      "",
                      re.sub(r"%s" % CONVERT_EMPTY_CHARS, replace_word, text),
                      flags=re.IGNORECASE)
        if not allow_space:
            return re.sub(r"\s+", "", text)
        return re.sub(r"\s+", " ", text).strip()

    @staticmethod
    def str_filesize(size, pre=2):
        """
        Format a byte count as a size description with a unit.

        Numeric input is scaled to K/M/G/T (powers of 1024) and rounded to
        ``pre`` decimals. Input that already carries a unit is returned with
        whitespace and the "B"/"iB" suffix removed.

        :param size: byte count, or text that may already contain a unit
        :param pre: number of decimals for the scaled value
        :return: formatted text; falsy input is returned unchanged; ``""``
                 when a numeric-looking value cannot be converted
        """
        if not size:
            return size
        # flags must be passed by keyword: the 4th positional arg is count.
        size = re.sub(r"\s|B|iB", "", str(size), flags=re.I)
        if size.replace(".", "").isdigit():
            try:
                size = float(size)
                d = [(1024 - 1, 'K'), (1024 ** 2 - 1, 'M'), (1024 ** 3 - 1, 'G'), (1024 ** 4 - 1, 'T')]
                s = [x[0] for x in d]
                index = bisect.bisect_left(s, size) - 1
                if index == -1:
                    return str(size) + "B"
                b, u = d[index]
                return str(round(size / (b + 1), pre)) + u
            except Exception as e:
                ExceptionUtils.exception_traceback(e)
                return ""
        if re.findall(r"[KMGTP]", size, re.I):
            return size
        return size + "B"

    @staticmethod
    def url_equal(url1, url2):
        """
        Return True if two addresses point at the same site.

        Scheme, path, letter case and "www." are ignored; only the host part
        is compared. Returns False if either argument is empty.
        """
        if not url1 or not url2:
            return False
        return StringUtils.get_url_domain(url1) == StringUtils.get_url_domain(url2)

    @staticmethod
    def get_url_netloc(url):
        """
        Split a URL into its scheme and network location.

        :return: ``(scheme, netloc)``; ``("", "")`` for empty input. Input
                 not starting with "http" is returned whole as the netloc
                 with scheme "http".
        """
        if not url:
            return "", ""
        if not url.startswith("http"):
            return "http", url
        addr = parse.urlparse(url)
        return addr.scheme, addr.netloc

    @staticmethod
    def get_url_domain(url):
        """
        Return the lower-cased host part of a URL with "www." removed, or an
        empty string when there is none.
        """
        if not url:
            return ""
        _, netloc = StringUtils.get_url_netloc(url)
        if netloc:
            return netloc.lower().replace("www.", "")
        return ""

    @staticmethod
    def get_base_url(url):
        """
        Return the root address ``scheme://netloc`` of a URL, or an empty
        string for empty input.
        """
        if not url:
            return ""
        scheme, netloc = StringUtils.get_url_netloc(url)
        return f"{scheme}://{netloc}"

    @staticmethod
    def clear_file_name(name):
        """
        Remove the characters ``* ? \\ / " < > ~ :`` from a file name.

        :return: the cleaned name, or None for empty input
        """
        if not name:
            return None
        return re.sub(r"[*?\\/\"<>~]", "", name, flags=re.IGNORECASE).replace(":", "")

    @staticmethod
    def get_keyword_from_string(content):
        """
        Split a search string into media type, keyword, season, episode and
        year.

        :param content: raw search text
        :return: 6-tuple ``(mtype, key_word, season_num, episode_num, year,
                 content)`` where ``content`` is the input with the leading
                 type word removed. For empty input the first five items are
                 None and the last is the input itself.
        """
        if not content:
            # Same arity as the normal path so callers can always unpack.
            return None, None, None, None, None, content
        # Detect and drop the movie / TV / anime type words.
        if re.search(r'^电视剧|\s+电视剧|^动漫|\s+动漫', content):
            mtype = MediaType.TV
        else:
            mtype = None
        content = re.sub(r'^电影|^电视剧|^动漫|\s+电影|\s+电视剧|\s+动漫', '', content).strip()
        season_num = None
        episode_num = None
        year = None
        # The ordinal prefix is optional; detection and removal below use
        # the same shape so nothing is left behind in the keyword.
        season_re = re.search(r"第?\s*([0-9一二三四五六七八九十]+)\s*季", content, re.IGNORECASE)
        if season_re:
            mtype = MediaType.TV
            season_num = int(cn2an.cn2an(season_re.group(1), mode='smart'))
        episode_re = re.search(r"第?\s*([0-9一二三四五六七八九十百零]+)\s*集", content, re.IGNORECASE)
        if episode_re:
            mtype = MediaType.TV
            episode_num = int(cn2an.cn2an(episode_re.group(1), mode='smart'))
            if episode_num and not season_num:
                # An episode without a season implies season 1.
                season_num = 1
        year_re = re.search(r"[\s(]+(\d{4})[\s)]*", content)
        if year_re:
            year = year_re.group(1)
        key_word = re.sub(
            r'\s*第?\s*[0-9一二三四五六七八九十]+\s*季'
            r'|\s*第?\s*[0-9一二三四五六七八九十百零]+\s*集'
            r'|[\s(]+(\d{4})[\s)]*',
            '',
            content,
            flags=re.IGNORECASE).strip()
        if key_word:
            key_word = re.sub(r'\s+', ' ', key_word)
        if not key_word:
            # Nothing but a year was given: search by the year itself.
            key_word = year
        return mtype, key_word, season_num, episode_num, year, content

    @staticmethod
    def generate_random_str(randomlength=16):
        """
        Generate a random alphanumeric string of the given length.

        Uses the ``random`` module, so the result is not suitable for
        security-sensitive tokens.
        """
        base_str = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'
        return ''.join(random.choice(base_str) for _ in range(randomlength))

    @staticmethod
    def get_time_stamp(date):
        """
        Parse a date string with ``dateutil``.

        :return: the parsed value, or None when parsing fails
        """
        parsed = None
        try:
            parsed = dateutil.parser.parse(date)
        except Exception as err:
            ExceptionUtils.exception_traceback(err)
        return parsed

    @staticmethod
    def unify_datetime_str(datetime_str):
        """
        Normalise a date-time string to the form ``2020-10-14 07:48:04``.

        Input formats this is intended for:
        - with a timezone offset:  Sat, 15 Oct 2022 14:02:54 +0800
        - ISO style with "T":      2020-10-14T07:48:04
        - ISO style with millis:   2020-10-14T07:48:04.208
        - ending in GMT:           Fri, 14 Oct 2022 07:48:04 GMT
        - ending in UTC:           Fri, 14 Oct 2022 07:48:04 UTC
        - ending in Z:             Fri, 14 Oct 2022 07:48:04Z
        - relative time:           1 month, 2 days ago

        :param datetime_str: text to normalise
        :return: the normalised text; the input unchanged when it is empty
                 or cannot be parsed
        """
        if not datetime_str:
            return datetime_str
        try:
            parsed = dateparser.parse(datetime_str)
            # parse() yields None for unparsable text; that is not an error.
            if parsed is not None:
                return parsed.strftime('%Y-%m-%d %H:%M:%S')
        except Exception as e:
            ExceptionUtils.exception_traceback(e)
        return datetime_str

    @staticmethod
    def timestamp_to_date(timestamp, date_format='%Y-%m-%d %H:%M:%S'):
        """
        Format a POSIX timestamp as local date-time text.

        :param timestamp: seconds since the epoch
        :param date_format: ``strftime`` format
        :return: formatted text, or ``timestamp`` unchanged when conversion
                 fails
        """
        try:
            return datetime.datetime.fromtimestamp(timestamp).strftime(date_format)
        except Exception as e:
            ExceptionUtils.exception_traceback(e)
            return timestamp

    @staticmethod
    def to_bool(text, default_val: bool = False) -> bool:
        """
        Convert a value to bool.

        :param text: value to convert. Bools pass through; numbers are True
                     when greater than 0; strings are True for 'y', 'true'
                     or '1' (case-insensitive).
        :param default_val: result for None or an empty string
        :return: the converted value; False for anything unrecognised
        """
        if text is None or (isinstance(text, str) and not text):
            return default_val
        if isinstance(text, bool):
            return text
        if isinstance(text, (int, float)):
            return text > 0
        if isinstance(text, str):
            return text.lower() in ('y', 'true', '1')
        return False

    @staticmethod
    def str_from_cookiejar(cj):
        """
        Join the name/value pairs of ``cj.items()`` into a cookie string of
        the form ``name=value; name=value``.
        """
        return '; '.join('='.join(item) for item in cj.items())

    @staticmethod
    def get_idlist_from_string(content, dicts):
        """
        Extract ids from a string by matching known names.

        :param content: whitespace-separated text
        :param dicts: iterable of dicts with 'name' and 'id' keys
        :return: ``(id_list, remaining)``: ids whose name appears as a word
                 in ``content``, and ``content`` with those names removed and
                 whitespace collapsed. Empty input yields ``([], "")``.
        """
        if not content:
            # Same shape as the normal path so callers can always unpack.
            return [], ""
        id_list = []
        content_list = content.split()
        for dic in dicts:
            name = dic.get('name')
            if name in content_list and dic.get('id') not in id_list:
                id_list.append(dic.get('id'))
                content = content.replace(name, '')
        return id_list, re.sub(r'\s+', ' ', content).strip()

    @staticmethod
    def str_title(s):
        """
        Title-case an English string; falsy input is returned unchanged.

        :param s: en_name string
        :return: string title
        """
        return s.title() if s else s

    @staticmethod
    def md5_hash(data):
        """
        Return the hex MD5 digest of ``str(data)``, or an empty string for
        falsy input.
        """
        if not data:
            return ""
        return hashlib.md5(str(data).encode()).hexdigest()

    @staticmethod
    def str_timehours(minutes):
        """
        Convert a number of minutes into an hours-and-minutes description.

        :param minutes: number of minutes
        :return: text such as ``"1小时30分"``; ``""`` for falsy input
        """
        if not minutes:
            return ""
        hours, rest = divmod(minutes, 60)
        return "%s小时%s分" % (hours, rest)

    @staticmethod
    def str_amount(amount, curr="$"):
        """
        Format an amount with thousands separators and a currency prefix.

        :return: e.g. ``"$1,234,567"``; ``"0"`` for falsy input
        """
        if not amount:
            return "0"
        return curr + format(amount, ",")