Improving ad filtering for non-English languages
parent
f86a44b637
commit
f7380ae15d
|
@ -1,4 +1,5 @@
|
||||||
from app.request import VALID_PARAMS
|
from app.request import VALID_PARAMS
|
||||||
|
from app.utils.misc import BLACKLIST
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from bs4.element import ResultSet
|
from bs4.element import ResultSet
|
||||||
from cryptography.fernet import Fernet
|
from cryptography.fernet import Fernet
|
||||||
|
@ -47,8 +48,8 @@ def filter_link_args(query_link):
|
||||||
return query_link
|
return query_link
|
||||||
|
|
||||||
|
|
||||||
def has_ad_content(element):
|
def has_ad_content(element: str):
|
||||||
return element == 'ad' or element == 'sponsoredⓘ'
|
return element.upper() in (value.upper() for value in BLACKLIST) or 'ⓘ' in element
|
||||||
|
|
||||||
|
|
||||||
class Filter:
|
class Filter:
|
||||||
|
@ -133,7 +134,7 @@ class Filter:
|
||||||
return
|
return
|
||||||
|
|
||||||
for div in [_ for _ in self.main_divs.find_all('div', recursive=True)]:
|
for div in [_ for _ in self.main_divs.find_all('div', recursive=True)]:
|
||||||
has_ad = len([_ for _ in div.find_all('span', recursive=True) if has_ad_content(_.text.lower())])
|
has_ad = len([_ for _ in div.find_all('span', recursive=True) if has_ad_content(_.text)])
|
||||||
_ = div.decompose() if has_ad else None
|
_ = div.decompose() if has_ad else None
|
||||||
|
|
||||||
def fix_question_section(self):
|
def fix_question_section(self):
|
||||||
|
|
|
@ -2,6 +2,11 @@ from cryptography.fernet import Fernet
|
||||||
from flask import current_app as app
|
from flask import current_app as app
|
||||||
|
|
||||||
REQUIRED_SESSION_VALUES = ['uuid', 'config', 'fernet_keys']
|
REQUIRED_SESSION_VALUES = ['uuid', 'config', 'fernet_keys']
|
||||||
|
BLACKLIST = [
|
||||||
|
'ad', 'anuncio', 'annuncio', 'annonce', 'Anzeige', '广告', '廣告', 'Reklama', 'Реклама', 'Anunț', '광고',
|
||||||
|
'annons', 'Annonse', 'Iklan', '広告', 'Augl.', 'Mainos', 'Advertentie', 'إعلان', 'Գովազդ', 'विज्ञापन', 'Reklam',
|
||||||
|
'آگهی', 'Reklāma', 'Reklaam', 'Διαφήμιση', 'מודעה', 'Hirdetés'
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
def generate_user_keys(cookies_disabled=False) -> dict:
|
def generate_user_keys(cookies_disabled=False) -> dict:
|
||||||
|
|
Loading…
Reference in New Issue