import statistics
from collections import defaultdict

import langdetect
from revscoring import Datasource, Feature
from revscoring.datasources import revision_oriented as ro
from revscoring.datasources.meta import dicts, frequencies, indexable, mappers
from revscoring.features import wikitext as wt
from revscoring.features.meta import aggregators, vectorizers

from . import enwiki, mediawiki, wikitext


def process_is_a_translation_page(namespace_id, title):
    # Translatable messages live in the MediaWiki namespace (8) or in
    # translatewiki's high-numbered (> 1200) message namespaces, and
    # translation pages are subpages suffixed with a language code.
    return (namespace_id == 8 or namespace_id > 1200) and "/" in title

is_a_translation_page = Feature(
    "revision.page.is_a_translation_page", process_is_a_translation_page,
    returns=bool,
    depends_on=[ro.revision.page.namespace.id, ro.revision.page.title])


def process_is_a_default(text):
    # True when the entire revision text is the placeholder "-"
    return text == "-"

revision_is_a_default = Feature(
    "revision.is_a_default", process_is_a_default,
    returns=bool, depends_on=[ro.revision.text])

parent_was_a_default = Feature(
    "revision.parent.is_a_default", process_is_a_default,
    returns=bool, depends_on=[ro.revision.parent.text])


# Unicode ranges
def process_unicode_stats(words):
    code_points = [ord(c) for w in words for c in w]
    # Pad so the stats are always defined -- statistics.stdev() requires
    # at least two data points.
    while len(code_points) < 2:
        code_points.append(ord("-"))
    return (statistics.mean(code_points),
            statistics.median(code_points),
            statistics.stdev(code_points))

revision_unicode_stats = Datasource(
    "revision.unicode_stats", process_unicode_stats,
    depends_on=[wt.revision.datasources.words])

revision_unicode_mean = indexable.index(0, revision_unicode_stats)
revision_unicode_median = indexable.index(1, revision_unicode_stats)
revision_unicode_stdev = indexable.index(2, revision_unicode_stats)

parent_unicode_stats = Datasource(
    "revision.parent.unicode_stats", process_unicode_stats,
    depends_on=[wt.revision.parent.datasources.words])

parent_unicode_mean = indexable.index(0, parent_unicode_stats)
parent_unicode_median = indexable.index(1, parent_unicode_stats)
parent_unicode_stdev = indexable.index(2, parent_unicode_stats)


def diff(val1, val2):
    return float(val2 - val1)

mean_unicode_diff = Feature(
    "revision.diff.mean_unicode_diff", diff, returns=float,
    depends_on=[parent_unicode_mean, revision_unicode_mean])

median_unicode_diff = Feature(
    "revision.diff.median_unicode_diff", diff, returns=float,
    depends_on=[parent_unicode_median, revision_unicode_median])

stdev_unicode_diff = Feature(
    "revision.diff.stdev_unicode_diff", diff, returns=float,
    depends_on=[parent_unicode_stdev, revision_unicode_stdev])

# Introduction of tags that could enable XSS-style markup injection
XSS_TAGS = ["source", "img", "iframe", "input", "style", "body", "svg"]
xss_tags = wt.revision.tag_names_matching(
    "|".join(XSS_TAGS), name="revision.xss_tags")

translatewiki = [is_a_translation_page,
                 revision_is_a_default,
                 parent_was_a_default,
                 xss_tags,
                 mean_unicode_diff,
                 median_unicode_diff,
                 stdev_unicode_diff]


def process_translation_title_lang(title):
    # The language code is the final subpage component of the title.
    if "/" in title:
        return title.split("/")[-1].lower()
    else:
        return None

translation_title_lang = Datasource("revision.page.translation_title_lang",
                                    process_translation_title_lang,
                                    depends_on=[ro.revision.page.title])
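# A quick illustration of the title-language parsing above (the titles are
# hypothetical, chosen for illustration only):
#
#   process_translation_title_lang("MediaWiki:Mainpage/fr")  # -> "fr"
#   process_translation_title_lang("MediaWiki:Mainpage")     # -> None
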
# Langdetect features
ALL_LANGS = [
    "af", "ar", "bg", "bn", "ca", "cs", "cy", "da", "de", "el", "en", "es",
    "et", "fa", "fi", "fr", "gu", "he", "hi", "hr", "hu", "id", "it", "ja",
    "kn", "ko", "lt", "lv", "mk", "ml", "mr", "ne", "nl", "no", "pa", "pl",
    "pt", "ro", "ru", "sk", "sl", "so", "sq", "sv", "sw", "ta", "te", "th",
    "tl", "tr", "uk", "ur", "vi", "zh-cn", "zh-tw"]

COMMONLY_CONFUSED_LANGUAGE_GROUPS = [
    # Galician is confused with Spanish and Portuguese
    {"es", "pt", "gl"},
    # Serbian is confused with Macedonian and Bulgarian
    {"sr", "mk", "bg"},
    # Indonesian and Tagalog get confused
    {"id", "tl"},
    # Norwegian languages
    {"no", "nb", "nn"}
]

# Map each language in a confused group to a single merged label,
# e.g. "es", "gl", and "pt" all map to "es-gl-pt".
COMMON_LANGUAGE_MAP = {ll: "-".join(sorted(lg))
                       for lg in COMMONLY_CONFUSED_LANGUAGE_GROUPS
                       for ll in lg}

ALL_NORMALIZED_LANGS = list(sorted(
    {COMMON_LANGUAGE_MAP.get(lang, lang) for lang in ALL_LANGS}))


def process_normalized_lang_map(text):
    # Detect language probabilities for the text, then fold commonly
    # confused languages into their merged labels.
    try:
        lang_map = {l.lang: l.prob
                    for l in langdetect.detect_langs(text or "")}
    except langdetect.lang_detect_exception.LangDetectException:
        lang_map = {}

    normalized_lang_map = defaultdict(lambda: 0.0)
    for lang in ALL_LANGS:
        norm_lang = COMMON_LANGUAGE_MAP.get(lang, lang)
        normalized_lang_map[norm_lang] += lang_map.get(lang, 0.0)

    return normalized_lang_map

revision_lang_map = Datasource(
    "revision.lang_map", process_normalized_lang_map,
    depends_on=[ro.revision.text])

parent_lang_map = Datasource(
    "revision.parent.lang_map", process_normalized_lang_map,
    depends_on=[ro.revision.parent.text])

parent_lang_vector = vectorizers.vectorize(
    parent_lang_map, keys=ALL_NORMALIZED_LANGS, returns=float,
    name="revision.parent.lang_vector")

lang_delta = frequencies.delta(parent_lang_map, revision_lang_map)

lang_delta_vector = vectorizers.vectorize(
    lang_delta, keys=ALL_NORMALIZED_LANGS, returns=float,
    name="revision.diff.lang_delta_vector")

# Total absolute shift in detected-language probabilities between the
# parent and the saved revision.
lang_delta_sum_diff = aggregators.sum(
    mappers.abs(dicts.values(lang_delta)),
    name="revision.diff.lang_delta_sum_diff")


def process_title_lang_match(title_lang, lang_delta):
    return lang_delta.get(title_lang, 0.0)

parent_lang_match = Feature("revision.parent.lang_match",
                            process_title_lang_match,
                            depends_on=[translation_title_lang,
                                        parent_lang_map],
                            returns=float)

match_lang_delta = Feature("revision.diff.match_lang_delta",
                           process_title_lang_match,
                           depends_on=[translation_title_lang, lang_delta],
                           returns=float)

detected_langs = [parent_lang_vector,
                  lang_delta_vector,
                  lang_delta_sum_diff,
                  parent_lang_match,
                  match_lang_delta]

damaging = \
    enwiki.badwords + enwiki.informals + wikitext.diff + \
    mediawiki.protected_user + mediawiki.user_rights + \
    wikitext.parent + translatewiki + detected_langs
"Damaging Features"  # attribute docstring picked up by Sphinx

reverted = damaging
goodfaith = damaging
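
# A minimal usage sketch (illustrative, not part of the feature lists):
# extracting these features for one revision with revscoring's API-backed
# extractor. Assumes `mwapi` is installed and translatewiki.net is
# reachable; the rev_id and user agent below are placeholders.
if __name__ == "__main__":
    import mwapi
    from revscoring.extractors import api

    session = mwapi.Session("https://translatewiki.net",
                            user_agent="translatewiki feature-list demo")
    extractor = api.Extractor(session)

    # extract() resolves the dependency graph (revision text, words,
    # datasources) and yields one value per requested feature, in order.
    values = extractor.extract(1234567, damaging)
    for feature, value in zip(damaging, values):
        print(feature, value)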