import statistics
from collections import defaultdict

import langdetect
from revscoring import Datasource, Feature
from revscoring.datasources import revision_oriented as ro
from revscoring.datasources.meta import dicts, frequencies, indexable, mappers
from revscoring.features import wikitext as wt
from revscoring.features.meta import aggregators, vectorizers

from . import enwiki, mediawiki, wikitext


def process_is_a_translation_page(namespace_id, title):
    # Translatable messages live in the MediaWiki namespace (8) or in
    # translatewiki's high-numbered (> 1200) message namespaces, and
    # translation pages are subpages suffixed with a language code.
    return (namespace_id == 8 or namespace_id > 1200) and "/" in title

is_a_translation_page = Feature(
    "revision.page.is_a_translation_page", process_is_a_translation_page,
    returns=bool,
    depends_on=[ro.revision.page.namespace.id, ro.revision.page.title])


def process_is_a_default(text):
    # True when the entire revision text is the placeholder "-"
    return text == "-"

revision_is_a_default = Feature(
    "revision.is_a_default", process_is_a_default,
    returns=bool, depends_on=[ro.revision.text])

parent_was_a_default = Feature(
    "revision.parent.is_a_default", process_is_a_default,
    returns=bool, depends_on=[ro.revision.parent.text])


# Unicode ranges
def process_unicode_stats(words):
    code_points = [ord(c) for w in words for c in w]
    # Pad so the stats are always defined -- statistics.stdev() requires
    # at least two data points.
    while len(code_points) < 2:
        code_points.append(ord("-"))
    return (statistics.mean(code_points),
            statistics.median(code_points),
            statistics.stdev(code_points))

revision_unicode_stats = Datasource(
    "revision.unicode_stats", process_unicode_stats,
    depends_on=[wt.revision.datasources.words])

revision_unicode_mean = indexable.index(0, revision_unicode_stats)
revision_unicode_median = indexable.index(1, revision_unicode_stats)
revision_unicode_stdev = indexable.index(2, revision_unicode_stats)

parent_unicode_stats = Datasource(
    "revision.parent.unicode_stats", process_unicode_stats,
    depends_on=[wt.revision.parent.datasources.words])

parent_unicode_mean = indexable.index(0, parent_unicode_stats)
parent_unicode_median = indexable.index(1, parent_unicode_stats)
parent_unicode_stdev = indexable.index(2, parent_unicode_stats)


def diff(val1, val2):
    return float(val2 - val1)

mean_unicode_diff = Feature(
    "revision.diff.mean_unicode_diff", diff, returns=float,
    depends_on=[parent_unicode_mean, revision_unicode_mean])

median_unicode_diff = Feature(
    "revision.diff.median_unicode_diff", diff, returns=float,
    depends_on=[parent_unicode_median, revision_unicode_median])

stdev_unicode_diff = Feature(
    "revision.diff.stdev_unicode_diff", diff, returns=float,
    depends_on=[parent_unicode_stdev, revision_unicode_stdev])

# Introduction of tags that could enable XSS-style markup injection
XSS_TAGS = ["source", "img", "iframe", "input", "style", "body", "svg"]
xss_tags = wt.revision.tag_names_matching(
    "|".join(XSS_TAGS), name="revision.xss_tags")

translatewiki = [is_a_translation_page,
                 revision_is_a_default,
                 parent_was_a_default,
                 xss_tags,
                 mean_unicode_diff,
                 median_unicode_diff,
                 stdev_unicode_diff]


def process_translation_title_lang(title):
    # The language code is the final subpage component of the title.
    if "/" in title:
        return title.split("/")[-1].lower()
    else:
        return None

translation_title_lang = Datasource("revision.page.translation_title_lang",
                                    process_translation_title_lang,
                                    depends_on=[ro.revision.page.title])
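# A quick illustration of the title-language parsing above (the titles are
# hypothetical, chosen for illustration only):
#
#   process_translation_title_lang("MediaWiki:Mainpage/fr")  # -> "fr"
#   process_translation_title_lang("MediaWiki:Mainpage")     # -> None
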
# Langdetect features
ALL_LANGS = [
    "af", "ar", "bg", "bn", "ca", "cs", "cy", "da", "de", "el", "en", "es",
    "et", "fa", "fi", "fr", "gu", "he", "hi", "hr", "hu", "id", "it", "ja",
    "kn", "ko", "lt", "lv", "mk", "ml", "mr", "ne", "nl", "no", "pa", "pl",
    "pt", "ro", "ru", "sk", "sl", "so", "sq", "sv", "sw", "ta", "te", "th",
    "tl", "tr", "uk", "ur", "vi", "zh-cn", "zh-tw"]

COMMONLY_CONFUSED_LANGUAGE_GROUPS = [
    # Galician is confused with Spanish and Portuguese
    {"es", "pt", "gl"},
    # Serbian is confused with Macedonian and Bulgarian
    {"sr", "mk", "bg"},
    # Indonesian and Tagalog get confused
    {"id", "tl"},
    # Norwegian languages
    {"no", "nb", "nn"}
]

# Map each language in a confused group to a single merged label,
# e.g. "es", "gl", and "pt" all map to "es-gl-pt".
COMMON_LANGUAGE_MAP = {ll: "-".join(sorted(lg))
                       for lg in COMMONLY_CONFUSED_LANGUAGE_GROUPS
                       for ll in lg}

ALL_NORMALIZED_LANGS = list(sorted(
    {COMMON_LANGUAGE_MAP.get(lang, lang) for lang in ALL_LANGS}))


def process_normalized_lang_map(text):
    # Detect language probabilities for the text, then fold commonly
    # confused languages into their merged labels.
    try:
        lang_map = {l.lang: l.prob
                    for l in langdetect.detect_langs(text or "")}
    except langdetect.lang_detect_exception.LangDetectException:
        lang_map = {}

    normalized_lang_map = defaultdict(lambda: 0.0)
    for lang in ALL_LANGS:
        norm_lang = COMMON_LANGUAGE_MAP.get(lang, lang)
        normalized_lang_map[norm_lang] += lang_map.get(lang, 0.0)

    return normalized_lang_map

revision_lang_map = Datasource(
    "revision.lang_map", process_normalized_lang_map,
    depends_on=[ro.revision.text])

parent_lang_map = Datasource(
    "revision.parent.lang_map", process_normalized_lang_map,
    depends_on=[ro.revision.parent.text])

parent_lang_vector = vectorizers.vectorize(
    parent_lang_map, keys=ALL_NORMALIZED_LANGS, returns=float,
    name="revision.parent.lang_vector")

lang_delta = frequencies.delta(parent_lang_map, revision_lang_map)

lang_delta_vector = vectorizers.vectorize(
    lang_delta, keys=ALL_NORMALIZED_LANGS, returns=float,
    name="revision.diff.lang_delta_vector")

# Total absolute shift in detected-language probabilities between the
# parent and the saved revision.
lang_delta_sum_diff = aggregators.sum(
    mappers.abs(dicts.values(lang_delta)),
    name="revision.diff.lang_delta_sum_diff")


def process_title_lang_match(title_lang, lang_delta):
    return lang_delta.get(title_lang, 0.0)

parent_lang_match = Feature("revision.parent.lang_match",
                            process_title_lang_match,
                            depends_on=[translation_title_lang,
                                        parent_lang_map],
                            returns=float)

match_lang_delta = Feature("revision.diff.match_lang_delta",
                           process_title_lang_match,
                           depends_on=[translation_title_lang, lang_delta],
                           returns=float)

detected_langs = [parent_lang_vector,
                  lang_delta_vector,
                  lang_delta_sum_diff,
                  parent_lang_match,
                  match_lang_delta]

damaging = \
    enwiki.badwords + enwiki.informals + wikitext.diff + \
    mediawiki.protected_user + mediawiki.user_rights + \
    wikitext.parent + translatewiki + detected_langs
"Damaging Features"  # attribute docstring picked up by Sphinx

reverted = damaging
goodfaith = damaging
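
# A minimal usage sketch (illustrative, not part of the feature lists):
# extracting these features for one revision with revscoring's API-backed
# extractor. Assumes `mwapi` is installed and translatewiki.net is
# reachable; the rev_id and user agent below are placeholders.
if __name__ == "__main__":
    import mwapi
    from revscoring.extractors import api

    session = mwapi.Session("https://translatewiki.net",
                            user_agent="translatewiki feature-list demo")
    extractor = api.Extractor(session)

    # extract() resolves the dependency graph (revision text, words,
    # datasources) and yields one value per requested feature, in order.
    values = extractor.extract(1234567, damaging)
    for feature, value in zip(damaging, values):
        print(feature, value)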