Change how hashtags are normalized (#18795)

* Change how hashtags are normalized

* Fix tests
main
Eugen Rochko 2 years ago committed by GitHub
parent 12ed2d793b
commit e7aa2be828
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -16,6 +16,8 @@ module Admin
if @tag.update(tag_params.merge(reviewed_at: Time.now.utc))
redirect_to admin_tag_path(@tag.id), notice: I18n.t('admin.tags.updated_msg')
else
@time_period = (6.days.ago.to_date...Time.now.utc.to_date)
render :show
end
end
@ -27,7 +29,7 @@ module Admin
end
def tag_params
params.require(:tag).permit(:name, :trendable, :usable, :listable)
params.require(:tag).permit(:name, :display_name, :trendable, :usable, :listable)
end
end
end

@ -13,9 +13,7 @@ class Api::V1::FeaturedTagsController < Api::BaseController
end
def create
@featured_tag = current_account.featured_tags.new(featured_tag_params)
@featured_tag.reset_data
@featured_tag.save!
@featured_tag = current_account.featured_tags.create!(featured_tag_params)
render json: @featured_tag, serializer: REST::FeaturedTagSerializer
end

@ -11,7 +11,6 @@ class Settings::FeaturedTagsController < Settings::BaseController
def create
@featured_tag = current_account.featured_tags.new(featured_tag_params)
@featured_tag.reset_data
if @featured_tag.save
redirect_to settings_featured_tags_path

@ -606,7 +606,20 @@ function insertIntoTagHistory(recognizedTags, text) {
const state = getState();
const oldHistory = state.getIn(['compose', 'tagHistory']);
const me = state.getIn(['meta', 'me']);
const names = recognizedTags.map(tag => text.match(new RegExp(`#${tag.name}`, 'i'))[0].slice(1));
// FIXME: Matching input hashtags with recognized hashtags has become more
// complicated because of new normalization rules, it's no longer just
// a case sensitivity issue
const names = recognizedTags.map(tag => {
const matches = text.match(new RegExp(`#${tag.name}`, 'i'));
if (matches && matches.length > 0) {
return matches[0].slice(1);
} else {
return tag.name;
}
});
const intersectedOldHistory = oldHistory.filter(name => names.findIndex(newName => newName.toLowerCase() === name.toLowerCase()) === -1);
names.push(...intersectedOldHistory.toJS());

@ -0,0 +1,10 @@
# frozen_string_literal: true
class ASCIIFolding
NON_ASCII_CHARS = 'ÀÁÂÃÄÅàáâãäåĀāĂ㥹ÇçĆćĈĉĊċČčÐðĎďĐđÈÉÊËèéêëĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĦħÌÍÎÏìíîïĨĩĪīĬĭĮįİıĴĵĶķĸĹĺĻļĽľĿŀŁłÑñŃńŅņŇňʼnŊŋÒÓÔÕÖØòóôõöøŌōŎŏŐőŔŕŖŗŘřŚśŜŝŞşŠšſŢţŤťŦŧÙÚÛÜùúûüŨũŪūŬŭŮůŰűŲųŴŵÝýÿŶŷŸŹźŻżŽž'
EQUIVALENT_ASCII_CHARS = 'AAAAAAaaaaaaAaAaAaCcCcCcCcCcDdDdDdEEEEeeeeEeEeEeEeEeGgGgGgGgHhHhIIIIiiiiIiIiIiIiIiJjKkkLlLlLlLlLlNnNnNnNnnNnOOOOOOooooooOoOoOoRrRrRrSsSsSsSssTtTtTtUUUUuuuuUuUuUuUuUuUuWwYyyYyYZzZzZz'
def fold(str)
str.tr(NON_ASCII_CHARS, EQUIVALENT_ASCII_CHARS)
end
end

@ -0,0 +1,25 @@
# frozen_string_literal: true
class HashtagNormalizer
def normalize(str)
remove_invalid_characters(ascii_folding(lowercase(cjk_width(str))))
end
private
def remove_invalid_characters(str)
str.gsub(/[^[:alnum:]#{Tag::HASHTAG_SEPARATORS}]/, '')
end
def ascii_folding(str)
ASCIIFolding.new.fold(str)
end
def lowercase(str)
str.mb_chars.downcase.to_s
end
def cjk_width(str)
str.unicode_normalize(:nfkc)
end
end

@ -62,7 +62,7 @@ class Account < ApplicationRecord
)
USERNAME_RE = /[a-z0-9_]+([a-z0-9_\.-]+[a-z0-9_]+)?/i
MENTION_RE = /(?<=^|[^\/[:word:]])@((#{USERNAME_RE})(?:@[[:word:]\.\-]+[[:word:]]+)?)/i
MENTION_RE = /(?<=^|[^\/[:word:]])@((#{USERNAME_RE})(?:@[[:alnum:]\.\-]+[[:alnum:]]+)?)/i
URL_PREFIX_RE = /\Ahttp(s?):\/\/[^\/]+/
include Attachmentable

@ -3,14 +3,14 @@
#
# Table name: custom_filters
#
# id :bigint not null, primary key
# account_id :bigint
# id :bigint(8) not null, primary key
# account_id :bigint(8)
# expires_at :datetime
# phrase :text default(""), not null
# context :string default([]), not null, is an Array
# created_at :datetime not null
# updated_at :datetime not null
# action :integer default(0), not null
# action :integer default("warn"), not null
#
class CustomFilter < ApplicationRecord

@ -3,8 +3,8 @@
#
# Table name: custom_filter_keywords
#
# id :bigint not null, primary key
# custom_filter_id :bigint not null
# id :bigint(8) not null, primary key
# custom_filter_id :bigint(8) not null
# keyword :text default(""), not null
# whole_word :boolean default(TRUE), not null
# created_at :datetime not null

@ -13,17 +13,19 @@
#
class FeaturedTag < ApplicationRecord
belongs_to :account, inverse_of: :featured_tags, required: true
belongs_to :tag, inverse_of: :featured_tags, required: true
belongs_to :account, inverse_of: :featured_tags
belongs_to :tag, inverse_of: :featured_tags, optional: true # Set after validation
delegate :name, to: :tag, allow_nil: true
validates_associated :tag, on: :create
validates :name, presence: true, on: :create
validate :validate_tag_name, on: :create
validate :validate_featured_tags_limit, on: :create
def name=(str)
self.tag = Tag.find_or_create_by_names(str.strip)&.first
before_create :set_tag
before_create :reset_data
attr_writer :name
def name
tag_id.present? ? tag.name : @name
end
def increment(timestamp)
@ -34,14 +36,23 @@ class FeaturedTag < ApplicationRecord
update(statuses_count: [0, statuses_count - 1].max, last_status_at: account.statuses.where(visibility: %i(public unlisted)).tagged_with(tag).where.not(id: deleted_status_id).select(:created_at).first&.created_at)
end
private
def set_tag
self.tag = Tag.find_or_create_by_names(@name)&.first
end
def reset_data
self.statuses_count = account.statuses.where(visibility: %i(public unlisted)).tagged_with(tag).count
self.last_status_at = account.statuses.where(visibility: %i(public unlisted)).tagged_with(tag).select(:created_at).first&.created_at
end
private
def validate_featured_tags_limit
errors.add(:base, I18n.t('featured_tags.errors.limit')) if account.featured_tags.count >= 10
end
def validate_tag_name
errors.add(:name, :blank) if @name.blank?
errors.add(:name, :invalid) unless @name.match?(/\A(#{Tag::HASHTAG_NAME_RE})\z/i)
end
end

@ -15,6 +15,7 @@
# last_status_at :datetime
# max_score :float
# max_score_at :datetime
# display_name :string
#
class Tag < ApplicationRecord
@ -24,11 +25,12 @@ class Tag < ApplicationRecord
has_many :featured_tags, dependent: :destroy, inverse_of: :tag
HASHTAG_SEPARATORS = "_\u00B7\u200c"
HASHTAG_NAME_RE = "([[:word:]_][[:word:]#{HASHTAG_SEPARATORS}]*[[:alpha:]#{HASHTAG_SEPARATORS}][[:word:]#{HASHTAG_SEPARATORS}]*[[:word:]_])|([[:word:]_]*[[:alpha:]][[:word:]_]*)"
HASHTAG_NAME_RE = "([[:alnum:]_][[:alnum:]#{HASHTAG_SEPARATORS}]*[[:alpha:]#{HASHTAG_SEPARATORS}][[:alnum:]#{HASHTAG_SEPARATORS}]*[[:alnum:]_])|([[:alnum:]_]*[[:alpha:]][[:alnum:]_]*)"
HASHTAG_RE = /(?:^|[^\/\)\w])#(#{HASHTAG_NAME_RE})/i
validates :name, presence: true, format: { with: /\A(#{HASHTAG_NAME_RE})\z/i }
validate :validate_name_change, if: -> { !new_record? && name_changed? }
validate :validate_display_name_change, if: -> { !new_record? && display_name_changed? }
scope :reviewed, -> { where.not(reviewed_at: nil) }
scope :unreviewed, -> { where(reviewed_at: nil) }
@ -46,6 +48,10 @@ class Tag < ApplicationRecord
name
end
def display_name
attributes['display_name'] || name
end
def usable
boolean_with_default('usable', true)
end
@ -90,8 +96,10 @@ class Tag < ApplicationRecord
class << self
def find_or_create_by_names(name_or_names)
Array(name_or_names).map(&method(:normalize)).uniq { |str| str.mb_chars.downcase.to_s }.map do |normalized_name|
tag = matching_name(normalized_name).first || create(name: normalized_name)
names = Array(name_or_names).map { |str| [normalize(str), str] }.uniq(&:first)
names.map do |(normalized_name, display_name)|
tag = matching_name(normalized_name).first || create(name: normalized_name, display_name: display_name)
yield tag if block_given?
@ -129,7 +137,7 @@ class Tag < ApplicationRecord
end
def normalize(str)
str.gsub(/\A#/, '')
HashtagNormalizer.new.normalize(str)
end
end
@ -138,4 +146,8 @@ class Tag < ApplicationRecord
def validate_name_change
errors.add(:name, I18n.t('tags.does_not_match_previous_name')) unless name_was.mb_chars.casecmp(name.mb_chars).zero?
end
def validate_display_name_change
errors.add(:display_name, I18n.t('tags.does_not_match_previous_name')) unless HashtagNormalizer.new.normalize(display_name).casecmp(name.mb_chars).zero?
end
end

@ -10,11 +10,11 @@ class ActivityPub::HashtagSerializer < ActivityPub::Serializer
end
def name
"##{object.name}"
"##{object.display_name}"
end
def href
if object.class.name == 'FeaturedTag'
if object.instance_of?(FeaturedTag)
short_account_tag_url(object.account, object.tag)
else
tag_url(object)

@ -12,4 +12,8 @@ class REST::FeaturedTagSerializer < ActiveModel::Serializer
def url
short_account_tag_url(object.account, object.tag)
end
def name
object.display_name
end
end

@ -8,4 +8,8 @@ class REST::TagSerializer < ActiveModel::Serializer
def url
tag_url(object)
end
def name
object.display_name
end
end

@ -75,7 +75,7 @@
= link_to short_account_tag_path(@account, featured_tag.tag) do
%h4
= fa_icon 'hashtag'
= featured_tag.name
= featured_tag.display_name
%small
- if featured_tag.last_status_at.nil?
= t('accounts.nothing_here')

@ -28,7 +28,7 @@ RSS::Builder.build do |doc|
end
status.tags.each do |tag|
item.category(tag.name)
item.category(tag.display_name)
end
end
end

@ -2,7 +2,7 @@
= javascript_pack_tag 'admin', async: true, crossorigin: 'anonymous'
- content_for :page_title do
= "##{@tag.name}"
= "##{@tag.display_name}"
- if current_user.can?(:view_dashboard)
- content_for :heading_actions do
@ -53,7 +53,7 @@
= render 'shared/error_messages', object: @tag
.fields-group
= f.input :name, wrapper: :with_block_label
= f.input :display_name, wrapper: :with_block_label
.fields-group
= f.input :usable, as: :boolean, wrapper: :with_label

@ -6,7 +6,7 @@
.pending-account__header
= link_to admin_tag_path(tag.id) do
= fa_icon 'hashtag'
= tag.name
= tag.display_name
%br/

@ -1,12 +1,12 @@
<%= raw t('admin_mailer.new_trends.new_trending_tags.title') %>
<% @tags.each do |tag| %>
- #<%= tag.name %>
- #<%= tag.display_name %>
<%= raw t('admin.trends.tags.usage_comparison', today: tag.history.get(Time.now.utc).accounts, yesterday: tag.history.get(Time.now.utc - 1.day).accounts) %> • <%= t('admin.trends.tags.current_score', score: Trends.tags.score(tag.id).round(2)) %>
<% end %>
<% if @lowest_trending_tag %>
<%= raw t('admin_mailer.new_trends.new_trending_tags.requirements', lowest_tag_name: @lowest_trending_tag.name, lowest_tag_score: Trends.tags.score(@lowest_trending_tag.id).round(2), rank: Trends.tags.options[:review_threshold]) %>
<%= raw t('admin_mailer.new_trends.new_trending_tags.requirements', lowest_tag_name: @lowest_trending_tag.display_name, lowest_tag_score: Trends.tags.score(@lowest_trending_tag.id).round(2), rank: Trends.tags.options[:review_threshold]) %>
<% else %>
<%= raw t('admin_mailer.new_trends.new_trending_tags.no_approved_tags') %>
<% end %>

@ -9,7 +9,7 @@
= render 'shared/error_messages', object: @featured_tag
.fields-group
= f.input :name, wrapper: :with_block_label, hint: safe_join([t('simple_form.hints.featured_tag.name'), safe_join(@recently_used_tags.map { |tag| link_to("##{tag.name}", settings_featured_tags_path(featured_tag: { name: tag.name }), method: :post) }, ', ')], ' ')
= f.input :name, wrapper: :with_block_label, hint: safe_join([t('simple_form.hints.featured_tag.name'), safe_join(@recently_used_tags.map { |tag| link_to("##{tag.display_name}", settings_featured_tags_path(featured_tag: { name: tag.name }), method: :post) }, ', ')], ' ')
.actions
= f.button :button, t('featured_tags.add_new'), type: :submit

@ -1,6 +1,6 @@
= opengraph 'og:site_name', t('about.hosted_on', domain: site_hostname)
= opengraph 'og:url', tag_url(@tag)
= opengraph 'og:type', 'website'
= opengraph 'og:title', "##{@tag.name}"
= opengraph 'og:description', strip_tags(t('about.about_hashtag_html', hashtag: @tag.name))
= opengraph 'og:title', "##{@tag.display_name}"
= opengraph 'og:description', strip_tags(t('about.about_hashtag_html', hashtag: @tag.display_name))
= opengraph 'twitter:card', 'summary'

@ -1,5 +1,5 @@
- content_for :page_title do
= "##{@tag.name}"
= "##{@tag.display_name}"
- content_for :header_tags do
%meta{ name: 'robots', content: 'noindex' }/
@ -9,8 +9,8 @@
= render 'og'
.page-header
%h1= "##{@tag.name}"
%p= t('about.about_hashtag_html', hashtag: @tag.name)
%h1= "##{@tag.display_name}"
%p= t('about.about_hashtag_html', hashtag: @tag.display_name)
#mastodon-timeline{ data: { props: Oj.dump(default_props.merge(hashtag: @tag.name, local: @local)) }}
.notranslate#modal-container

@ -1,6 +1,6 @@
RSS::Builder.build do |doc|
doc.title("##{@tag.name}")
doc.description(I18n.t('rss.descriptions.tag', hashtag: @tag.name))
doc.title("##{@tag.display_name}")
doc.description(I18n.t('rss.descriptions.tag', hashtag: @tag.display_name))
doc.link(tag_url(@tag))
doc.last_build_date(@statuses.first.created_at) if @statuses.any?
doc.generator("Mastodon v#{Mastodon::Version.to_s}")
@ -26,7 +26,7 @@ RSS::Builder.build do |doc|
end
status.tags.each do |tag|
item.category(tag.name)
item.category(tag.display_name)
end
end
end

@ -24,6 +24,7 @@ ActiveSupport::Inflector.inflections(:en) do |inflect|
inflect.acronym 'RSS'
inflect.acronym 'REST'
inflect.acronym 'URL'
inflect.acronym 'ASCII'
inflect.singular 'data', 'data'
end

@ -0,0 +1,5 @@
class AddDisplayNameToTags < ActiveRecord::Migration[6.1]
def change
add_column :tags, :display_name, :string
end
end

@ -10,7 +10,7 @@
#
# It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 2022_07_04_024901) do
ActiveRecord::Schema.define(version: 2022_07_10_102457) do
# These are extensions that must be enabled in order to support this database
enable_extension "plpgsql"
@ -940,6 +940,7 @@ ActiveRecord::Schema.define(version: 2022_07_04_024901) do
t.datetime "last_status_at"
t.float "max_score"
t.datetime "max_score_at"
t.string "display_name"
t.index "lower((name)::text) text_pattern_ops", name: "index_tags_on_name_lower_btree", unique: true
end

@ -0,0 +1,29 @@
# frozen_string_literal: true
require 'rails_helper'
describe HashtagNormalizer do
subject { described_class.new }
describe '#normalize' do
it 'converts full-width Latin characters into basic Latin characters' do
expect(subject.normalize('')).to eq 'synthwave'
end
it 'converts half-width Katakana into Kana characters' do
expect(subject.normalize('シーサイドライナー')).to eq 'シーサイドライナー'
end
it 'converts modified Latin characters into basic Latin characters' do
expect(subject.normalize('BLÅHAJ')).to eq 'blahaj'
end
it 'strips out invalid characters' do
expect(subject.normalize('#foo')).to eq 'foo'
end
it 'keeps valid characters' do
expect(subject.normalize('a·b')).to eq 'a·b'
end
end
end

@ -91,7 +91,7 @@ RSpec.describe Tag, type: :model do
upcase_string = 'abcABCやゆよ'
downcase_string = 'abcabcやゆよ';
tag = Fabricate(:tag, name: downcase_string)
tag = Fabricate(:tag, name: HashtagNormalizer.new.normalize(downcase_string))
expect(Tag.find_normalized(upcase_string)).to eq tag
end
end
@ -101,12 +101,12 @@ RSpec.describe Tag, type: :model do
upcase_string = 'abcABCやゆよ'
downcase_string = 'abcabcやゆよ';
tag = Fabricate(:tag, name: downcase_string)
tag = Fabricate(:tag, name: HashtagNormalizer.new.normalize(downcase_string))
expect(Tag.matches_name(upcase_string)).to eq [tag]
end
it 'uses the LIKE operator' do
expect(Tag.matches_name('100%abc').to_sql).to eq %q[SELECT "tags".* FROM "tags" WHERE LOWER("tags"."name") LIKE LOWER('100\\%abc%')]
expect(Tag.matches_name('100%abc').to_sql).to eq %q[SELECT "tags".* FROM "tags" WHERE LOWER("tags"."name") LIKE LOWER('100abc%')]
end
end
@ -115,7 +115,7 @@ RSpec.describe Tag, type: :model do
upcase_string = 'abcABCやゆよ'
downcase_string = 'abcabcやゆよ';
tag = Fabricate(:tag, name: downcase_string)
tag = Fabricate(:tag, name: HashtagNormalizer.new.normalize(downcase_string))
expect(Tag.matching_name(upcase_string)).to eq [tag]
end
end

@ -892,6 +892,34 @@ const startWorker = async (workerId) => {
return arr;
};
/**
* See app/lib/ascii_folder.rb for the canon definitions
* of these constants
*/
const NON_ASCII_CHARS = 'ÀÁÂÃÄÅàáâãäåĀāĂ㥹ÇçĆćĈĉĊċČčÐðĎďĐđÈÉÊËèéêëĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĦħÌÍÎÏìíîïĨĩĪīĬĭĮįİıĴĵĶķĸĹĺĻļĽľĿŀŁłÑñŃńŅņŇňʼnŊŋÒÓÔÕÖØòóôõöøŌōŎŏŐőŔŕŖŗŘřŚśŜŝŞşŠšſŢţŤťŦŧÙÚÛÜùúûüŨũŪūŬŭŮůŰűŲųŴŵÝýÿŶŷŸŹźŻżŽž';
const EQUIVALENT_ASCII_CHARS = 'AAAAAAaaaaaaAaAaAaCcCcCcCcCcDdDdDdEEEEeeeeEeEeEeEeEeGgGgGgGgHhHhIIIIiiiiIiIiIiIiIiJjKkkLlLlLlLlLlNnNnNnNnnNnOOOOOOooooooOoOoOoRrRrRrSsSsSsSssTtTtTtUUUUuuuuUuUuUuUuUuUuWwYyyYyYZzZzZz';
/**
* @param {string} str
* @return {string}
*/
const foldToASCII = str => {
const regex = new RegExp(NON_ASCII_CHARS.split('').join('|'), 'g');
return str.replace(regex, match => {
const index = NON_ASCII_CHARS.indexOf(match);
return EQUIVALENT_ASCII_CHARS[index];
});
};
/**
* @param {string} str
* @return {string}
*/
const normalizeHashtag = str => {
return foldToASCII(str.normalize('NFKC').toLowerCase()).replace(/[^\p{L}\p{N}_\u00b7\u200c]/gu, '');
};
/**
* @param {any} req
* @param {string} name
@ -968,7 +996,7 @@ const startWorker = async (workerId) => {
reject('No tag for stream provided');
} else {
resolve({
channelIds: [`timeline:hashtag:${params.tag.toLowerCase()}`],
channelIds: [`timeline:hashtag:${normalizeHashtag(params.tag)}`],
options: { needsFiltering: true },
});
}
@ -979,7 +1007,7 @@ const startWorker = async (workerId) => {
reject('No tag for stream provided');
} else {
resolve({
channelIds: [`timeline:hashtag:${params.tag.toLowerCase()}:local`],
channelIds: [`timeline:hashtag:${normalizeHashtag(params.tag)}:local`],
options: { needsFiltering: true },
});
}

Loading…
Cancel
Save