""" Update Emoji.py Refeshes OMZ emoji database based on the latest Unicode spec """ import re import json spec = open("emoji-data.txt", "r") # Regexes # regex_emoji will return, respectively: # the code points, its type (status), the actual emoji, and its official name regex_emoji = r"^([\w ].*?\S)\s*;\s*([\w-]+)\s*#\s*(.*?)\s(\S.*).*$" # regex_group returns the group of subgroup that a line opens regex_group = r"^#\s*(group|subgroup):\s*(.*)$" headers = """ # emoji-char-definitions.zsh - Emoji definitions for oh-my-zsh emoji plugin # # This file is auto-generated by update_emoji.py. Do not edit it manually. # # This contains the definition for: # $emoji - which maps character names to Unicode characters # $emoji_flags - maps country names to Unicode flag characters using region # indicators # $emoji_mod - maps modifier components to Unicode characters # $emoji_groups - a single associative array to avoid cluttering up the # global namespace, and to allow adding additional group # definitions at run time. The keys are the group names, and # the values are whitespace-separated lists of emoji # character names. # Main emoji typeset -gAH emoji # National flags typeset -gAH emoji_flags # Combining modifiers typeset -gAH emoji_mod # Emoji groups typeset -gAH emoji_groups """ ####### # Adding country codes ####### # This is the only part of this script that relies on an external library # (country_converter), and is hence commented out by default. # You can uncomment it to have country codes added as aliases for flag # emojis. (By default, when you install this extension, country codes are # included as aliases, but not if you re-run this script without uncommenting.) # Warning: country_converter is very verbose, and will print warnings all over # your terminal. # import country_converter as coco # pylint: disable=wrong-import-position # cc = coco.CountryConverter() # def country_iso(_all_names, _omz_name): # """ Using the external library country_converter, # this function can detect the ISO2 and ISO3 codes # of the country. It takes as argument the array # with all the names of the emoji, and returns that array.""" # omz_no_underscore = re.sub(r'_', r' ', _omz_name) # iso2 = cc.convert(names=[omz_no_underscore], to='ISO2') # if iso2 != 'not found': # _all_names.append(iso2) # iso3 = cc.convert(names=[omz_no_underscore], to='ISO3') # _all_names.append(iso3) # return _all_names ####### # Helper functions ####### def code_to_omz(_code_points): """ Returns a ZSH-compatible Unicode string from the code point(s) """ return r'\U' + r'\U'.join(_code_points.split(' ')) def name_to_omz(_name, _group, _subgroup, _status): """ Returns a reasonable snake_case name for the emoji. """ def snake_case(_string): """ Does the regex work of snake_case """ remove_dots = re.sub(r'\.\(\)', r'', _string) replace_ands = re.sub(r'\&', r'and', remove_dots) remove_whitespace = re.sub(r'[^\#\*\w]', r'_', replace_ands) return re.sub(r'__', r'_', remove_whitespace) shortname = "" split_at_colon = lambda s: s.split(": ") # Special treatment by group and subgroup # If the emoji is a flag, we strip "flag" from its name if _group == "Flags" and len(split_at_colon(_name)) > 1: shortname = snake_case(split_at_colon(_name)[1]) else: shortname = snake_case(_name) # Special treatment by status # Enables us to have every emoji combination, # even the one that are not officially sanctionned # and are implemented by, say, only one vendor if _status == "unqualified": shortname += "_unqualified" elif _status == "minimally-qualified": shortname += "_minimally" return shortname def increment_name(_shortname): """ Increment the short name by 1. If you get, say, 'woman_detective_unqualified', it returns 'woman_detective_unqualified_1', and then 'woman_detective_unqualified_2', etc. """ last_char = _shortname[-1] if last_char.isdigit(): num = int(last_char) return _shortname[:-1] + str(num + 1) return _shortname + "_1" ######## # Going through every line ######## group, subgroup, short_name_buffer = "", "", "" emoji_database = [] for line in spec: # First, test if this line opens a group or subgroup group_match = re.findall(regex_group, line) if group_match != []: gr_or_sub, name = group_match[0] if gr_or_sub == "group": group = name elif gr_or_sub == "subgroup": subgroup = name continue # Moving on... # Second, test if this line references one emoji emoji_match = re.findall(regex_emoji, line) if emoji_match != []: code_points, status, emoji, name = emoji_match[0] omz_codes = code_to_omz(code_points) omz_name = name_to_omz(name, group, subgroup, status) # If this emoji has the same shortname as the preceding one if omz_name in short_name_buffer: omz_name = increment_name(short_name_buffer) short_name_buffer = omz_name emoji_database.append( [omz_codes, status, emoji, omz_name, group, subgroup]) spec.close() ######## # Write to emoji-char-definitions.zsh ######## # Aliases for emojis are retrieved through the DB of Gemoji # Retrieved on Aug 9 2019 from the following URL: # https://raw.githubusercontent.com/github/gemoji/master/db/emoji.json gemoji_db = open("gemoji_db.json") j = json.load(gemoji_db) aliases_map = {entry['emoji']: entry['aliases'] for entry in j} all_omz_names = [emoji_data[3] for emoji_data in emoji_database] # Let's begin writing to this file output = open("emoji-char-definitions.zsh", "w") output.write(headers) emoji_groups = {"fruits": "\n", "vehicles": "\n", "hands": "\n", "people": "\n", "animals": "\n", "faces": "\n", "flags": "\n"} # First, write every emoji down for _omz_codes, _status, _emoji, _omz_name, _group, _subgroup in emoji_database: # One emoji can be mapped to multiple names (aliases or country codes) names_for_this_emoji = [_omz_name] # Variable that indicates in which map the emoji will be located emoji_map = "emoji" if _status == "component": emoji_map = "emoji_mod" if _group == "Flags": emoji_map = "emoji_flags" # Adding country codes (Optional, see above) # names_for_this_emoji = country_iso(names_for_this_emoji, _omz_name) # Check if there is an alias available in the Gemoji DB if _emoji in aliases_map.keys(): for alias in aliases_map[_emoji]: if alias not in all_omz_names: names_for_this_emoji.append(alias) # And now we write to the definitions file for one_name in names_for_this_emoji: output.write(f"{emoji_map}[{one_name}]=$'{_omz_codes}'\n") # Storing the emoji in defined subgroups for the next step if _status == "fully-qualified": if _subgroup == "food-fruit": emoji_groups["fruits"] += f" {_omz_name}\n" elif "transport-" in _subgroup: emoji_groups["vehicles"] += f" {_omz_name}\n" elif "hand-" in _subgroup: emoji_groups["hands"] += f" {_omz_name}\n" elif "person-" in _subgroup or _subgroup == "family": emoji_groups["people"] += f" {_omz_name}\n" elif "animal-" in _subgroup: emoji_groups["animals"] += f" {_omz_name}\n" elif "face-" in _subgroup: emoji_groups["faces"] += f" {_omz_name}\n" elif _group == "Flags": emoji_groups["flags"] += f" {_omz_name}\n" # Second, write the subgroups to the end of the file for name, string in emoji_groups.items(): output.write(f'\nemoji_groups[{name}]="{string}"\n') output.close()