#!/usr/bin/env python3

from collections import OrderedDict
from html.entities import codepoint2name
import itertools
import string
import sys
import unicodedata

def genchars():
    """Yield the cheat sheet's contents in display order.

    Three kinds of items are produced, all of which ``process_row`` accepts:

    * a ``(name, href, anchor)`` tuple, which starts a linked section;
    * a bare ``str``, which starts a section with no link (the anchor is
      then derived from the name by ``process_row``);
    * an ``int`` code point, which becomes one table row.

    The 'ASCII' and 'Vim Digraphs' sections read the module-level
    ``digraphs`` mapping, so it must be populated (see ``load_digraphs``)
    before this generator is iterated.
    """
    yield ('Latin-1 Supplement',
           'https://en.wikipedia.org/wiki/Latin-1_Supplement_(Unicode_block)',
          'latin-1')
    yield from range(0xA0, 0x100)

    yield ('Greek and Coptic',
           'https://en.wikipedia.org/wiki/Greek_and_Coptic',
           'greek')
    # range() ends are exclusive, so 0x3A2 and 0x3C2 are deliberately skipped.
    yield from range(0x391, 0x3A2)
    yield from range(0x3A3, 0x3AA)
    yield from range(0x3B1, 0x3C2)
    yield from range(0x3C3, 0x3CA)
    yield ('Cyrillic',
           'https://en.wikipedia.org/wiki/Cyrillic_(Unicode_block)',
           'cyrillic')
    yield from range(0x410, 0x450)

    yield ('General Punctuation',
           'https://en.wikipedia.org/wiki/General_Punctuation',
           'punctuation')
    yield from range(0x2012, 0x2016)
    yield from range(0x2018, 0x2020)
    yield 0x2026
    yield 0x203D
    yield from range(0x2032, 0x2035)
    yield 0x20A9
    yield 0x20AC
    yield 0x20BD
    yield 0x2105
    yield 0x2122

    yield (
        'Superscripts and Subscripts',
        'https://en.wikipedia.org/wiki/Superscripts_and_Subscripts_(Unicode_block)',
        'subscripts'
    )
    # Individually listed code points, some from outside the 0x2070 block
    # (0xB9, 0xB2, 0xB3), are emitted in this hand-picked order.
    yield 0x2071
    yield 0x2070
    yield 0xB9
    yield 0xB2
    yield 0xB3
    yield from range(0x2074, 0x209D)

    yield ('Mathematical Operators',
           'https://en.wikipedia.org/wiki/Mathematical_Operators',
           'math')
    yield 0xB1
    yield 0x2115
    yield 0x211A
    yield 0x211D
    yield 0x2124
    yield 0x2135
    yield 0x2200
    yield 0x2202
    yield 0x2203
    # NOTE(review): 0x18E lies far outside the surrounding 0x22xx values;
    # presumably placed here on purpose next to 0x2203 -- confirm.
    yield 0x18E
    yield from range(0x2204, 0x2212)
    yield from range(0x2215, 0x2236)
    yield from range(0x2241, 0x224E)
    yield 0x225D
    yield from range(0x225F, 0x227A)
    yield from range(0x2282, 0x22B0)
    yield from range(0x22B2, 0x22B6)
    yield from range(0x22BB, 0x22CB)
    yield from range(0x22EA, 0x2301)
    yield from range(0x2308, 0x2311)

    yield ('Miscellaneous Symbols',
           'https://en.wikipedia.org/wiki/Miscellaneous_Symbols',
           'misc')
    yield from range(0x2311, 0x232C)
    yield 0x23CE
    yield 0x23CF
    yield from range(0x23E9, 0x2400)

    yield ('Number Forms', 'https://en.wikipedia.org/wiki/Number_Forms',
           'number-forms')
    yield 0xBC
    yield 0xBD
    yield 0xBE
    yield from range(0x2150, 0x2160)

    yield ('Geometric Shapes',
           'https://en.wikipedia.org/wiki/Geometric_Shapes',
           'geometry')
    yield from range(0x2190, 0x2200)
    yield from range(0x25A0, 0x25A4)
    yield from range(0x25AA, 0x2607)

    yield ('ASCII', 'https://en.wikipedia.org/wiki/ASCII', 'ascii')
    # Only code points below 0xA0 that have an entry in ``digraphs``.
    yield from filter(digraphs.__contains__, range(0xA0))

    # Bare strings: sections without a link.
    yield 'music'
    yield from range(0x2669, 0x2670)

    yield 'emoji'
    yield 0x2713
    yield 0x2721
    yield 0x1F63A
    yield 0x1F431
    yield 0x1F4AF

    yield ('Vim Digraphs', 'https://vimhelp.org/digraph.txt.html', 'vim')
    # Every code point in ``digraphs`` in insertion order, including code
    # points that already appeared in an earlier section.
    yield from digraphs.keys()

# Code point (int) -> description text; filled by load_digraphs(), where the
# first description seen for a code point wins.
descriptions = {}
# Code point (int) -> Digraph, or a Joining of several when a code point has
# more than one digraph; filled by load_digraphs() in file order.
digraphs = OrderedDict()


def paginate(iterable, n):
    """Split *iterable* into consecutive pages of at most *n* items.

    Each page is yielded as a tuple, so pages stay correct even when the
    caller collects several of them before looking at their contents
    (lazy pages sharing one underlying iterator would not).  Only the last
    page may be shorter than *n*; an empty *iterable* yields no pages.

    Args:
        iterable: any iterable; it is consumed one page at a time.
        n: maximum number of items per page, at least 1.

    Raises:
        ValueError: if *n* is smaller than 1 (raised on first iteration).
    """
    if n < 1:
        raise ValueError('page size must be at least 1')
    source = iter(iterable)
    while True:
        page = tuple(itertools.islice(source, n))
        if not page:
            # Source exhausted: stop instead of emitting an empty page.
            return
        yield page


def load_digraphs(fname):
    """Populate ``descriptions`` and ``digraphs`` from the table in *fname*.

    Every non-blank line must consist of exactly five tab-separated columns;
    going by how they are used here these are: the literal character, the
    digraph, the code point in hex, the code point in decimal, and a
    description.  Only the digraph, decimal and description columns are read.

    When a code point occurs on several lines, its first description is kept
    and its digraphs are combined into a ``Joining`` rendered as a
    comma-separated list.

    Raises:
        ValueError: if a non-blank line does not have five columns or its
            decimal column is not an integer.
    """
    # NOTE(review): the file holds literal non-ASCII characters, so it is read
    # as UTF-8 explicitly rather than with the locale's default encoding --
    # confirm the data file really is UTF-8.
    with open(fname, encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            _literal, digraph, _hexa, dec, descr = line.split('\t')
            digraph = Digraph(digraph)
            codepoint = int(dec.strip(), 10)
            descriptions.setdefault(codepoint, descr.strip())
            if codepoint in digraphs:
                digraphs[codepoint] = Joining(', ',
                                              digraphs[codepoint], digraph)
            else:
                digraphs[codepoint] = digraph


class Digraph:
    """A digraph string that renders as inline code in Markdown.

    Every digraph string passed to the constructor is also recorded in the
    class-level ``instances`` set, which acts as a registry of all digraphs
    seen so far.
    """

    # Shared across all instances: the set of digraph strings created so far.
    instances = set()

    def __init__(self, digraph):
        self.digraph = digraph
        self.instances.add(digraph)

    def to_markdown(self):
        """Return the digraph wrapped in backticks."""
        return f'`{self.digraph}`'

    @classmethod
    def chars(cls):
        """Return a sorted list of the distinct characters used by any
        registered digraph."""
        alphabet = set()
        for registered in cls.instances:
            alphabet.update(registered)
        return sorted(alphabet)


class LiteralMarkdown:
    """Wrap a string that is already Markdown.

    Because the object has a ``to_markdown`` method, the module-level
    ``to_markdown`` function returns its text unchanged instead of escaping
    it.
    """

    def __init__(self, markdown):
        self.markdown = markdown

    def to_markdown(self):
        """Return the wrapped Markdown text unchanged."""
        return self.markdown


class Joining:
    """Several Markdown-renderable parts glued together by a separator.

    Each part must provide a ``to_markdown()`` method; a ``Joining`` has one
    itself, so instances can be nested.
    """

    def __init__(self, joiner, *markdowns):
        self.joiner, self.markdowns = joiner, markdowns

    def to_markdown(self):
        """Render every part and join the results with ``joiner``."""
        rendered = [part.to_markdown() for part in self.markdowns]
        return self.joiner.join(rendered)


# Pre-formatted placeholder cells; as LiteralMarkdown objects their asterisks
# are emitted as-is rather than escaped.
# NA is what display() returns for characters it will not show literally.
NA = LiteralMarkdown('*n/a*')
# UNKNOWN is the fallback when unicodedata has no name for a code point.
UNKNOWN = LiteralMarkdown('*unknown*')


def display(x):
    """Return the character for code point *x*, or ``NA`` if it should not
    be shown literally.

    Whitespace characters and characters whose ``repr`` is a ``\\x`` escape
    are replaced by ``NA``; code point 0xA0 is always returned as-is.
    """
    char = chr(x)
    if x == 0xA0:
        return char
    hidden = char in string.whitespace or repr(char).startswith("'\\x")
    return NA if hidden else char


# Table columns as (header label, function) pairs.  Each function maps an
# integer code point to the cell content: either a plain string, which
# to_markdown() escapes, or an object with its own to_markdown() method.
fields = [
    ('Actual', display),
    # Prefer the description from the digraph file, then the Unicode name.
    ('Description',
     lambda x: (descriptions.get(x) or unicodedata.name(chr(x), UNKNOWN))),
    ('Vim', lambda x: digraphs.get(x, '')),
    # Named entity when one exists; otherwise a numeric character reference,
    # which must start with "&#x" ("&x20AC;" is not a valid reference).
    ('HTML', lambda x: '&%s;' % codepoint2name.get(x, '#x%X' % x)),
    ('Hex', lambda x: 'x%X' % x),
    ('Dec', str),
    ('UTF-8', lambda x: ' '.join(map('{:02X}'.format, chr(x).encode('utf-8')))),
    # 'utf-16-be' rather than 'utf-16': the latter prepends a byte order mark
    # and uses the platform's byte order, so every cell would start with the
    # BOM bytes and the output would vary between machines.
    ('UTF-16',
     lambda x: ' '.join(map('{:02X}'.format, chr(x).encode('utf-16-be')))),
    ('Category', lambda x: unicodedata.category(chr(x))),
]


# Split the column definitions into parallel tuples: header labels and the
# per-code-point cell functions.
labels, funcs = zip(*fields)

# (name, anchor) of every section, appended by process_row() in the order
# sections are encountered; used to print the table of contents.
section_headers = []


def process_row(c):
    """Turn one item produced by ``genchars`` into a tuple of table cells.

    * ``int`` code point: one cell per entry in ``funcs``, each passed
      through ``to_markdown``.
    * ``(name, href, anchor)`` tuple: a section row.  The section is also
      recorded in ``section_headers``.  A falsy anchor is replaced by the
      lower-cased name with spaces turned into dashes; a falsy href yields
      an unlinked heading.  The row is padded with empty cells up to the
      number of columns.
    * ``str``: shorthand for ``(name, None, None)``.

    Raises:
        TypeError: for any other type of item.
    """
    if isinstance(c, str):
        c = (c, None, None)

    if isinstance(c, int):
        return tuple(to_markdown(func(c)) for func in funcs)
    if not isinstance(c, tuple):
        raise TypeError(type(c))

    name, href, anchor = c
    if not anchor:
        anchor = name.lower().replace(' ', '-')
    section_headers.append((name, anchor))

    if href:
        cells = (f'<a id="{anchor}">*Section*</a>', f'[**{name}**]({href})')
    else:
        cells = ('*Section*', f'<a id="{anchor}">**{name}**</a>')
    padding = ('',) * (len(labels) - len(cells))
    return cells + padding


# Characters that would otherwise be interpreted as HTML or Markdown syntax.
# Applied in a single pass, so replacement text is never escaped again.
_MARKDOWN_ESCAPES = str.maketrans({
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '*': '\\*',
    '#': '\\#',
    '_': '\\_',
    # The output is a pipe-delimited table: a bare "|" would split the cell.
    '|': '\\|',
})


def to_markdown(s):
    """Return the Markdown text for one table cell.

    Objects that provide a ``to_markdown()`` method render themselves and are
    returned without further escaping.  Anything else is expected to be a
    string and gets its HTML- and Markdown-significant characters escaped.
    """
    if hasattr(s, 'to_markdown'):
        return s.to_markdown()
    return s.translate(_MARKDOWN_ESCAPES)



# Markdown printed verbatim before the table of contents and the data table.
# The backslash in front of the asterisk is doubled so that the literal does
# not contain an invalid escape sequence; the resulting text is unchanged.
FRONTMATTER = '''
# Unicode cheat sheet

A curated list of unicode characters I want to have quick reference toward,
including their literal presentation (where possible), description from the
unicode table, various representations, and how to enter it as a Vim digraph\\*.

They are grouped by category, including a link to the relevant Unicode block.
Also see [the full list of Unicode
blocks](https://en.wikipedia.org/wiki/Unicode_block)

To update the table by adding, deleting or moving various characters, edit the
function `genchars()` in 01-generate.py, then run this command:

```console
$ python3 01-generate.py 02-vim-digraphs.txt > 00-unicode-cheat-sheet.md
```

### Vim digraphs

The *Vim* column in this table lists Vim *digraphs*. To use these, go into
insertion mode and type `Ctrl-K` followed by the listed pair of keys. For
example, `Ctrl-K` + `C` + `o` generates the copyright symbol &copy;.  See
`:help digraph` or `:help ^K` for more information.

## Data

'''


if __name__ == '__main__':
    # Script entry point: read the digraph table named on the command line
    # and write the complete cheat sheet (Markdown) to standard output.
    if len(sys.argv) < 2:
        sys.exit(f'usage: {sys.argv[0]} DIGRAPH_FILE')
    load_digraphs(sys.argv[1])

    # Building the whole table first also fills section_headers, which the
    # table of contents below depends on.
    table = [process_row(c) for c in genchars()]

    # Column width: at least 3 (the separator row needs three dashes), at
    # least as wide as the header label, and as wide as the widest cell.
    lengths = [max(3, len(label)) for label in labels]
    for row in table:
        for i, cell in enumerate(row):
            lengths[i] = max(lengths[i], len(cell))

    def printrow(row):
        """Print one table row with every cell padded to its column width."""
        print(' | '.join(
            '{0:{1}s}'.format(x, length) for x, length in zip(row, lengths)
        ).rstrip())

    print(FRONTMATTER)
    for header_name, anchor in (*section_headers,
                                ('Unassigned Digraphs', 'unassigned-digraphs')):
        print(f'* [{header_name}](#{anchor})')
    print()
    for row in (labels, ('-' * length for length in lengths), *table):
        printrow(row)

    print('\n\n## Unassigned Digraphs <a id="unassigned-digraphs"></a>\n\n')
    print('Vim provides', len(Digraph.instances), 'pre-defined digraphs',
          'using the following characters:')
    chars = Digraph.chars()
    print('', '```', ''.join(chars), '```', sep='\n')

    # Classify every ordered pair of digraph characters that is not itself a
    # pre-defined digraph.
    # NOTE(review): pairs starting with '_' are skipped and, because of the
    # strict ``x < y``, doubled characters (x == y) are never collected even
    # though the listing below has a case for them -- confirm the intent.
    unassigned_reverse, unassigned = [], []
    for x in chars:
        if x == '_':
            continue
        for y in chars:
            di = x + y
            if di in Digraph.instances:
                continue
            elif y + x in Digraph.instances:
                unassigned_reverse.append(di)
            elif x < y:
                unassigned.append(di)
    if unassigned_reverse:
        print('The following', len(unassigned_reverse), 'digraphs are',
              'unassigned, but their reverse is assigned and therefore Vim',
              'will treat them as their reverse.\n```')
        for line in paginate(unassigned_reverse, 18):
            print(*line, sep=', ')
        print('```')
    if unassigned:
        # NOTE(review): the count assumes every entry stands for two digraphs
        # (xy and yx), but entries ending in '_' are listed as one -- confirm.
        print('The following', 2*len(unassigned), 'are fully unassigned.\n```')
        for line in paginate(
            (x+y if y in (x, '_') else ''.join((x, y, '/', y, x))
             for x, y in unassigned),
            10
        ):
            print(*line, sep=', ')
        print('```')
