snac2/format.c

277 lines
7.4 KiB
C
Raw Normal View History

2022-10-07 19:30:54 +03:00
/* snac - A simple, minimalistic ActivityPub instance */
2023-07-28 12:34:18 +03:00
/* copyright (c) 2022 - 2023 grunfink et al. / MIT license */
2022-10-07 19:30:54 +03:00
#include "xs.h"
#include "xs_regex.h"
#include "xs_mime.h"
2022-10-07 19:30:54 +03:00
#include "snac.h"
/* emoticons, people laughing and such */

/* Flat table of (text, replacement) pairs: element 2 * i is the text to
   look for and element 2 * i + 1 is what it gets replaced with. The table
   is closed by a pair of NULLs. not_really_markdown() applies the pairs in
   table order, so ">:-(" has to stay ahead of ":-(" (which it contains). */
const char *smileys[] = {
    /* classic ASCII faces */
    ":-)",          "🙂",
    ":-D",          "😀",
    "X-D",          "😆",
    ";-)",          "😉",
    "B-)",          "😎",
    ">:-(",         "😡",
    ":-(",          "😞",
    ":-*",          "😘",
    ":-/",          "😕",
    "8-o",          "😲",
    "%-)",          "🤪",
    ":_(",          "😢",
    ":-|",          "😐",
    "<3",           "&#10084;&#65039;",

    /* :shortcode: style names, replaced by numeric HTML entities */
    ":facepalm:",   "&#129318;",
    ":shrug:",      "&#129335;",
    ":shrug2:",     "&#175;\\_(&#12484;)_/&#175;",
    ":eyeroll:",    "&#128580;",
    ":beer:",       "&#127866;",
    ":beers:",      "&#127867;",
    ":munch:",      "&#128561;",
    ":thumb:",      "&#128077;",

    NULL,           NULL
};
static xs_str *format_line(const char *line, xs_list **attach)
/* Formats the inline markup of one line and returns the result as a
   new string:
     `text`     -> <code>text</code>, with text HTML-encoded
     **text**   -> <b>text</b>
     *text*     -> <i>text</i>
     http(s) URL -> <a href="..." target="_blank">...</a>
   If attach is not NULL, a URL whose MIME type (as reported by
   xs_mime_by_ext()) starts with "image/" is not linked; an Image entry
   for it is appended to *attach instead and nothing is written to the
   output. Bold and italic content is copied as is, without encoding. */
{
    xs_str *s = xs_str_new(NULL);
    char *p, *v;
    int n;

    /* split by markup: odd positions hold the pieces that matched the
       markup expression, even positions the text around them */
    xs *sm = xs_regex_split(line,
        "(`[^`]+`|\\*\\*?[^\\*]+\\*?\\*|https?:/" "/[^[:space:]]+)");

    p = sm;

    for (n = 0; xs_list_iter(&p, &v); n++) {
        if (!(n & 0x1)) {
            /* surrounding text, copy directly */
            s = xs_str_cat(s, v);
            continue;
        }

        /* markup */
        if (xs_startswith(v, "`")) {
            /* drop the backticks and encode what is left */
            xs *s1 = xs_crop_i(xs_dup(v), 1, -1);
            xs *e1 = encode_html(s1);
            xs *s2 = xs_fmt("<code>%s</code>", e1);

            s = xs_str_cat(s, s2);
        }
        else
        if (xs_startswith(v, "**")) {
            /* must be tested before the single asterisk case */
            xs *s1 = xs_crop_i(xs_dup(v), 2, -2);
            xs *s2 = xs_fmt("<b>%s</b>", s1);

            s = xs_str_cat(s, s2);
        }
        else
        if (xs_startswith(v, "*")) {
            xs *s1 = xs_crop_i(xs_dup(v), 1, -1);
            xs *s2 = xs_fmt("<i>%s</i>", s1);

            s = xs_str_cat(s, s2);
        }
        else
        if (xs_startswith(v, "http")) {
            /* u is the URL with every # written as an entity; v2 is u
               after xs_strip_chars_i(".") -- presumably trimming dots
               at the ends, like a sentence-closing period (confirm in
               xs.h) */
            xs *u  = xs_replace(v, "#", "&#35;");
            xs *v2 = xs_strip_chars_i(xs_dup(u), ".");

            const char *mime = xs_mime_by_ext(v2);

            if (attach == NULL || !xs_startswith(mime, "image/")) {
                /* regular link: v2 is the target, u the visible text */
                xs *s1 = xs_fmt("<a href=\"%s\" target=\"_blank\">%s</a>", v2, u);

                s = xs_str_cat(s, s1);
            }
            else {
                /* if it's a link to an image, insert it as an attachment */
                xs *d = xs_dict_new();

                d = xs_dict_append(d, "mediaType", mime);
                d = xs_dict_append(d, "url",       v2);
                d = xs_dict_append(d, "name",      "");
                d = xs_dict_append(d, "type",      "Image");

                *attach = xs_list_append(*attach, d);
            }
        }
        else
            s = xs_str_cat(s, v);
    }

    return s;
}
2022-10-07 19:30:54 +03:00
2022-11-13 11:12:20 +03:00
xs_str *not_really_markdown(const char *content, xs_list **attach)
/* Formats a content using some Markdown rules, working line by line:
     - a line that is exactly three backticks opens or closes a <pre>
       block; lines inside it are HTML-encoded instead of formatted;
     - a formatted line starting with --- produces an <hr> followed by
       the rest of the line;
     - consecutive formatted lines starting with > are grouped inside
       a <blockquote>;
     - any other line is passed through format_line() and gets a <br>.
   After that, the text emoticons listed in smileys[] are replaced all
   over the generated string. attach is handed to format_line() as is.
   Returns a new string. */
{
    xs_str *s = xs_str_new(NULL);
    int in_pre = 0;
    int in_blq = 0;
    char *p, *v;

    /* work by lines */
    xs *list = xs_split(content, "\n");

    p = list;
    while (xs_list_iter(&p, &v)) {
        xs *ss = NULL;

        if (strcmp(v, "```") == 0) {
            /* fence line: toggle the preformatted block */
            if (!in_pre)
                s = xs_str_cat(s, "<pre>");
            else
                s = xs_str_cat(s, "</pre>");

            in_pre = !in_pre;
            continue;
        }

        if (in_pre) {
            /* encode all HTML characters while inside the pre element */
            ss = encode_html(v);

            s = xs_str_cat(s, ss);
            s = xs_str_cat(s, "<br>");
            continue;
        }

        ss = xs_strip_i(format_line(v, attach));

        if (xs_startswith(ss, "---")) {
            /* delete the --- */
            ss = xs_strip_i(xs_crop_i(ss, 3, 0));

            s = xs_str_cat(s, "<hr>");
            s = xs_str_cat(s, ss);
            continue;
        }

        if (xs_startswith(ss, ">")) {
            /* delete the > and subsequent spaces */
            ss = xs_strip_i(xs_crop_i(ss, 1, 0));

            if (!in_blq) {
                s = xs_str_cat(s, "<blockquote>");
                in_blq = 1;
            }

            s = xs_str_cat(s, ss);
            s = xs_str_cat(s, "<br>");
            continue;
        }

        /* a line that is not a quote ends any open blockquote */
        if (in_blq) {
            s = xs_str_cat(s, "</blockquote>");
            in_blq = 0;
        }

        s = xs_str_cat(s, ss);
        s = xs_str_cat(s, "<br>");
    }

    /* close whatever was left open, innermost element first: quote
       lines are never recognized while inside a <pre>, so when both
       are open the <pre> is the one that was opened last */
    if (in_pre)
        s = xs_str_cat(s, "</pre>");

    if (in_blq)
        s = xs_str_cat(s, "</blockquote>");

    /* some beauty fixes */
    s = xs_replace_i(s, "<br><br><blockquote>", "<br><blockquote>");
    s = xs_replace_i(s, "</blockquote><br>", "</blockquote>");
    s = xs_replace_i(s, "</pre><br>", "</pre>");

    /* traditional emoticons: smileys[] is a NULL-terminated list of
       (text, replacement) pairs */
    for (const char **emo = smileys; *emo; emo += 2)
        s = xs_replace_i(s, emo[0], emo[1]);

    return s;
}
/* NULL-terminated list of the tag names that sanitize() lets through.
   sanitize() lowercases the tag name before comparing it with strcmp(),
   so every entry here has to be in lower case. */
const char *valid_tags[] = {
    "a",
    "p",
    "br",
    "br/",
    "blockquote",
    "ul",
    "ol",
    "li",
    "cite",
    "small",
    "span",
    "i",
    "b",
    "u",
    "pre",
    "code",
    "em",
    "strong",
    "hr",
    "img",
    "del",
    NULL
};
2023-05-21 21:11:06 +03:00
xs_str *sanitize(const char *content)
/* Cleans dangerous HTML output. The content is split by tags; the text
   between them is copied unchanged. A tag whose name is in valid_tags[]
   is rebuilt from scratch, keeping only its src, href, rel, class and
   target attributes (and only when their values are in double quotes).
   Any other tag is passed through encode_html(), so it shows up as text
   instead of being interpreted. Returns a new string. */
{
    xs_str *s = xs_str_new(NULL);
    char *p, *v;
    int n;

    /* split by tags: odd positions hold the tags, even positions the
       text around them */
    xs *sl = xs_regex_split(content, "</?[^>]+>");

    p = sl;

    for (n = 0; xs_list_iter(&p, &v); n++) {
        if (n & 0x1) {
            /* v is a complete tag; the character after the < tells
               whether it is a closing one */
            int closing = v[1] == '/';

            /* take what is between the brackets (minus the slash of a
               closing tag) and use its first space-separated word, in
               lower case, as the tag name */
            xs *s1  = xs_strip_i(xs_crop_i(xs_dup(v), closing ? 2 : 1, -1));
            xs *l1  = xs_split_n(s1, " ", 1);
            xs *tag = xs_tolower_i(xs_dup(xs_list_get(l1, 0)));
            int i;

            /* check if it's one of the valid tags */
            for (i = 0; valid_tags[i]; i++) {
                if (strcmp(tag, valid_tags[i]) == 0)
                    break;
            }

            if (valid_tags[i]) {
                /* accepted tag: rebuild it with only the accepted elements */
                xs *el = xs_regex_match(v, "(src|href|rel|class|target)=\"[^\"]*\"");
                xs *s3 = xs_join(el, " ");

                xs *s2 = xs_fmt("<%s%s%s%s>",
                    closing ? "/" : "", tag, xs_list_len(el) ? " " : "", s3);

                s = xs_str_cat(s, s2);
            }
            else {
                /* unknown tag: show it as encoded text */
                xs *el = encode_html(v);

                s = xs_str_cat(s, el);
            }
        }
        else {
            /* non-tag */
            s = xs_str_cat(s, v);
        }
    }

    return s;
}
2023-07-11 20:45:58 +03:00
xs_str *encode_html(const char *str)
/* Returns a new copy of str with &, <, >, " and ' replaced by HTML
   entities. An already encoded <br> is turned back into a real one. */
{
    /* replacements applied, in this order, after the ampersand one */
    static const char *const subst[][2] = {
        { "<",          "&lt;"  },
        { ">",          "&gt;"  },
        { "\"",         "&#34;" },
        { "'",          "&#39;" },
        /* Restore only <br>. Probably safe. Let's hope nothing goes
           wrong with this. */
        { "&lt;br&gt;", "<br>"  }
    };
    size_t i;

    /* ampersands go first, so that the ones belonging to the entities
       generated below are not encoded a second time */
    xs_str *encoded = xs_replace(str, "&", "&amp;");

    for (i = 0; i < sizeof(subst) / sizeof(subst[0]); i++)
        encoded = xs_replace_i(encoded, subst[i][0], subst[i][1]);

    return encoded;
}