Post parsing now builds a 'corpus' for each post, useful for a future search feature and, possibly, SEO keywords.

This commit is contained in:
Adam Piontek 2021-04-01 18:09:52 -04:00
parent b1b9c09a79
commit df2cc13167
4 changed files with 88 additions and 66 deletions

2
.iex.exs Normal file
View file

@ -0,0 +1,2 @@
alias Home73k.Blog
alias Home73k.Blog.Post

View file

@ -17,13 +17,12 @@ defmodule Home73k.Blog do
Post.parse!(post_path)
end
@posts posts
# @posts Enum.sort_by(posts, & &1.date, {:desc, NaiveDateTime})
@posts Enum.sort_by(posts, & &1.date, {:desc, NaiveDateTime})
# @tags posts |> Stream.flat_map(& &1.tags) |> Stream.uniq() |> Enum.sort()
@tags posts |> Stream.flat_map(& &1.tags) |> Stream.uniq() |> Enum.sort()
# Returns the value stored in the @posts module attribute.
def list_posts do
  @posts
end
# def list_tags, do: @tags
# Returns the value stored in the @tags module attribute.
def list_tags do
  @tags
end
# defmodule NotFoundError do
# defexception [:message, plug_status: 404]

View file

@ -1,8 +1,8 @@
defmodule Home73k.Blog.Post do
@enforce_keys [:title, :slug, :date, :author, :tags, :lede, :body]
defstruct [:title, :slug, :date, :author, :tags, :lede, :body]
@enforce_keys [:title, :slug, :date, :author, :tags, :lede, :body, :corpus]
defstruct [:title, :slug, :date, :author, :tags, :lede, :body, :corpus]
@title_slug_regex ~r/[^a-zA-Z0-9 ]/
@strip_words ~w(the and are for not but had has was all any too one you his her can that with have this will your from they want been much some very them into which then now get its youll youre)
@doc """
The public parse!/1 function begins the post parse process by reading
@ -15,13 +15,19 @@ defmodule Home73k.Blog.Post do
|> split_raw_file_data()
|> parse_frontmatter()
|> parse_lede()
|> parse_body()
|> build_corpus()
|> build_post()
end
# """ split_raw_file_data/1
# If we receive {:ok, file_data}, we split frontmatter from markdown
# content and return [raw_frontmatter, markdown]. Otherwise return nil.
# """
# NOTE(review): the next two clauses share the same head, `{:ok, file_data}`.
# As written, the first one always matches and the second is never reached.
# This looks like the pre- and post-change lines of a diff shown together;
# confirm which of the two is meant to remain.
#
# Variant 1: split on a newline, three or more dashes, and a newline,
# producing at most 2 parts.
defp split_raw_file_data({:ok, file_data}), do: String.split(file_data, ~r/\n-{3,}\n/, parts: 2)
# Variant 2: split on the literal "---" with `trim: true`, producing at most
# 2 parts.
defp split_raw_file_data({:ok, file_data}) do
file_data |> String.split("---", parts: 2, trim: true)
end
# Anything other than `{:ok, file_data}` yields nil.
defp split_raw_file_data(_), do: nil
# """ parse_frontmatter/1
@ -30,7 +36,7 @@ defmodule Home73k.Blog.Post do
# """
# Passes the raw frontmatter to parse_frontmatter_string/1 and pairs the
# resulting map with the markdown. An `{:error, _}` result yields nil.
defp parse_frontmatter([fm, md]) do
case parse_frontmatter_string(fm) do
# NOTE(review): the two clauses below have the same pattern, so as written
# only the first can match. They look like the pre- and post-change lines of
# a diff shown together; confirm which one is meant to remain.
{%{} = parsed_fm, _} -> {parsed_fm, md}
# This variant also runs the map through set_post_slug/1 and trims the
# markdown.
{%{} = parsed_fm, _} -> {set_post_slug(parsed_fm), String.trim(md)}
{:error, _} -> nil
end
end
@ -47,11 +53,50 @@ defmodule Home73k.Blog.Post do
defp parse_lede(_), do: nil
# TODO:
# |> parse_body()
# - convert to markdown
# - extract any code parts to mark with pygments?
# - figure that whole thing out
# """ parse_body/1
# Convert body markdown to html
# TODO: handle syntax highlighting
defp parse_body({fm, md}) do
  # Render the markdown with Earmark and store the result under :body.
  html = Earmark.as_html!(md)
  Map.put(fm, :body, html)
end

# Any other input yields nil.
defp parse_body(_), do: nil
# """ build_corpus/1
# Create a searchable word list for the post, for live searching
defp build_corpus(%{title: title, lede: lede, body: body, tags: tags} = post_data) do
  # Matches of this pattern are replaced by a single space: code blocks,
  # html tags, numeric and named html entities, newlines, and forward and
  # back slashes.
  scrub = ~r/(<pre><code(.|\n)*?<\/code><\/pre>)|(<(.|\n)+?>)|(&#(.)+?;)|(&(.)+?;)|\n|\/|\\/

  words =
    # Source text: tags, title, lede (a single space when lede is nil) and body.
    (tags ++ [title, lede || " ", body])
    |> Enum.join(" ")
    |> String.downcase()
    |> String.replace(scrub, " ")
    # Keep only lowercase letters, digits and spaces.
    |> String.replace(~r/[^a-z0-9 ]/, "")
    |> String.split(" ", trim: true)
    # Drop words shorter than 3 characters and words in @strip_words.
    |> Enum.reject(&reject_word?/1)
    |> Enum.uniq()
    |> Enum.join(" ")

  # The corpus is stored as one space-delimited string of unique words.
  Map.put(post_data, :corpus, words)
end

# Any input without the :title, :lede, :body and :tags keys yields nil.
defp build_corpus(_), do: nil
# """ build_post/1
# Create post struct from post data map
# Any map is turned into a struct of this module via struct!/2.
defp build_post(post_data) when is_map(post_data), do: struct!(__MODULE__, post_data)

# Non-map input yields nil.
defp build_post(_), do: nil
######################################################################
# HELPERS
@ -74,62 +119,36 @@ defmodule Home73k.Blog.Post do
# Handle split of post body. If lede found, return as html with body.
# Otherwise return nil with body.
# """
# NOTE(review): the next two clauses share the same `[lede, body]` head, so
# as written the first always matches and the second is never reached. They
# look like the pre- and post-change lines of a diff shown together; confirm
# which one is meant to remain.
#
# Variant 1: lede rendered to html as-is, body returned unchanged.
defp extract_lede([lede, body]), do: {Earmark.as_html!(lede), body}
# Variant 2: trailing whitespace is trimmed from the lede before rendering,
# and leading whitespace is trimmed from the body.
defp extract_lede([lede, body]),
do: {String.trim_trailing(lede) |> Earmark.as_html!(), String.trim_leading(body)}
# Single-element list: the lede is nil and the body is returned unchanged.
defp extract_lede([body]), do: {nil, body}
# ##################################################
# ##################################################
# ##################################################
# ##################################################
# ##################################################
# defp parse_split_file_data(["", fm, md]) do
# Code.eval_string(fm)
# |> parse_lede(md)
# end
# """ set_frontmatter_slug
# If no slug in frontmatter, convert title to slug and add to map
# """
# Frontmatter that already has a :slug key is returned untouched.
defp set_post_slug(%{slug: _} = fm), do: fm

# Otherwise the slug is derived from the title and stored under :slug.
defp set_post_slug(%{title: title} = fm) do
  slug = parse_title_to_slug(title)
  Map.put(fm, :slug, slug)
end
# defp parse_lede({%{summary: summ} = fm, _}, md) do
# Earmark.as_html(md)
# |> parse_post(Earmark.as_html(summ), fm)
# end
# """ parse_title_to_slug
# Takes a post title and returns a slug cleansed for URI request path
# """
defp parse_title_to_slug(title) do
  title
  |> String.downcase()
  # Keep only lowercase letters, digits and spaces.
  |> String.replace(~r/[^a-z0-9 ]/, "")
  |> String.split(" ", trim: true)
  # Drop words shorter than 3 characters and words in @strip_words.
  |> Enum.reject(&reject_word?/1)
  |> Enum.join("-")
end
# defp parse_lede([summ, _] = parts, fm) do
# parts
# |> Enum.join(" ")
# |> Earmark.as_html()
# |> parse_post(Earmark.as_html(summ), fm)
# end
# defp parse_lede(md, fm) do
# Earmark.as_html(md)
# |> parse_post({:ok, nil, []}, fm)
# end
# defp parse_title_to_slug(title) do
# Regex.replace(@title_slug_regex, title, "")
# |> String.replace(" ", "-")
# |> String.downcase()
# end
# defp build_post(main_html, summ_html, fm) do
# fm
# |> Map.put_new(:slug, parse_title_to_slug(fm.title))
# |> Map.put_new(:author, "Author Name")
# |> Map.put_new(:tags, [])
# |> Map.put(:summary, summ_html)
# |> Map.put(:body, main_html)
# end
# defp parse_post({:ok, main_html, _}, {:ok, summ_html, _}, fm) do
# post = build_post(main_html, summ_html, fm)
# struct!(__MODULE__, post)
# end
# defp parse_post(_, _, _), do: nil
# """ reject_word?
# Determines if a word should be rejected, based on char length < 3,
# or if word is in @strip_words
# Used by parse_title_to_slug and build_corpus
# """
defp reject_word?(word) do
  # Both operands are booleans, so strict `or` behaves the same as `||` here.
  String.length(word) < 3 or word in @strip_words
end
end

View file

@ -13,6 +13,7 @@
"git_cli": {:hex, :git_cli, "0.3.0", "a5422f9b95c99483385b976f5d43f7e8233283a47cda13533d7c16131cb14df5", [:mix], [], "hexpm", "78cb952f4c86a41f4d3511f1d3ecb28edb268e3a7df278de2faa1bd4672eaf9b"},
"hackney": {:hex, :hackney, "1.17.4", "99da4674592504d3fb0cfef0db84c3ba02b4508bae2dff8c0108baa0d6e0977c", [:rebar3], [{:certifi, "~>2.6.1", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "~>6.1.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "~>1.0.0", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~>1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:parse_trans, "3.3.1", [hex: :parse_trans, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "~>1.1.0", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}, {:unicode_util_compat, "~>0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "de16ff4996556c8548d512f4dbe22dd58a587bf3332e7fd362430a7ef3986b16"},
"html_entities": {:hex, :html_entities, "0.5.2", "9e47e70598da7de2a9ff6af8758399251db6dbb7eebe2b013f2bbd2515895c3c", [:mix], [], "hexpm", "c53ba390403485615623b9531e97696f076ed415e8d8058b1dbaa28181f4fdcc"},
"html_sanitize_ex": {:hex, :html_sanitize_ex, "1.4.1", "e8a67da405fe9f0d1be121a40a60f70811192033a5b8d00a95dddd807f5e053e", [:mix], [{:mochiweb, "~> 2.15", [hex: :mochiweb, repo: "hexpm", optional: false]}], "hexpm", "68d92656f47cd73598c45ad2394561f025c8c65d146001b955fd7b517858962a"},
"idna": {:hex, :idna, "6.1.1", "8a63070e9f7d0c62eb9d9fcb360a7de382448200fbbd1b106cc96d3d8099df8d", [:rebar3], [{:unicode_util_compat, "~>0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "92376eb7894412ed19ac475e4a86f7b413c1b9fbb5bd16dccd57934157944cea"},
"jason": {:hex, :jason, "1.2.2", "ba43e3f2709fd1aa1dce90aaabfd039d000469c05c56f0b8e31978e03fa39052", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "18a228f5f0058ee183f29f9eae0805c6e59d61c3b006760668d8d18ff0d12179"},
"makeup": {:hex, :makeup, "1.0.5", "d5a830bc42c9800ce07dd97fa94669dfb93d3bf5fcf6ea7a0c67b2e0e4a7f26c", [:mix], [{:nimble_parsec, "~> 0.5 or ~> 1.0", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "cfa158c02d3f5c0c665d0af11512fed3fba0144cf1aadee0f2ce17747fba2ca9"},
@ -20,6 +21,7 @@
"metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm", "69b09adddc4f74a40716ae54d140f93beb0fb8978d8636eaded0c31b6f099f16"},
"mime": {:hex, :mime, "1.5.0", "203ef35ef3389aae6d361918bf3f952fa17a09e8e43b5aa592b93eba05d0fb8d", [:mix], [], "hexpm", "55a94c0f552249fc1a3dd9cd2d3ab9de9d3c89b559c2bd01121f824834f24746"},
"mimerl": {:hex, :mimerl, "1.2.0", "67e2d3f571088d5cfd3e550c383094b47159f3eee8ffa08e64106cdf5e981be3", [:rebar3], [], "hexpm", "f278585650aa581986264638ebf698f8bb19df297f66ad91b18910dfc6e19323"},
"mochiweb": {:hex, :mochiweb, "2.20.1", "e4dbd0ed716f076366ecf62ada5755a844e1d95c781e8c77df1d4114be868cdf", [], [], "hexpm", "d1aeee7870470d2fa9eae0b3d5ab6c33801aa2d82b10e9dade885c5c921b36aa"},
"nimble_parsec": {:hex, :nimble_parsec, "1.1.0", "3a6fca1550363552e54c216debb6a9e95bd8d32348938e13de5eda962c0d7f89", [:mix], [], "hexpm", "08eb32d66b706e913ff748f11694b17981c0b04a33ef470e33e11b3d3ac8f54b"},
"parse_trans": {:hex, :parse_trans, "3.3.1", "16328ab840cc09919bd10dab29e431da3af9e9e7e7e6f0089dd5a2d2820011d8", [:rebar3], [], "hexpm", "07cd9577885f56362d414e8c4c4e6bdf10d43a8767abb92d24cbe8b24c54888b"},
"phoenix": {:hex, :phoenix, "1.5.8", "71cfa7a9bb9a37af4df98939790642f210e35f696b935ca6d9d9c55a884621a4", [:mix], [{:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}, {:phoenix_html, "~> 2.13", [hex: :phoenix_html, repo: "hexpm", optional: true]}, {:phoenix_pubsub, "~> 2.0", [hex: :phoenix_pubsub, repo: "hexpm", optional: false]}, {:plug, "~> 1.10", [hex: :plug, repo: "hexpm", optional: false]}, {:plug_cowboy, "~> 1.0 or ~> 2.2", [hex: :plug_cowboy, repo: "hexpm", optional: true]}, {:plug_crypto, "~> 1.1.2 or ~> 1.2", [hex: :plug_crypto, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "35ded0a32f4836168c7ab6c33b88822eccd201bcd9492125a9bea4c54332d955"},