post parsing now builds a 'corpus' for each post, useful for future search feature, and ... seo keywords?

This commit is contained in:
Adam Piontek 2021-04-01 18:09:52 -04:00
parent b1b9c09a79
commit df2cc13167
4 changed files with 88 additions and 66 deletions

2
.iex.exs Normal file
View file

@ -0,0 +1,2 @@
alias Home73k.Blog
alias Home73k.Blog.Post

View file

@ -17,13 +17,12 @@ defmodule Home73k.Blog do
Post.parse!(post_path) Post.parse!(post_path)
end end
@posts posts @posts Enum.sort_by(posts, & &1.date, {:desc, NaiveDateTime})
# @posts Enum.sort_by(posts, & &1.date, {:desc, NaiveDateTime})
# @tags posts |> Stream.flat_map(& &1.tags) |> Stream.uniq() |> Enum.sort() @tags posts |> Stream.flat_map(& &1.tags) |> Stream.uniq() |> Enum.sort()
def list_posts, do: @posts def list_posts, do: @posts
# def list_tags, do: @tags def list_tags, do: @tags
# defmodule NotFoundError do # defmodule NotFoundError do
# defexception [:message, plug_status: 404] # defexception [:message, plug_status: 404]

View file

@ -1,8 +1,8 @@
defmodule Home73k.Blog.Post do defmodule Home73k.Blog.Post do
@enforce_keys [:title, :slug, :date, :author, :tags, :lede, :body] @enforce_keys [:title, :slug, :date, :author, :tags, :lede, :body, :corpus]
defstruct [:title, :slug, :date, :author, :tags, :lede, :body] defstruct [:title, :slug, :date, :author, :tags, :lede, :body, :corpus]
@title_slug_regex ~r/[^a-zA-Z0-9 ]/ @strip_words ~w(the and are for not but had has was all any too one you his her can that with have this will your from they want been much some very them into which then now get its youll youre)
@doc """ @doc """
The public parse!/1 function begins the post parse process by reading The public parse!/1 function begins the post parse process by reading
@ -15,13 +15,19 @@ defmodule Home73k.Blog.Post do
|> split_raw_file_data() |> split_raw_file_data()
|> parse_frontmatter() |> parse_frontmatter()
|> parse_lede() |> parse_lede()
|> parse_body()
|> build_corpus()
|> build_post()
end end
# """ split_raw_file_data/1 # """ split_raw_file_data/1
# If we receive {:ok, file_data}, we split frontmatter from markdown # If we receive {:ok, file_data}, we split frontmatter from markdown
# content and return [raw_frontmatter, markdown]. Otherwise return nil. # content and return [raw_frontmatter, markdown]. Otherwise return nil.
# """ # """
defp split_raw_file_data({:ok, file_data}), do: String.split(file_data, ~r/\n-{3,}\n/, parts: 2) defp split_raw_file_data({:ok, file_data}) do
file_data |> String.split("---", parts: 2, trim: true)
end
defp split_raw_file_data(_), do: nil defp split_raw_file_data(_), do: nil
# """ parse_frontmatter/1 # """ parse_frontmatter/1
@ -30,7 +36,7 @@ defmodule Home73k.Blog.Post do
# """ # """
defp parse_frontmatter([fm, md]) do defp parse_frontmatter([fm, md]) do
case parse_frontmatter_string(fm) do case parse_frontmatter_string(fm) do
{%{} = parsed_fm, _} -> {parsed_fm, md} {%{} = parsed_fm, _} -> {set_post_slug(parsed_fm), String.trim(md)}
{:error, _} -> nil {:error, _} -> nil
end end
end end
@ -47,11 +53,50 @@ defmodule Home73k.Blog.Post do
defp parse_lede(_), do: nil defp parse_lede(_), do: nil
# TODO: # """ parse_body/1
# |> parse_body() # Convert body markdown to html
# - convert to markdown # TODO: handle syntax highlighting
# - extract any code parts to mark with pygments? defp parse_body({fm, md}) do
# - figure that whole thing out Map.put(fm, :body, Earmark.as_html!(md))
end
defp parse_body(_), do: nil
# """ build_corpus/1
# Create a searchable word list for the post, for live searching.
# Expects a post data map that already holds :title, :lede, :body and :tags
# (lede may be nil) and returns that map with :corpus added: a space-delimited
# string of unique, lowercased words. Any other input returns nil.
# """
defp build_corpus(%{title: title, lede: lede, body: body, tags: tags} = post_data) do
  # scrub out (but replace with spaces):
  # code blocks, html tags, html entities, newlines, forward and back slashes.
  # Entities are matched strictly (&name; / &#123; / &#x1f;) so that a bare "&"
  # in a title or tag cannot swallow everything up to the next ";".
  html_scrub_regex =
    ~r/(<pre><code(.|\n)*?<\/code><\/pre>)|(<(.|\n)+?>)|(&#?[a-z0-9]+;)|\n|\/|\\/

  corpus =
    # initialize corpus string from: tags, title, lede (blank when nil), body
    (tags ++ [title, lede || " ", body])
    |> Enum.join(" ")
    |> String.downcase()
    |> String.replace(html_scrub_regex, " ")
    # restrict corpus to letters, numbers and spaces
    |> String.replace(~r/[^a-z0-9 ]/, "")
    # split to words (space delim), dropping empty strings as we go
    |> String.split(" ", trim: true)
    # reject all 0, 1, 2-letter words, and words in @strip_words
    |> Stream.reject(&reject_word?/1)
    # reduce to unique words and join back to space-delim string
    |> Stream.uniq()
    |> Enum.join(" ")

  # Finally, return post_data with corpus
  Map.put(post_data, :corpus, corpus)
end

defp build_corpus(_), do: nil
# """ build_post/1
# Turn the accumulated post data map into a struct of this module via
# struct!/2, which raises if an enforced key is missing or a key is unknown.
# Anything that is not a map (e.g. nil from an earlier step) yields nil.
# """
defp build_post(post_data) when is_map(post_data), do: struct!(__MODULE__, post_data)
defp build_post(_), do: nil
###################################################################### ######################################################################
# HELPERS # HELPERS
@ -74,62 +119,36 @@ defmodule Home73k.Blog.Post do
# Handle split of post body. If lede found, return as html with body. # Handle split of post body. If lede found, return as html with body.
# Otherwise return nil with body. # Otherwise return nil with body.
# """ # """
defp extract_lede([lede, body]), do: {Earmark.as_html!(lede), body} defp extract_lede([lede, body]),
do: {String.trim_trailing(lede) |> Earmark.as_html!(), String.trim_leading(body)}
defp extract_lede([body]), do: {nil, body} defp extract_lede([body]), do: {nil, body}
# ################################################## # """ set_post_slug/1
# ################################################## # If no slug in frontmatter, convert title to slug and add to map
# ################################################## # """
# ################################################## defp set_post_slug(%{slug: _} = fm), do: fm
# ##################################################
# defp parse_split_file_data(["", fm, md]) do
# Code.eval_string(fm)
# |> parse_lede(md)
# end
# defp parse_split_file_data(_), do: nil defp set_post_slug(%{title: title} = fm) do
Map.put(fm, :slug, parse_title_to_slug(title))
end
# defp parse_lede({%{summary: summ} = fm, _}, md) do # """ parse_title_to_slug
# Earmark.as_html(md) # Takes a post title and returns a slug cleansed for URI request path
# |> parse_post(Earmark.as_html(summ), fm) # """
# end defp parse_title_to_slug(title) do
title = String.downcase(title)
# defp parse_lede({%{} = fm, _}, md) do Regex.replace(~r/[^a-z0-9 ]/, title, "")
# String.split(md, "<!--more-->", parts: 2) |> String.split(" ", trim: true)
# |> parse_lede(fm) |> Stream.reject(&reject_word?/1)
# end |> Enum.join("-")
end
# defp parse_lede([summ, _] = parts, fm) do # """ reject_word?
# parts # Determines if a word should be rejected, based on char length < 3,
# |> Enum.join(" ") # or if word is in @strip_words
# |> Earmark.as_html() # Used by parse_title_to_slug and build_corpus
# |> parse_post(Earmark.as_html(summ), fm) # """
# end defp reject_word?(word), do: String.length(word) < 3 || word in @strip_words
# defp parse_lede(md, fm) do
# Earmark.as_html(md)
# |> parse_post({:ok, nil, []}, fm)
# end
# defp parse_title_to_slug(title) do
# Regex.replace(@title_slug_regex, title, "")
# |> String.replace(" ", "-")
# |> String.downcase()
# end
# defp build_post(main_html, summ_html, fm) do
# fm
# |> Map.put_new(:slug, parse_title_to_slug(fm.title))
# |> Map.put_new(:author, "Author Name")
# |> Map.put_new(:tags, [])
# |> Map.put(:summary, summ_html)
# |> Map.put(:body, main_html)
# end
# defp parse_post({:ok, main_html, _}, {:ok, summ_html, _}, fm) do
# post = build_post(main_html, summ_html, fm)
# struct!(__MODULE__, post)
# end
# defp parse_post(_, _, _), do: nil
end end

View file

@ -13,6 +13,7 @@
"git_cli": {:hex, :git_cli, "0.3.0", "a5422f9b95c99483385b976f5d43f7e8233283a47cda13533d7c16131cb14df5", [:mix], [], "hexpm", "78cb952f4c86a41f4d3511f1d3ecb28edb268e3a7df278de2faa1bd4672eaf9b"}, "git_cli": {:hex, :git_cli, "0.3.0", "a5422f9b95c99483385b976f5d43f7e8233283a47cda13533d7c16131cb14df5", [:mix], [], "hexpm", "78cb952f4c86a41f4d3511f1d3ecb28edb268e3a7df278de2faa1bd4672eaf9b"},
"hackney": {:hex, :hackney, "1.17.4", "99da4674592504d3fb0cfef0db84c3ba02b4508bae2dff8c0108baa0d6e0977c", [:rebar3], [{:certifi, "~>2.6.1", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "~>6.1.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "~>1.0.0", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~>1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:parse_trans, "3.3.1", [hex: :parse_trans, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "~>1.1.0", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}, {:unicode_util_compat, "~>0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "de16ff4996556c8548d512f4dbe22dd58a587bf3332e7fd362430a7ef3986b16"}, "hackney": {:hex, :hackney, "1.17.4", "99da4674592504d3fb0cfef0db84c3ba02b4508bae2dff8c0108baa0d6e0977c", [:rebar3], [{:certifi, "~>2.6.1", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "~>6.1.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "~>1.0.0", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~>1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:parse_trans, "3.3.1", [hex: :parse_trans, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "~>1.1.0", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}, {:unicode_util_compat, "~>0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "de16ff4996556c8548d512f4dbe22dd58a587bf3332e7fd362430a7ef3986b16"},
"html_entities": {:hex, :html_entities, "0.5.2", "9e47e70598da7de2a9ff6af8758399251db6dbb7eebe2b013f2bbd2515895c3c", [:mix], [], "hexpm", "c53ba390403485615623b9531e97696f076ed415e8d8058b1dbaa28181f4fdcc"}, "html_entities": {:hex, :html_entities, "0.5.2", "9e47e70598da7de2a9ff6af8758399251db6dbb7eebe2b013f2bbd2515895c3c", [:mix], [], "hexpm", "c53ba390403485615623b9531e97696f076ed415e8d8058b1dbaa28181f4fdcc"},
"html_sanitize_ex": {:hex, :html_sanitize_ex, "1.4.1", "e8a67da405fe9f0d1be121a40a60f70811192033a5b8d00a95dddd807f5e053e", [:mix], [{:mochiweb, "~> 2.15", [hex: :mochiweb, repo: "hexpm", optional: false]}], "hexpm", "68d92656f47cd73598c45ad2394561f025c8c65d146001b955fd7b517858962a"},
"idna": {:hex, :idna, "6.1.1", "8a63070e9f7d0c62eb9d9fcb360a7de382448200fbbd1b106cc96d3d8099df8d", [:rebar3], [{:unicode_util_compat, "~>0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "92376eb7894412ed19ac475e4a86f7b413c1b9fbb5bd16dccd57934157944cea"}, "idna": {:hex, :idna, "6.1.1", "8a63070e9f7d0c62eb9d9fcb360a7de382448200fbbd1b106cc96d3d8099df8d", [:rebar3], [{:unicode_util_compat, "~>0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "92376eb7894412ed19ac475e4a86f7b413c1b9fbb5bd16dccd57934157944cea"},
"jason": {:hex, :jason, "1.2.2", "ba43e3f2709fd1aa1dce90aaabfd039d000469c05c56f0b8e31978e03fa39052", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "18a228f5f0058ee183f29f9eae0805c6e59d61c3b006760668d8d18ff0d12179"}, "jason": {:hex, :jason, "1.2.2", "ba43e3f2709fd1aa1dce90aaabfd039d000469c05c56f0b8e31978e03fa39052", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "18a228f5f0058ee183f29f9eae0805c6e59d61c3b006760668d8d18ff0d12179"},
"makeup": {:hex, :makeup, "1.0.5", "d5a830bc42c9800ce07dd97fa94669dfb93d3bf5fcf6ea7a0c67b2e0e4a7f26c", [:mix], [{:nimble_parsec, "~> 0.5 or ~> 1.0", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "cfa158c02d3f5c0c665d0af11512fed3fba0144cf1aadee0f2ce17747fba2ca9"}, "makeup": {:hex, :makeup, "1.0.5", "d5a830bc42c9800ce07dd97fa94669dfb93d3bf5fcf6ea7a0c67b2e0e4a7f26c", [:mix], [{:nimble_parsec, "~> 0.5 or ~> 1.0", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "cfa158c02d3f5c0c665d0af11512fed3fba0144cf1aadee0f2ce17747fba2ca9"},
@ -20,6 +21,7 @@
"metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm", "69b09adddc4f74a40716ae54d140f93beb0fb8978d8636eaded0c31b6f099f16"}, "metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm", "69b09adddc4f74a40716ae54d140f93beb0fb8978d8636eaded0c31b6f099f16"},
"mime": {:hex, :mime, "1.5.0", "203ef35ef3389aae6d361918bf3f952fa17a09e8e43b5aa592b93eba05d0fb8d", [:mix], [], "hexpm", "55a94c0f552249fc1a3dd9cd2d3ab9de9d3c89b559c2bd01121f824834f24746"}, "mime": {:hex, :mime, "1.5.0", "203ef35ef3389aae6d361918bf3f952fa17a09e8e43b5aa592b93eba05d0fb8d", [:mix], [], "hexpm", "55a94c0f552249fc1a3dd9cd2d3ab9de9d3c89b559c2bd01121f824834f24746"},
"mimerl": {:hex, :mimerl, "1.2.0", "67e2d3f571088d5cfd3e550c383094b47159f3eee8ffa08e64106cdf5e981be3", [:rebar3], [], "hexpm", "f278585650aa581986264638ebf698f8bb19df297f66ad91b18910dfc6e19323"}, "mimerl": {:hex, :mimerl, "1.2.0", "67e2d3f571088d5cfd3e550c383094b47159f3eee8ffa08e64106cdf5e981be3", [:rebar3], [], "hexpm", "f278585650aa581986264638ebf698f8bb19df297f66ad91b18910dfc6e19323"},
"mochiweb": {:hex, :mochiweb, "2.20.1", "e4dbd0ed716f076366ecf62ada5755a844e1d95c781e8c77df1d4114be868cdf", [], [], "hexpm", "d1aeee7870470d2fa9eae0b3d5ab6c33801aa2d82b10e9dade885c5c921b36aa"},
"nimble_parsec": {:hex, :nimble_parsec, "1.1.0", "3a6fca1550363552e54c216debb6a9e95bd8d32348938e13de5eda962c0d7f89", [:mix], [], "hexpm", "08eb32d66b706e913ff748f11694b17981c0b04a33ef470e33e11b3d3ac8f54b"}, "nimble_parsec": {:hex, :nimble_parsec, "1.1.0", "3a6fca1550363552e54c216debb6a9e95bd8d32348938e13de5eda962c0d7f89", [:mix], [], "hexpm", "08eb32d66b706e913ff748f11694b17981c0b04a33ef470e33e11b3d3ac8f54b"},
"parse_trans": {:hex, :parse_trans, "3.3.1", "16328ab840cc09919bd10dab29e431da3af9e9e7e7e6f0089dd5a2d2820011d8", [:rebar3], [], "hexpm", "07cd9577885f56362d414e8c4c4e6bdf10d43a8767abb92d24cbe8b24c54888b"}, "parse_trans": {:hex, :parse_trans, "3.3.1", "16328ab840cc09919bd10dab29e431da3af9e9e7e7e6f0089dd5a2d2820011d8", [:rebar3], [], "hexpm", "07cd9577885f56362d414e8c4c4e6bdf10d43a8767abb92d24cbe8b24c54888b"},
"phoenix": {:hex, :phoenix, "1.5.8", "71cfa7a9bb9a37af4df98939790642f210e35f696b935ca6d9d9c55a884621a4", [:mix], [{:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}, {:phoenix_html, "~> 2.13", [hex: :phoenix_html, repo: "hexpm", optional: true]}, {:phoenix_pubsub, "~> 2.0", [hex: :phoenix_pubsub, repo: "hexpm", optional: false]}, {:plug, "~> 1.10", [hex: :plug, repo: "hexpm", optional: false]}, {:plug_cowboy, "~> 1.0 or ~> 2.2", [hex: :plug_cowboy, repo: "hexpm", optional: true]}, {:plug_crypto, "~> 1.1.2 or ~> 1.2", [hex: :plug_crypto, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "35ded0a32f4836168c7ab6c33b88822eccd201bcd9492125a9bea4c54332d955"}, "phoenix": {:hex, :phoenix, "1.5.8", "71cfa7a9bb9a37af4df98939790642f210e35f696b935ca6d9d9c55a884621a4", [:mix], [{:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}, {:phoenix_html, "~> 2.13", [hex: :phoenix_html, repo: "hexpm", optional: true]}, {:phoenix_pubsub, "~> 2.0", [hex: :phoenix_pubsub, repo: "hexpm", optional: false]}, {:plug, "~> 1.10", [hex: :plug, repo: "hexpm", optional: false]}, {:plug_cowboy, "~> 1.0 or ~> 2.2", [hex: :plug_cowboy, repo: "hexpm", optional: true]}, {:plug_crypto, "~> 1.1.2 or ~> 1.2", [hex: :plug_crypto, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "35ded0a32f4836168c7ab6c33b88822eccd201bcd9492125a9bea4c54332d955"},