Post parsing now builds a 'corpus' for each post, useful for a future search feature and possibly for SEO keywords.
This commit is contained in:
parent
b1b9c09a79
commit
df2cc13167
4 changed files with 88 additions and 66 deletions
2
.iex.exs
Normal file
2
.iex.exs
Normal file
|
@ -0,0 +1,2 @@
|
|||
alias Home73k.Blog
|
||||
alias Home73k.Blog.Post
|
|
@ -17,13 +17,12 @@ defmodule Home73k.Blog do
|
|||
Post.parse!(post_path)
|
||||
end
|
||||
|
||||
@posts posts
|
||||
# @posts Enum.sort_by(posts, & &1.date, {:desc, NaiveDateTime})
|
||||
@posts Enum.sort_by(posts, & &1.date, {:desc, NaiveDateTime})
|
||||
|
||||
# @tags posts |> Stream.flat_map(& &1.tags) |> Stream.uniq() |> Enum.sort()
|
||||
@tags posts |> Stream.flat_map(& &1.tags) |> Stream.uniq() |> Enum.sort()
|
||||
|
||||
def list_posts, do: @posts
|
||||
# def list_tags, do: @tags
|
||||
def list_tags, do: @tags
|
||||
|
||||
# defmodule NotFoundError do
|
||||
# defexception [:message, plug_status: 404]
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
defmodule Home73k.Blog.Post do
|
||||
@enforce_keys [:title, :slug, :date, :author, :tags, :lede, :body]
|
||||
defstruct [:title, :slug, :date, :author, :tags, :lede, :body]
|
||||
@enforce_keys [:title, :slug, :date, :author, :tags, :lede, :body, :corpus]
|
||||
defstruct [:title, :slug, :date, :author, :tags, :lede, :body, :corpus]
|
||||
|
||||
@title_slug_regex ~r/[^a-zA-Z0-9 ]/
|
||||
@strip_words ~w(the and are for not but had has was all any too one you his her can that with have this will your from they want been much some very them into which then now get its youll youre)
|
||||
|
||||
@doc """
|
||||
The public parse!/1 function begins the post parse process by reading
|
||||
|
@ -15,13 +15,19 @@ defmodule Home73k.Blog.Post do
|
|||
|> split_raw_file_data()
|
||||
|> parse_frontmatter()
|
||||
|> parse_lede()
|
||||
|> parse_body()
|
||||
|> build_corpus()
|
||||
|> build_post()
|
||||
end
|
||||
|
||||
# """ split_raw_file_data/1
|
||||
# If we receive {:ok, file_data}, we split frontmatter from markdown
|
||||
# content and return [raw_frontmatter, markdown]. Otherwise return nil.
|
||||
# """
|
||||
defp split_raw_file_data({:ok, file_data}), do: String.split(file_data, ~r/\n-{3,}\n/, parts: 2)
|
||||
# Splits file data on the literal "---" delimiter. Empty pieces are trimmed
# away and at most two pieces are returned, so data that opens with "---"
# yields [raw_frontmatter, markdown].
defp split_raw_file_data({:ok, contents}),
  do: String.split(contents, "---", parts: 2, trim: true)

# Anything other than an {:ok, data} tuple yields nil.
defp split_raw_file_data(_other), do: nil
|
||||
|
||||
# """ parse_frontmatter/1
|
||||
|
@ -30,7 +36,7 @@ defmodule Home73k.Blog.Post do
|
|||
# """
|
||||
defp parse_frontmatter([fm, md]) do
|
||||
case parse_frontmatter_string(fm) do
|
||||
{%{} = parsed_fm, _} -> {parsed_fm, md}
|
||||
{%{} = parsed_fm, _} -> {set_post_slug(parsed_fm), String.trim(md)}
|
||||
{:error, _} -> nil
|
||||
end
|
||||
end
|
||||
|
@ -47,11 +53,50 @@ defmodule Home73k.Blog.Post do
|
|||
|
||||
defp parse_lede(_), do: nil
|
||||
|
||||
# TODO:
|
||||
# |> parse_body()
|
||||
# - convert to markdown
|
||||
# - extract any code parts to mark with pygments?
|
||||
# - figure that whole thing out
|
||||
# """ parse_body/1
|
||||
# Convert body markdown to html
|
||||
# TODO: handle syntax highlighting
|
||||
# Renders the markdown half of the {frontmatter, markdown} tuple to HTML via
# Earmark and stores the result in the frontmatter map under :body.
defp parse_body({frontmatter, markdown}) do
  html = Earmark.as_html!(markdown)
  Map.put(frontmatter, :body, html)
end

# Anything that is not a two-element tuple yields nil.
defp parse_body(_other), do: nil
|
||||
|
||||
# """ build_corpus/1
|
||||
# Create a searchable word list for the post, for live searching
|
||||
defp build_corpus(%{title: title, lede: lede, body: body, tags: tags} = post_data) do
  # Seed the corpus from tags, title, lede and body. The lede may be nil
  # (no lede found), in which case a blank is substituted before joining.
  raw =
    (tags ++ [title, lede || " ", body])
    |> Enum.join(" ")
    |> String.downcase()

  # Replace with spaces: rendered code blocks, html tags, html entities,
  # newlines, and forward/back slashes.
  # The entity alternative only accepts well-formed entities (&name;, &#123;,
  # &#x1f;). A looser "& ... ;" match would let a literal "&" in a title or
  # tag swallow every word up to the next ";" on the same line.
  html_scrub_regex = ~r/(<pre><code(.|\n)*?<\/code><\/pre>)|(<(.|\n)+?>)|(&#?[a-z0-9]+;)|\n|\/|\\/
  scrubbed = Regex.replace(html_scrub_regex, raw, " ")

  # Keep only lowercase letters, digits and spaces, then split into words,
  # drop rejected words (see reject_word?/1), de-duplicate, and re-join into
  # a single space-delimited string.
  corpus =
    ~r/[^a-z0-9 ]/
    |> Regex.replace(scrubbed, "")
    |> String.split(" ", trim: true)
    |> Stream.reject(&reject_word?/1)
    |> Stream.uniq()
    |> Enum.join(" ")

  # Return the post data with the corpus added under :corpus.
  Map.put(post_data, :corpus, corpus)
end

# Input lacking any of :title, :lede, :body or :tags yields nil.
defp build_corpus(_), do: nil
|
||||
|
||||
# """ build_post/1
|
||||
# Create post struct from post data map
|
||||
# Builds this module's struct from the accumulated post data map. struct!/2
# raises when an enforced key is missing or an unknown key is supplied.
defp build_post(post_data) when is_map(post_data), do: struct!(__MODULE__, post_data)

# Non-map input yields nil.
defp build_post(_other), do: nil
|
||||
|
||||
######################################################################
|
||||
# HELPERS
|
||||
|
@ -74,62 +119,36 @@ defmodule Home73k.Blog.Post do
|
|||
# Handle split of post body. If lede found, return as html with body.
|
||||
# Otherwise return nil with body.
|
||||
# """
|
||||
defp extract_lede([lede, body]), do: {Earmark.as_html!(lede), body}
|
||||
# Two parts: render the lede (minus trailing whitespace) to HTML and strip
# leading whitespace from the remaining body markdown.
defp extract_lede([lede, body]) do
  lede_html = lede |> String.trim_trailing() |> Earmark.as_html!()
  {lede_html, String.trim_leading(body)}
end

# One part: there is no lede, so return nil alongside the untouched body.
defp extract_lede([body]), do: {nil, body}
|
||||
|
||||
# ##################################################
|
||||
# ##################################################
|
||||
# ##################################################
|
||||
# ##################################################
|
||||
# ##################################################
|
||||
# defp parse_split_file_data(["", fm, md]) do
|
||||
# Code.eval_string(fm)
|
||||
# |> parse_lede(md)
|
||||
# end
|
||||
# """ set_post_slug/1
|
||||
# If no slug in frontmatter, convert title to slug and add to map
|
||||
# """
|
||||
# Frontmatter that already has a :slug key is returned unchanged.
defp set_post_slug(%{slug: _} = fm), do: fm
|
||||
|
||||
# defp parse_split_file_data(_), do: nil
|
||||
# Derives a slug from the title and stores it in the frontmatter under :slug.
defp set_post_slug(%{title: title} = fm),
  do: Map.put(fm, :slug, parse_title_to_slug(title))
|
||||
|
||||
# defp parse_lede({%{summary: summ} = fm, _}, md) do
|
||||
# Earmark.as_html(md)
|
||||
# |> parse_post(Earmark.as_html(summ), fm)
|
||||
# end
|
||||
# """ parse_title_to_slug
|
||||
# Takes a post title and returns a slug cleansed for URI request path
|
||||
# """
|
||||
defp parse_title_to_slug(title) do
|
||||
title = String.downcase(title)
|
||||
|
||||
# defp parse_lede({%{} = fm, _}, md) do
|
||||
# String.split(md, "<!--more-->", parts: 2)
|
||||
# |> parse_lede(fm)
|
||||
# end
|
||||
Regex.replace(~r/[^a-z0-9 ]/, title, "")
|
||||
|> String.split(" ", trim: true)
|
||||
|> Stream.reject(&reject_word?/1)
|
||||
|> Enum.join("-")
|
||||
end
|
||||
|
||||
# defp parse_lede([summ, _] = parts, fm) do
|
||||
# parts
|
||||
# |> Enum.join(" ")
|
||||
# |> Earmark.as_html()
|
||||
# |> parse_post(Earmark.as_html(summ), fm)
|
||||
# end
|
||||
|
||||
# defp parse_lede(md, fm) do
|
||||
# Earmark.as_html(md)
|
||||
# |> parse_post({:ok, nil, []}, fm)
|
||||
# end
|
||||
|
||||
# defp parse_title_to_slug(title) do
|
||||
# Regex.replace(@title_slug_regex, title, "")
|
||||
# |> String.replace(" ", "-")
|
||||
# |> String.downcase()
|
||||
# end
|
||||
|
||||
# defp build_post(main_html, summ_html, fm) do
|
||||
# fm
|
||||
# |> Map.put_new(:slug, parse_title_to_slug(fm.title))
|
||||
# |> Map.put_new(:author, "Author Name")
|
||||
# |> Map.put_new(:tags, [])
|
||||
# |> Map.put(:summary, summ_html)
|
||||
# |> Map.put(:body, main_html)
|
||||
# end
|
||||
|
||||
# defp parse_post({:ok, main_html, _}, {:ok, summ_html, _}, fm) do
|
||||
# post = build_post(main_html, summ_html, fm)
|
||||
# struct!(__MODULE__, post)
|
||||
# end
|
||||
|
||||
# defp parse_post(_, _, _), do: nil
|
||||
# """ reject_word?
|
||||
# Determines if a word should be rejected, based on char length < 3,
|
||||
# or if word is in @strip_words
|
||||
# Used by parse_title_to_slug and build_corpus
|
||||
# """
|
||||
defp reject_word?(word) do
  cond do
    # Words shorter than three characters are always rejected.
    String.length(word) < 3 -> true
    # Otherwise reject only the words listed in @strip_words.
    true -> word in @strip_words
  end
end
|
||||
end
|
||||
|
|
2
mix.lock
2
mix.lock
|
@ -13,6 +13,7 @@
|
|||
"git_cli": {:hex, :git_cli, "0.3.0", "a5422f9b95c99483385b976f5d43f7e8233283a47cda13533d7c16131cb14df5", [:mix], [], "hexpm", "78cb952f4c86a41f4d3511f1d3ecb28edb268e3a7df278de2faa1bd4672eaf9b"},
|
||||
"hackney": {:hex, :hackney, "1.17.4", "99da4674592504d3fb0cfef0db84c3ba02b4508bae2dff8c0108baa0d6e0977c", [:rebar3], [{:certifi, "~>2.6.1", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "~>6.1.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "~>1.0.0", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~>1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:parse_trans, "3.3.1", [hex: :parse_trans, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "~>1.1.0", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}, {:unicode_util_compat, "~>0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "de16ff4996556c8548d512f4dbe22dd58a587bf3332e7fd362430a7ef3986b16"},
|
||||
"html_entities": {:hex, :html_entities, "0.5.2", "9e47e70598da7de2a9ff6af8758399251db6dbb7eebe2b013f2bbd2515895c3c", [:mix], [], "hexpm", "c53ba390403485615623b9531e97696f076ed415e8d8058b1dbaa28181f4fdcc"},
|
||||
"html_sanitize_ex": {:hex, :html_sanitize_ex, "1.4.1", "e8a67da405fe9f0d1be121a40a60f70811192033a5b8d00a95dddd807f5e053e", [:mix], [{:mochiweb, "~> 2.15", [hex: :mochiweb, repo: "hexpm", optional: false]}], "hexpm", "68d92656f47cd73598c45ad2394561f025c8c65d146001b955fd7b517858962a"},
|
||||
"idna": {:hex, :idna, "6.1.1", "8a63070e9f7d0c62eb9d9fcb360a7de382448200fbbd1b106cc96d3d8099df8d", [:rebar3], [{:unicode_util_compat, "~>0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "92376eb7894412ed19ac475e4a86f7b413c1b9fbb5bd16dccd57934157944cea"},
|
||||
"jason": {:hex, :jason, "1.2.2", "ba43e3f2709fd1aa1dce90aaabfd039d000469c05c56f0b8e31978e03fa39052", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "18a228f5f0058ee183f29f9eae0805c6e59d61c3b006760668d8d18ff0d12179"},
|
||||
"makeup": {:hex, :makeup, "1.0.5", "d5a830bc42c9800ce07dd97fa94669dfb93d3bf5fcf6ea7a0c67b2e0e4a7f26c", [:mix], [{:nimble_parsec, "~> 0.5 or ~> 1.0", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "cfa158c02d3f5c0c665d0af11512fed3fba0144cf1aadee0f2ce17747fba2ca9"},
|
||||
|
@ -20,6 +21,7 @@
|
|||
"metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm", "69b09adddc4f74a40716ae54d140f93beb0fb8978d8636eaded0c31b6f099f16"},
|
||||
"mime": {:hex, :mime, "1.5.0", "203ef35ef3389aae6d361918bf3f952fa17a09e8e43b5aa592b93eba05d0fb8d", [:mix], [], "hexpm", "55a94c0f552249fc1a3dd9cd2d3ab9de9d3c89b559c2bd01121f824834f24746"},
|
||||
"mimerl": {:hex, :mimerl, "1.2.0", "67e2d3f571088d5cfd3e550c383094b47159f3eee8ffa08e64106cdf5e981be3", [:rebar3], [], "hexpm", "f278585650aa581986264638ebf698f8bb19df297f66ad91b18910dfc6e19323"},
|
||||
"mochiweb": {:hex, :mochiweb, "2.20.1", "e4dbd0ed716f076366ecf62ada5755a844e1d95c781e8c77df1d4114be868cdf", [], [], "hexpm", "d1aeee7870470d2fa9eae0b3d5ab6c33801aa2d82b10e9dade885c5c921b36aa"},
|
||||
"nimble_parsec": {:hex, :nimble_parsec, "1.1.0", "3a6fca1550363552e54c216debb6a9e95bd8d32348938e13de5eda962c0d7f89", [:mix], [], "hexpm", "08eb32d66b706e913ff748f11694b17981c0b04a33ef470e33e11b3d3ac8f54b"},
|
||||
"parse_trans": {:hex, :parse_trans, "3.3.1", "16328ab840cc09919bd10dab29e431da3af9e9e7e7e6f0089dd5a2d2820011d8", [:rebar3], [], "hexpm", "07cd9577885f56362d414e8c4c4e6bdf10d43a8767abb92d24cbe8b24c54888b"},
|
||||
"phoenix": {:hex, :phoenix, "1.5.8", "71cfa7a9bb9a37af4df98939790642f210e35f696b935ca6d9d9c55a884621a4", [:mix], [{:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}, {:phoenix_html, "~> 2.13", [hex: :phoenix_html, repo: "hexpm", optional: true]}, {:phoenix_pubsub, "~> 2.0", [hex: :phoenix_pubsub, repo: "hexpm", optional: false]}, {:plug, "~> 1.10", [hex: :plug, repo: "hexpm", optional: false]}, {:plug_cowboy, "~> 1.0 or ~> 2.2", [hex: :plug_cowboy, repo: "hexpm", optional: true]}, {:plug_crypto, "~> 1.1.2 or ~> 1.2", [hex: :plug_crypto, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "35ded0a32f4836168c7ab6c33b88822eccd201bcd9492125a9bea4c54332d955"},
|
||||
|
|
Loading…
Reference in a new issue