Trust In-Reply-To when it looks sane

classic Classic list List threaded Threaded
6 messages Options
David Bremner-2 David Bremner-2
Reply | Threaded
Open this post in threaded view
|

Trust In-Reply-To when it looks sane

In cf8aaafbad68, Aaron Ecay changed the way indexing chose a "replyto"
from In-Reply-To to References. This mostly works well, except for some
corner cases that matter to notmuch users (e.g. the Debian BTS). In
this series I try to keep the best of both options, by choosing
In-Reply-To when it looks sane.


_______________________________________________
notmuch mailing list
[hidden email]
https://notmuchmail.org/mailman/listinfo/notmuch
David Bremner-2 David Bremner-2
Reply | Threaded
Open this post in threaded view
|

[PATCH 1/5] test: add known broken test for good In-Reply-To / bad References

The current scheme of choosing the replyto (i.e. the default parent
for threading purposes) does not work well for mailers that put
the oldest Reference last.
---
 test/T510-thread-replies.sh                   | 27 +++++++++++++++++++
 .../threading/parent-priority/cur/child       | 11 ++++++++
 .../threading/parent-priority/cur/grand-child | 10 +++++++
 .../threading/parent-priority/cur/root        |  7 +++++
 4 files changed, 55 insertions(+)
 create mode 100644 test/corpora/threading/parent-priority/cur/child
 create mode 100644 test/corpora/threading/parent-priority/cur/grand-child
 create mode 100644 test/corpora/threading/parent-priority/cur/root

diff --git a/test/T510-thread-replies.sh b/test/T510-thread-replies.sh
index 6837ff17..a19669a9 100755
--- a/test/T510-thread-replies.sh
+++ b/test/T510-thread-replies.sh
@@ -165,4 +165,31 @@ expected=`echo "$expected" | notmuch_json_show_sanitize`
 test_expect_equal_json "$output" "$expected"
 
 
+add_email_corpus threading
+
+test_begin_subtest "trusting reply-to"
+test_subtest_known_broken
+notmuch show --entire-thread=true id:[hidden email] | grep ^Subject:  > OUTPUT
+cat <<EOF > EXPECTED
+Subject: root message
+Subject: child message
+Subject: grand-child message
+EOF
+test_expect_equal_file EXPECTED OUTPUT
+
+test_begin_subtest "trusting reply-to (tree view)"
+test_subtest_known_broken
+test_emacs '(notmuch-tree "id:[hidden email]")
+    (notmuch-test-wait)
+    (test-output)
+    (delete-other-windows)'
+cat <<EOF > EXPECTED
+  2016-06-17  Alice                 ┬►root message                                        (inbox unread)
+  2016-06-18  Alice                 ╰┬►child message                                      (inbox unread)
+  2016-06-18  Alice                  ╰─►grand-child message                               (inbox unread)
+End of search results.
+EOF
+test_expect_equal_file EXPECTED OUTPUT
+
+
 test_done
diff --git a/test/corpora/threading/parent-priority/cur/child b/test/corpora/threading/parent-priority/cur/child
new file mode 100644
index 00000000..23ee6495
--- /dev/null
+++ b/test/corpora/threading/parent-priority/cur/child
@@ -0,0 +1,11 @@
+From: Alice <[hidden email]>
+To: Daniel <[hidden email]>
+Subject: child message
+Message-ID: <[hidden email]>
+In-Reply-To: <[hidden email]>
+References:  <[hidden email]>
+Date: Fri, 17 Jun 2016 22:14:41 -0400
+
+This is a normal-ish reply, and has both a references header and an
+in-reply-to header.
+
diff --git a/test/corpora/threading/parent-priority/cur/grand-child b/test/corpora/threading/parent-priority/cur/grand-child
new file mode 100644
index 00000000..028371d4
--- /dev/null
+++ b/test/corpora/threading/parent-priority/cur/grand-child
@@ -0,0 +1,10 @@
+From: Alice <[hidden email]>
+To: Daniel <[hidden email]>
+Subject: grand-child message
+Message-ID: <[hidden email]>
+In-Reply-To: <[hidden email]>
+References:  <[hidden email]> <[hidden email]>
+Date: Fri, 17 Jun 2016 22:24:41 -0400
+
+This has the references headers in the wrong order, with oldest last.
+Debbugs does this.
diff --git a/test/corpora/threading/parent-priority/cur/root b/test/corpora/threading/parent-priority/cur/root
new file mode 100644
index 00000000..3990843d
--- /dev/null
+++ b/test/corpora/threading/parent-priority/cur/root
@@ -0,0 +1,7 @@
+From: Alice <[hidden email]>
+To: Daniel <[hidden email]>
+Subject: root message
+Message-ID: <[hidden email]>
+Date: Thu, 16 Jun 2016 22:14:41 -0400
+
+This message has no reply-to
--
2.18.0

_______________________________________________
notmuch mailing list
[hidden email]
https://notmuchmail.org/mailman/listinfo/notmuch
David Bremner-2 David Bremner-2
Reply | Threaded
Open this post in threaded view
|

[PATCH 2/5] test/thread-replies: mangle In-Reply-To's

In reply to this post by David Bremner-2
In a future commit, we will start trusting In-Reply-To's when they
look sane (i.e. a single message-id). Modify these tests so they will
keep passing (i.e. keep choosing References) when that happens.
---
 test/T510-thread-replies.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/T510-thread-replies.sh b/test/T510-thread-replies.sh
index a19669a9..3e7b2b63 100755
--- a/test/T510-thread-replies.sh
+++ b/test/T510-thread-replies.sh
@@ -45,10 +45,10 @@ expected='[[[{"id": "[hidden email]",
 expected=`echo "$expected" | notmuch_json_show_sanitize`
 test_expect_equal_json "$output" "$expected"
 
-test_begin_subtest "Prefer References to In-Reply-To"
+test_begin_subtest "Prefer References to dodgy In-Reply-To"
 add_message '[id]="[hidden email]"' \
     '[subject]=two'
-add_message '[in-reply-to]="<[hidden email]>"' \
+add_message '[in-reply-to]="Your message of December 31 1999 <[hidden email]>"' \
     '[references]="<[hidden email]>"' \
     '[subject]="Re: two"'
 output=$(notmuch show --format=json 'subject:two' | notmuch_json_show_sanitize)
@@ -101,12 +101,12 @@ expected='[[[{"id": "[hidden email]", "match": true, "excluded": false,
 expected=`echo "$expected" | notmuch_json_show_sanitize`
 test_expect_equal_json "$output" "$expected"
 
-test_begin_subtest "Use last Reference"
+test_begin_subtest "Use last Reference when In-Reply-To is dodgy"
 add_message '[id]="[hidden email]"' \
     '[subject]="four"'
 add_message '[id]="[hidden email]"' \
     '[subject]="not-four"'
-add_message '[in-reply-to]="<[hidden email]>"' \
+add_message '[in-reply-to]="<[hidden email]> (RFC822 4lyfe)"' \
     '[references]="<[hidden email]> <[hidden email]>"' \
     '[subject]="neither"'
 output=$(notmuch show --format=json 'subject:four' | notmuch_json_show_sanitize)
--
2.18.0

_______________________________________________
notmuch mailing list
[hidden email]
https://notmuchmail.org/mailman/listinfo/notmuch
David Bremner-2 David Bremner-2
Reply | Threaded
Open this post in threaded view
|

[PATCH 3/5] util/string-util: export skip_space

In reply to this post by David Bremner-2
It's only a few lines, but we already define the function, so make it
usable elsewhere.
---
 util/string-util.c | 2 +-
 util/string-util.h | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/util/string-util.c b/util/string-util.c
index b0108811..fc2058e0 100644
--- a/util/string-util.c
+++ b/util/string-util.c
@@ -141,7 +141,7 @@ make_boolean_term (void *ctx, const char *prefix, const char *term,
     return 0;
 }
 
-static const char*
+const char*
 skip_space (const char *str)
 {
     while (*str && isspace ((unsigned char) *str))
diff --git a/util/string-util.h b/util/string-util.h
index 97770614..4c110a20 100644
--- a/util/string-util.h
+++ b/util/string-util.h
@@ -77,6 +77,8 @@ unsigned int strcase_hash (const void *ptr);
 
 void strip_trailing (char *str, char ch);
 
+const char* skip_space (const char *str);
+
 #ifdef __cplusplus
 }
 #endif
--
2.18.0

_______________________________________________
notmuch mailing list
[hidden email]
https://notmuchmail.org/mailman/listinfo/notmuch
David Bremner-2 David Bremner-2
Reply | Threaded
Open this post in threaded view
|

[PATCH 4/5] lib: add _notmuch_message_id_parse_strict

In reply to this post by David Bremner-2
The idea is that if a message-id parses with this function, the MUA
generating it was probably sane, and in particular it's probably safe
to use the result as a parent from In-Reply-To.
---
 lib/message-id.c        | 32 ++++++++++++++++++
 lib/notmuch-private.h   | 14 ++++++++
 test/Makefile.local     |  6 +++-
 test/T710-message-id.sh | 73 +++++++++++++++++++++++++++++++++++++++++
 test/message-id-parse.c | 24 ++++++++++++++
 5 files changed, 148 insertions(+), 1 deletion(-)
 create mode 100755 test/T710-message-id.sh
 create mode 100644 test/message-id-parse.c

diff --git a/lib/message-id.c b/lib/message-id.c
index d7541d50..a1dce9c8 100644
--- a/lib/message-id.c
+++ b/lib/message-id.c
@@ -1,4 +1,5 @@
 #include "notmuch-private.h"
+#include "string-util.h"
 
 /* Advance 'str' past any whitespace or RFC 822 comments. A comment is
  * a (potentially nested) parenthesized sequence with '\' used to
@@ -94,3 +95,34 @@ _notmuch_message_id_parse (void *ctx, const char *message_id, const char **next)
 
     return result;
 }
+
+/* Strictly parse 'message_id' as one "<id>" token; returns the id without brackets (talloc'ed on 'ctx') or NULL. */
+char *
+_notmuch_message_id_parse_strict (void *ctx, const char *message_id)
+{
+    const char *s, *end;
+
+    if (message_id == NULL || *message_id == '\0')
+        return NULL;
+
+    /* Leading whitespace is tolerated, but '<' must come next. */
+    s = skip_space (message_id);
+    if (*s != '<')
+        return NULL;
+    s++;
+
+    /* Reject embedded whitespace. The cast avoids undefined behavior
+     * when isspace is handed a negative char value. */
+    for (end = s; *end && *end != '>'; end++)
+        if (isspace ((unsigned char) *end))
+            return NULL;
+
+    if (*end != '>')
+        return NULL;
+
+    /* Only whitespace may follow the closing '>'. */
+    if (*skip_space (end + 1) != '\0')
+        return NULL;
+
+    return talloc_strndup (ctx, s, end - s);
+}
diff --git a/lib/notmuch-private.h b/lib/notmuch-private.h
index 3764a6a9..109c067f 100644
--- a/lib/notmuch-private.h
+++ b/lib/notmuch-private.h
@@ -526,6 +526,20 @@ _notmuch_query_count_documents (notmuch_query_t *query,
 char *
 _notmuch_message_id_parse (void *ctx, const char *message_id, const char **next);
 
+/* Parse a message-id, discarding leading and trailing whitespace, and
+ * '<' and '>' delimiters.
+ *
+ * Apply a probably-stricter-than RFC definition of what is allowed in
+ * a message-id. In particular, forbid whitespace.
+ *
+ * Returns a newly talloc'ed string belonging to 'ctx'.
+ *
+ * Returns NULL if there is any error parsing the message-id.
+ */
+
+char *
+_notmuch_message_id_parse_strict (void *ctx, const char *message_id);
+
 
 /* message.cc */
 
diff --git a/test/Makefile.local b/test/Makefile.local
index 1a0ab813..1cf09778 100644
--- a/test/Makefile.local
+++ b/test/Makefile.local
@@ -15,6 +15,9 @@ smtp_dummy_modules = $(smtp_dummy_srcs:.c=.o)
 $(dir)/arg-test: $(dir)/arg-test.o command-line-arguments.o util/libnotmuch_util.a
  $(call quiet,CC) $^ -o $@ $(LDFLAGS)
 
+$(dir)/message-id-parse: $(dir)/message-id-parse.o lib/libnotmuch.a util/libnotmuch_util.a
+ $(call quiet,CC) $^ -o $@ $(LDFLAGS) $(TALLOC_LDFLAGS)
+
 $(dir)/hex-xcode: $(dir)/hex-xcode.o command-line-arguments.o util/libnotmuch_util.a
  $(call quiet,CC) $^ -o $@ $(LDFLAGS) $(TALLOC_LDFLAGS)
 
@@ -50,7 +53,8 @@ test_main_srcs=$(dir)/arg-test.c \
       $(dir)/smtp-dummy.c \
       $(dir)/symbol-test.cc \
       $(dir)/make-db-version.cc \
-      $(dir)/ghost-report.cc
+      $(dir)/ghost-report.cc \
+      $(dir)/message-id-parse.c
 
 test_srcs=$(test_main_srcs) $(dir)/database-test.c
 
diff --git a/test/T710-message-id.sh b/test/T710-message-id.sh
new file mode 100755
index 00000000..163c1300
--- /dev/null
+++ b/test/T710-message-id.sh
@@ -0,0 +1,73 @@
+#!/usr/bin/env bash
+test_description="message id parsing"
+
+. $(dirname "$0")/test-lib.sh || exit 1
+
+test_begin_subtest "good message ids"
+${TEST_DIRECTORY}/message-id-parse <<EOF >OUTPUT
+<[hidden email]>
+<[hidden email]>
+<[hidden email]>
+EOF
+cat <<EOF >EXPECTED
+GOOD: [hidden email]
+GOOD: [hidden email]
+GOOD: [hidden email]
+EOF
+test_expect_equal_file EXPECTED OUTPUT
+
+test_begin_subtest "leading and trailing space is OK"
+${TEST_DIRECTORY}/message-id-parse <<EOF >OUTPUT
+   <[hidden email]>
+<[hidden email]>    
+    <[hidden email]>
+EOF
+cat <<EOF >EXPECTED
+GOOD: [hidden email]
+GOOD: [hidden email]
+GOOD: [hidden email]
+EOF
+test_expect_equal_file EXPECTED OUTPUT
+
+test_begin_subtest "<> delimeters are required"
+${TEST_DIRECTORY}/message-id-parse <<EOF >OUTPUT
+[hidden email]>
+<[hidden email]
+[hidden email]
+EOF
+cat <<EOF >EXPECTED
+BAD: [hidden email]>
+BAD: <[hidden email]
+BAD: [hidden email]
+EOF
+test_expect_equal_file EXPECTED OUTPUT
+
+test_begin_subtest "embedded whitespace is forbidden"
+${TEST_DIRECTORY}/message-id-parse <<EOF >OUTPUT
+<018b1a8f2d1df62e804ce88b65401304832dfbbf.1346614915 .[hidden email]>
+<1530507300.raoomurnbf.astroid @strange.none>
+<1258787708-21121-
[hidden email]>
+EOF
+cat <<EOF >EXPECTED
+BAD: <018b1a8f2d1df62e804ce88b65401304832dfbbf.1346614915 .[hidden email]>
+BAD: <1530507300.raoomurnbf.astroid @strange.none>
+BAD: <1258787708-21121-
[hidden email]>
+EOF
+test_expect_equal_file EXPECTED OUTPUT
+
+
+test_begin_subtest "folded real life bad In-Reply-To values"
+${TEST_DIRECTORY}/message-id-parse <<EOF >OUTPUT
+<[hidden email]> (Ian Jackson's message of "Mon, 5 Dec 2016 14:41:01 +0000")
+<[hidden email]>  <[hidden email]>
+Your message of Tue, 09 Dec 2014 13:21:11 +0100. <1900758.CgLNVPbY9N@liber>
+EOF
+cat <<EOF >EXPECTED
+BAD: <[hidden email]> (Ian Jackson's message of "Mon, 5 Dec 2016 14:41:01 +0000")
+BAD: <[hidden email]>  <[hidden email]>
+BAD: Your message of Tue, 09 Dec 2014 13:21:11 +0100. <1900758.CgLNVPbY9N@liber>
+EOF
+test_expect_equal_file EXPECTED OUTPUT
+
+
+test_done
diff --git a/test/message-id-parse.c b/test/message-id-parse.c
new file mode 100644
index 00000000..25decc9b
--- /dev/null
+++ b/test/message-id-parse.c
@@ -0,0 +1,24 @@
+#include <stdio.h>
+#include <talloc.h>
+#include "notmuch-private.h"
+
+/* Read candidate ids from stdin, one per line; print "GOOD: id" or "BAD: line" for each. */
+int main(unused (int argc), unused (char **argv)){
+    char *line = NULL;
+    size_t len = 0;
+    ssize_t nread;
+    void *local = talloc_new (NULL);
+
+    while ((nread = getline(&line, &len, stdin)) != -1) {
+        /* Index via nread: strlen(line) - 1 is -1 if the line starts with NUL. */
+        if (nread > 0 && line[nread - 1] == '\n')
+            line[nread - 1] = '\0';
+
+        char *mid = _notmuch_message_id_parse_strict (local, line);
+        if (mid)
+            printf("GOOD: %s\n", mid);
+        else
+            printf("BAD: %s\n", line);
+    }
+    talloc_free (local);
+}
--
2.18.0

_______________________________________________
notmuch mailing list
[hidden email]
https://notmuchmail.org/mailman/listinfo/notmuch
David Bremner-2 David Bremner-2
Reply | Threaded
Open this post in threaded view
|

[PATCH 5/5] lib: change parent strategy to use In-Reply-To if it looks sane

In reply to this post by David Bremner-2
As reported by Sean Whitton, there are mailers (in particular the
Debian Bug Tracking System) that have sensible In-Reply-To headers,
but un-useful-for-notmuch References (in particular with the BTS, the
oldest reference is last). I looked at a sample of about 200K
messages, and only about 0.5% of these had something other than a single
message-id in In-Reply-To. On this basis, if we see a single
message-id in In-Reply-To, consider that as authoritative.
---
 lib/add-message.cc          | 20 +++++++++++++++-----
 test/T510-thread-replies.sh |  2 --
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/lib/add-message.cc b/lib/add-message.cc
index f5fac8be..da37032c 100644
--- a/lib/add-message.cc
+++ b/lib/add-message.cc
@@ -227,7 +227,7 @@ _notmuch_database_link_message_to_parents (notmuch_database_t *notmuch,
    const char **thread_id)
 {
     GHashTable *parents = NULL;
-    const char *refs, *in_reply_to, *in_reply_to_message_id;
+    const char *refs, *in_reply_to, *in_reply_to_message_id, *strict_message_id = NULL;
     const char *last_ref_message_id, *this_message_id;
     GList *l, *keys = NULL;
     notmuch_status_t ret = NOTMUCH_STATUS_SUCCESS;
@@ -242,14 +242,24 @@ _notmuch_database_link_message_to_parents (notmuch_database_t *notmuch,
     parents, refs);
 
     in_reply_to = _notmuch_message_file_get_header (message_file, "in-reply-to");
+    if (in_reply_to)
+ strict_message_id = _notmuch_message_id_parse_strict (message,
+      in_reply_to);
+
     in_reply_to_message_id = parse_references (message,
        this_message_id,
        parents, in_reply_to);
 
-    /* For the parent of this message, use the last message ID of the
-     * References header, if available.  If not, fall back to the
-     * first message ID in the In-Reply-To header. */
-    if (last_ref_message_id) {
+    /* For the parent of this message, use
+     * 1) the In-Reply-To header, if it looks sane, otherwise
+     * 2) the last message ID of the References header, if available.
+     * 3) Otherwise, fall back to the first message ID in
+     * the In-Reply-To header.
+     */
+
+    if (strict_message_id) {
+ _notmuch_message_add_term (message, "replyto", strict_message_id);
+    } else if (last_ref_message_id) {
  _notmuch_message_add_term (message, "replyto",
    last_ref_message_id);
     } else if (in_reply_to_message_id) {
diff --git a/test/T510-thread-replies.sh b/test/T510-thread-replies.sh
index 3e7b2b63..86ce98db 100755
--- a/test/T510-thread-replies.sh
+++ b/test/T510-thread-replies.sh
@@ -168,7 +168,6 @@ test_expect_equal_json "$output" "$expected"
 add_email_corpus threading
 
 test_begin_subtest "trusting reply-to"
-test_subtest_known_broken
 notmuch show --entire-thread=true id:[hidden email] | grep ^Subject:  > OUTPUT
 cat <<EOF > EXPECTED
 Subject: root message
@@ -178,7 +177,6 @@ EOF
 test_expect_equal_file EXPECTED OUTPUT
 
 test_begin_subtest "trusting reply-to (tree view)"
-test_subtest_known_broken
 test_emacs '(notmuch-tree "id:[hidden email]")
     (notmuch-test-wait)
     (test-output)
--
2.18.0

_______________________________________________
notmuch mailing list
[hidden email]
https://notmuchmail.org/mailman/listinfo/notmuch