patx/mrhttp-asgi

New avx2 parser

Commit e3feb09 · Mark Reed · 2024-03-13T21:05:22-07:00

Changeset
e3feb0923dcc5769352f0efd28794f77a12a386d
Parents
7988dba33bb12ffe61303bf09cade219e1d418f3

View source at this commit

Comments

No comments yet.

Log in to comment

Diff

diff --git a/.gitignore b/.gitignore
index d590fa9..e5f6ecb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,5 +8,8 @@
 *.swp
 settings.py
 build/*
+gbench/parse
+gbench/t
+gbench/string
 .DS_Store
 __pycache__
diff --git a/dotests.py b/dotests.py
index f5fb599..a7dea80 100644
--- a/dotests.py
+++ b/dotests.py
@@ -121,13 +121,11 @@ try:
      '-H','Accept-Language: en-US,en;q=0.5',
      '-H','Connection: keep-alive')
   opts = ('-H','Cookie: mrsession=43709dd361cc443e976b05714581a7fb; foo=fdsfdasdfasdfdsfasdfsdfsdfasdfas; short=fazc;')
-  print ("Hello          ", run_wrk(loop, 'http://localhost:8080/'),             "Requests/second" )
-  print ("Sessions       ", run_wrk(loop, 'http://localhost:8080/s',     options=opts), "Requests/second" )
-  if 0:
+  if 1:
     print ("Hello pipelined", run_wrk(loop, 'http://localhost:8080/',lua='tests/lua/pipeline.lua'), "Requests/second" )
     print ("More hdrs pipelined", run_wrk(loop, 'http://localhost:8080/',options=more_headers,lua='tests/lua/pipeline.lua'), "Requests/second" )
-    print ("Hello          ", run_wrk(loop, 'http://localhost:8080/'),             "Requests/second" )
-    print ("Hello hdrs     ", run_wrk(loop, 'http://localhost:8080/', options=more_headers), "Requests/second" )
+    #print ("Hello          ", run_wrk(loop, 'http://localhost:8080/'),             "Requests/second" )
+    #print ("Hello hdrs     ", run_wrk(loop, 'http://localhost:8080/', options=more_headers), "Requests/second" )
 
     #print ("Cookies        ", run_wrk(loop, 'http://localhost:8080/printCookies', options=opts), "Requests/second" )
     #print ("many args      ", run_wrk(loop, 'http://localhost:8080/sixargs/one/two/three/four/five/six'), "Requests/second" )
diff --git a/gbench/bld b/gbench/bld
new file mode 100755
index 0000000..8b94551
--- /dev/null
+++ b/gbench/bld
@@ -0,0 +1,6 @@
+
+g++ t.cpp -g -O0 -msse4.2 -mavx2 -std=c++11 -lbenchmark -lpthread -o t
+#g++ tst.cpp -O3 -msse4.2 -mavx2 -std=c++11 -lbenchmark -lpthread -o tst
+g++ parse.cpp -O3 -msse4.2 -mavx2 -std=c++11 -lbenchmark -lpthread -o parse
+#g++ string.cpp -O3 -msse4.2 -mavx2 -std=c++11 -lbenchmark -lpthread -o string
+
diff --git a/gbench/parse.cpp b/gbench/parse.cpp
index d95e618..7a446be 100644
--- a/gbench/parse.cpp
+++ b/gbench/parse.cpp
@@ -1,8 +1,8 @@
 
-
 #include <stdlib.h>
 #include <stdint.h>
 #include <string>
+#include <cstring>
 #include <x86intrin.h>
 #ifdef __AVX2__
 #include <immintrin.h>
@@ -589,160 +589,74 @@ static void find_ranges(const char* buf, const char* buf_end, unsigned long *ran
 
 static const char* parse_headers_avx2(const char* buf, const char* buf_end, int* ret)
 {
-  int num_headers = 0;
-  int max_headers = 20;
-  /* Bitmap for the first type of tokens */
-  unsigned long rr0[2] = {0};
-  /* Bitmap for the second type of tokens */
-  unsigned long rr1[2] = {0};
-  /* Pointer to the start of the currently parsed block of 128 bytes */
-  const char* prep_start = NULL;
-  int found;
-  int n_headers = num_headers;
-
-  for (; ; ++n_headers) {
-    CHECK_EOF();
-    if (*buf == '\015') {
-      ++buf;
-      EXPECT_CHAR('\012');
-      break;
-    } else if (*buf == '\012') {
-      ++buf;
-      break;
-    }
-    if (n_headers == max_headers) {
-      *ret = -1;
-      num_headers = n_headers;
-      return NULL;
-    }
+  // 128 bit token mask
+  unsigned long bm[8] = {0};
+  // Pointer to the start of the currently parsed block of 128 bytes
+  const char* prep_start = buf;
+  const char *p = buf;
 
-    if (! (n_headers != 0 && (*buf == ' ' || *buf == '\t')) && !(*buf >= 65 && * buf <= 90)) {
-      if (! token_char_map[(unsigned char)*buf]) {
-        *ret = -1;
-        num_headers = n_headers;
-        return NULL;
-      }
-      //headers[n_headers].name = buf;
-
-      /* Attempt to find a match in the index */
-      found = 0;
-      do {
-        unsigned long distance = buf - prep_start;
-        /* Check if the bitmaps are still valid. An assumption I make is that 
-           buf > 128 (i.e. the os will never allocate memory at address 0-128 */
-        if(unlikely(distance >= 128)) {   /* Bitmaps are too old, make new ones */
-          prep_start = buf;
-          distance = 0;
-          find_ranges(buf, buf_end, rr0, rr1);
-        } else if(distance >= 64) {          /* In the second half of the bitmap */
-          unsigned long index = rr0[1] >> (distance - 64);  /* Correct offset of the bitmap */
-          unsigned long find = TZCNT(index); /* Fine next set bit */
-          if((find < 64)) {                  /* Yey, we found a token */
-            buf += find;
-            found = 1;
-            break;
-          }
-          buf = prep_start + 128;            /* No token was found in the current bitmap */
-          continue;
-        }
-        unsigned long index = rr0[0]  >> (distance); /* In the first half of the bitmap */
-        unsigned long find = TZCNT(index);           /* Find next set bit */
-        if((find < 64)){                             /* Token found */
-          buf += find;
-          found = 1;
-          break;
-        }                                            /* Token not found, look at second half of bitmap */
-        index = rr0[1];
-        find = TZCNT(index);
-        if((find < 64)){
-          buf += 64+find - distance;
-          found = 1;
-          break;
-        }     
-
-        buf = prep_start + 128;
-      } while (buf < buf_end);
-
-      if(!found)
-        if(buf >= buf_end) {
-          *ret = -2;
-          //*num_headers = n_headers;
-          return NULL;
-        }
-      //headers[n_headers].name_len = buf - headers[n_headers].name;
-      ++buf;
-      CHECK_EOF();
-      while( (*buf == ' ' || *buf == '\t') ) {
-        ++buf;
-        CHECK_EOF();
-      }
-    } else {
-      //headers[n_headers].name = NULL;
-      //headers[n_headers].name_len = 0;
+  // Load the \r and : mask into rr13 and rr58
+  // Load 512 bytes at a time into the bit mask bm[8]
+  //   Load 32 bytes from the buffer into each register and compare against the mask registers
+
+  __m256i b0, b1, b2, b3;
+  const __m256i rr13    = _mm256_set1_epi8(13);
+  const __m256i rr58    = _mm256_set1_epi8(58);
+  int state = 0;
+
+  // Process 512b per loop
+  while(1) {
+    int i = 0;
+    // Load 512b into bm[0-7]
+    while ( i < 8 ) {
+      b0 = _mm256_loadu_si256((__m256i *)(buf + (64*i)+ 0));
+      b1 = _mm256_loadu_si256((__m256i *)(buf + (64*i)+32));
+      b2 = _mm256_loadu_si256((__m256i *)(buf + (64*i)+64));
+      b3 = _mm256_loadu_si256((__m256i *)(buf + (64*i)+96));
+      bm[i++]   =       _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(rr13, b0),_mm256_cmpeq_epi8(rr58, b0)) ) |
+        ((unsigned long)_mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(rr13, b1),_mm256_cmpeq_epi8(rr58, b1)) ) << 32);
+      bm[i++] =         _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(rr13, b2),_mm256_cmpeq_epi8(rr58, b2)) ) |
+        ((unsigned long)_mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(rr13, b3),_mm256_cmpeq_epi8(rr58, b3)) ) << 32);
     }
-    const char* token_start = buf;
 
-    found = 0;
+    // Each bit in the mask is either a : or a \r
+    int off = 0;
+    int shft = 0;
+    int bmOff = 0;
+    unsigned long bitmap, tz;
+    int slen = 0; //DELME
 
     do {
-      /* Too far */
-      unsigned long distance = buf - prep_start; /* Same algorithm as above */
-      if(unlikely(distance >= 128)) {
-        prep_start = buf;
-        distance = 0;
-        find_ranges(buf, buf_end, rr0, rr1);
-      } else if(distance >= 64) {
-        unsigned long index = rr1[1] >> (distance - 64);
-        unsigned long find = TZCNT(index);
-        if((find < 64)) {
-          buf += find;
-          found = 1;
-          break;
+      bitmap = bm[ bmOff ] >> shft;
+      tz = TZCNT(bitmap);
+      if ( tz < 64 ) { // tz is 64 if not found
+        p += tz;
+        //printf( " fnd >%.*s<\n", p-buf, buf );  
+        if ( state == 0 ) { // :
+          state = 1;
+          p += 2; buf = p;
+        } else { // \r
+          state = 0;
+          p += 2; buf = p;
+          if ( *p == '\r' ) goto wedone;
         }
-        buf = prep_start + 128;
-        continue;
-      }
-      unsigned long index = rr1[0]  >> (distance);
-      unsigned long find = TZCNT(index);
-      if((find < 64)){
-        buf += find;
-        found = 1;
-        break;
-      }
-      index = rr1[1];
-      find = TZCNT(index);
-      if((find < 64)){
-        buf += 64+find - distance;
-        found = 1;
-        break;
-      }
-
-      buf = prep_start + 128;
-    } while (buf < buf_end);
-
-    if(!found)
-      if(buf >= buf_end) {
-        *ret = -2;
-        num_headers = n_headers;
-        return NULL;
+      } else {
+        p += 64 - shft;
+        //printf("DELMEZ %.*s\n", 3, p) ;
       }
-
-    unsigned short two_char = *(unsigned short*)buf;
-
-    if( likely(two_char == 0x0a0d) ) {
-      //headers[n_headers].value_len = buf - token_start;
-      buf += 2;
-    } else if (unlikely(two_char & 0x0a == 0x0a)) {
-      //headers[n_headers].value_len = buf - token_start;
-      ++buf;
-    } else {
-      *ret = -1;
-      num_headers = n_headers;
-      return NULL;
-    }
-    //headers[n_headers].value = token_start;
+      off = p-prep_start;
+      //printf("DELME off=%d\n",off);
+      shft = off&0x3F;
+      bmOff = off/64;
+    } while ( bmOff < 8 ) ;
+    prep_start += 512;
+    buf = prep_start;
   }
-  num_headers = n_headers;
+
+wedone:
+// Host: server\r\n
+// User-Agent: Mozilla/5.0 (X11; Linux x86_64) Gecko/20130501 Firefox/30.0 AppleWebKit/600.00 Chrome/30.0.0000.0 Trident/10.0 Safari/600.00\r\n
+  //printf("%.*s\n", p - prep_start, prep_start);
   return buf;
 }
 
@@ -773,7 +687,7 @@ static const char *my_get_eol(const char *buf) {
   while (1)
   {
     __m256i v0 = _mm256_loadu_si256((const __m256i *)buf);
-    __m256i v1 = _mm256_cmpeq_epi8(v0, m13);     //  
+    __m256i v1 = _mm256_cmpeq_epi8(v0, m13);     
     unsigned long vmask = _mm256_movemask_epi8(v1);  
     if (vmask != 0) {
         buf += TZCNT(vmask) + 2;
@@ -890,6 +804,20 @@ static void parse_mine3( const char* buf ) {
   //__m256i b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,b10,b11,b12,b13,b14,b15;
   __m256i b0,b1,b2,b3,b4,b5,b6,b7;
 
+  const char *obuf = buf;
+  const char *sbuf = buf;
+
+  int i;  // msk[i] 
+  int t;
+  unsigned int s = 0;
+  int name_or_value = 0;
+
+  const char *block_start = obuf;
+
+new512:
+  i = 0;
+  buf = obuf;
+
   b0 = _mm256_loadu_si256((const __m256i *) (buf + 32*0)); // buf[0]
   b1 = _mm256_loadu_si256((const __m256i *) (buf + 32*1)); // buf[32]
   b2 = _mm256_loadu_si256((const __m256i *) (buf + 32*2)); // buf[64]
@@ -899,15 +827,14 @@ static void parse_mine3( const char* buf ) {
   b6 = _mm256_loadu_si256((const __m256i *) (buf + 32*6));
   b7 = _mm256_loadu_si256((const __m256i *) (buf + 32*7)); // 256 bytes
 
-  msk[0] =            _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b0, m13), _mm256_cmpeq_epi8(b0, m58) ) )  |
-     ((unsigned long long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b1, m13), _mm256_cmpeq_epi8(b1, m58) ) ) << 32);
-  msk[1] =            _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b2, m13), _mm256_cmpeq_epi8(b2, m58) ) )  |
-     ((unsigned long long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b3, m13), _mm256_cmpeq_epi8(b3, m58) ) ) << 32);
-  msk[2] =            _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b4, m13), _mm256_cmpeq_epi8(b4, m58) ) )  |
-     ((unsigned long long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b5, m13), _mm256_cmpeq_epi8(b5, m58) ) ) << 32);
-  msk[3] =            _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b6, m13), _mm256_cmpeq_epi8(b6, m58) ) )  |
-     ((unsigned long long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b7, m13), _mm256_cmpeq_epi8(b7, m58) ) ) << 32);
-
+  msk[0] = (unsigned int)_mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b0, m13), _mm256_cmpeq_epi8(b0, m58) ) )  |
+        ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b1, m13), _mm256_cmpeq_epi8(b1, m58) ) ) << 32);
+  msk[1] = (unsigned int)_mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b2, m13), _mm256_cmpeq_epi8(b2, m58) ) )  |
+        ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b3, m13), _mm256_cmpeq_epi8(b3, m58) ) ) << 32);
+  msk[2] = (unsigned int)_mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b4, m13), _mm256_cmpeq_epi8(b4, m58) ) )  |
+        ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b5, m13), _mm256_cmpeq_epi8(b5, m58) ) ) << 32);
+  msk[3] = (unsigned int)_mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b6, m13), _mm256_cmpeq_epi8(b6, m58) ) )  |
+        ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b7, m13), _mm256_cmpeq_epi8(b7, m58) ) ) << 32);
 
   b0 = _mm256_loadu_si256((const __m256i *) (buf + 32*8));
   b1 = _mm256_loadu_si256((const __m256i *) (buf + 32*9));
@@ -918,28 +845,36 @@ static void parse_mine3( const char* buf ) {
   b6 = _mm256_loadu_si256((const __m256i *) (buf + 32*14));
   b7 = _mm256_loadu_si256((const __m256i *) (buf + 32*15));
 
-  msk[4] =            _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b0, m13), _mm256_cmpeq_epi8(b0, m58) ) )  ^
-     ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b1, m13), _mm256_cmpeq_epi8(b1, m58) ) ) << 32);
-  msk[5] =            _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b2, m13), _mm256_cmpeq_epi8(b2, m58) ) )  ^
-     ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b3, m13), _mm256_cmpeq_epi8(b3, m58) ) ) << 32);
-  msk[6] =            _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b4, m13), _mm256_cmpeq_epi8(b4, m58) ) )  ^
-     ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b5, m13), _mm256_cmpeq_epi8(b5, m58) ) ) << 32);
-  msk[7] =            _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b6, m13), _mm256_cmpeq_epi8(b6, m58) ) )  ^
-     ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b7, m13), _mm256_cmpeq_epi8(b7, m58) ) ) << 32);
+  msk[4] = (unsigned int)_mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b0, m13), _mm256_cmpeq_epi8(b0, m58) ) )  ^
+        ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b1, m13), _mm256_cmpeq_epi8(b1, m58) ) ) << 32);
+  msk[5] = (unsigned int)_mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b2, m13), _mm256_cmpeq_epi8(b2, m58) ) )  ^
+        ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b3, m13), _mm256_cmpeq_epi8(b3, m58) ) ) << 32);
+  msk[6] = (unsigned int)_mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b4, m13), _mm256_cmpeq_epi8(b4, m58) ) )  ^
+        ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b5, m13), _mm256_cmpeq_epi8(b5, m58) ) ) << 32);
+  msk[7] = (unsigned int)_mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b6, m13), _mm256_cmpeq_epi8(b6, m58) ) )  ^
+        ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b7, m13), _mm256_cmpeq_epi8(b7, m58) ) ) << 32);
+
 
-  const char *obuf = buf;
-  const char *sbuf = buf;
   // "Host: server\r\n"
-  int i = 0;  // msk[i] 
-  int t;
   do {
 
-    const char *block_start = obuf+64*i;
+    block_start = obuf+64*i;
 
     while(1) {
-      t = TZCNT((msk[i]>>(buf-block_start)));
+      s = buf-block_start;
+      t = TZCNT((msk[i]>>s));
+      //printf("DELME mski %016llx shift %d\n", msk[i], s );
+      //printf("DELME shft %016llx\n", msk[i]>>s );
       if ( t < 64 ) {
-        buf += t+2;
+        buf += t;
+        if ( name_or_value == 1 ) {
+          if ( *buf == ':' ) { buf += 1; continue; } // : in value field
+          name_or_value = 0;
+        } else {
+          name_or_value = 1;
+        }
+        //printf( " fnd >%.*s<\n", buf-sbuf, sbuf );  
+        buf += 2; if ( *buf == '\r' ) break; // \r\n\r\n marks the end
         sbuf = buf;
         if ( (buf-block_start)> 64 ) break; // TODO?
       } else {
@@ -948,42 +883,54 @@ static void parse_mine3( const char* buf ) {
       }
 
     }
-  
-    i+=1; 
+
+    i+=1;
+    if ( buf[0] == '\r' ) goto done;
   } while ( i < 8 && buf[0] != '\r' );
-  
-  
+
+  obuf += 512; 
+  goto new512;
+done: 
+  i += 1;
 }
 
 static void parse_mine2( const char* buf ) {
-  unsigned long msk;
-  int i=0,t; // 32B index
+  unsigned int msk;
+  int i=0,tz; // 32B index
   int cnt = 0;
+  unsigned int shifted;
   const char *sbuf = buf;
   const char *obuf = buf;
+  int name_or_value = 0;
+
   do {
-    int shifted = 0;
     const char *block_start = obuf+32*i;
     __m256i b0 = _mm256_loadu_si256((const __m256i *) block_start);
     msk = _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b0, m13), _mm256_cmpeq_epi8(b0, m58) ) );
-
     while (1) {
 
       shifted = buf-block_start;
-      t = TZCNT((msk >> shifted));
-      if ( t < 32 ) {
-        if ( t == 0 ) break;
-        buf += t+2;
+      if ( shifted >= 32 ) break;
+      tz = TZCNT((msk >> shifted));
+      if ( tz < 32 ) {
+        buf += tz;
+        if ( name_or_value == 1 ) {
+          if ( *buf == ':' ) { buf += 1; continue; } // : in value field
+          name_or_value = 0;
+        } else {
+          name_or_value = 1;
+        }
+        //printf( " fnd >%.*s<\n", buf-sbuf, sbuf );  
+        buf += 2; if ( *buf == '\r' ) break; // \r\n\r\n marks the end
         sbuf = buf;
       } else {
-        buf = block_start + 32;
+        buf += 32 - shifted;
         break;
       }
 
     }
     i+=1;
-    cnt += 1;
-  } while ( cnt < 20 && buf[0] != '\r' );
+  } while ( *buf != '\r' );
 }
 
 static void parse_sse4( const char* buf ) {
@@ -998,13 +945,20 @@ static void parse_mysse4( const char* buf ) {
     buf = my_get_eol( buf );
   }
 }
+static char buf[8096] = "Host: server\r\n"
+"User-Agent: Mozilla/5.0 (X11; Linux x86_64) Gecko/20130501 Firefox/30.0 AppleWebKit/600.00 Chrome/30.0.0000.0 Trident/10.0 Safari/600.00\r\n"
+"Cookie: uid=12345678901234567890; __utma=1.1234567890.1234567890.1234567890.1234567890.12; wd=2560x1600\r\n"
+"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,* /*;q=0.8\r\n"
+"Accept-Language: en-US,en;q=0.5\r\n"
+"Connection: keep-alive\r\n\r\n";
+static char buf2[8096] = "Host: localhost:8080\r\nUser-Agent: python-requests/2.31.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: * /*\r\nConnection: keep-alive\r\nCookie: foo=b=ar\r\nContent-Length: 0\r\n\r\n";
 
 static void BM_SlowParse(benchmark::State& state) {
   // Perform setup here
   std::string text = "Host: server\n"
 "User-Agent: Mozilla/5.0 (X11; Linux x86_64) Gecko/20130501 Firefox/30.0 AppleWebKit/600.00 Chrome/30.0.0000.0 Trident/10.0 Safari/600.00\n"
 "Cookie: uid=12345678901234567890; __utma=1.1234567890.1234567890.1234567890.1234567890.12; wd=2560x1600\n"
-"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\n"
+"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,* /*;q=0.8\n"
 "Accept-Language: en-US,en;q=0.5\n"
 "Connection: keep-alive\n";
 
@@ -1015,13 +969,6 @@ static void BM_SlowParse(benchmark::State& state) {
 }
 static void BM_sse4_get_eol(benchmark::State& state) {
   // Perform setup here
-  char buf[8096] = "Host: server\r\n"
-"User-Agent: Mozilla/5.0 (X11; Linux x86_64) Gecko/20130501 Firefox/30.0 AppleWebKit/600.00 Chrome/30.0.0000.0 Trident/10.0 Safari/600.00\r\n"
-"Cookie: uid=12345678901234567890; __utma=1.1234567890.1234567890.1234567890.1234567890.12; wd=2560x1600\r\n"
-"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n"
-"Accept-Language: en-US,en;q=0.5\r\n"
-"Connection: keep-alive\r\n\r\n\r\n\r\n\r\n";
-
   for (auto _ : state) {
     // This code gets timed
     parse_sse4(buf);
@@ -1029,13 +976,6 @@ static void BM_sse4_get_eol(benchmark::State& state) {
 }
 static void BM_my_get_eol(benchmark::State& state) {
   // Perform setup here
-  char buf[8096] = "Host: server\r\n"
-"User-Agent: Mozilla/5.0 (X11; Linux x86_64) Gecko/20130501 Firefox/30.0 AppleWebKit/600.00 Chrome/30.0.0000.0 Trident/10.0 Safari/600.00\r\n"
-"Cookie: uid=12345678901234567890; __utma=1.1234567890.1234567890.1234567890.1234567890.12; wd=2560x1600\r\n"
-"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n"
-"Accept-Language: en-US,en;q=0.5\r\n"
-"Connection: keep-alive\r\n\r\n\r\n\r\n\r\n";
-
   for (auto _ : state) {
     // This code gets timed
     parse_mysse4(buf);
@@ -1044,15 +984,6 @@ static void BM_my_get_eol(benchmark::State& state) {
 
 static void BM_my_header_parse(benchmark::State& state) {
   // Perform setup here
-  char buf[8096] = "Host: server\r\n"
-"User-Agent: Mozilla/5.0 (X11; Linux x86_64) Gecko/20130501 Firefox/30.0 AppleWebKit/600.00 Chrome/30.0.0000.0 Trident/10.0 Safari/600.00\r\n"
-"Cookie: uid=12345678901234567890; __utma=1.1234567890.1234567890.1234567890.1234567890.12; wd=2560x1600\r\n"
-"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n"
-"Accept-Language: en-US,en;q=0.5\r\n"
-"Connection: keep-alive\r\n\r\nzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz           "
-"                                                                         "
-"                                                                         ";
-
   for (auto _ : state) {
     // This code gets timed
     parse_mine(buf);
@@ -1060,15 +991,6 @@ static void BM_my_header_parse(benchmark::State& state) {
 }
 static void BM_my2_header_parse(benchmark::State& state) {
   // Perform setup here
-  char buf[8096] = "Host: server\r\n"
-"User-Agent: Mozilla/5.0 (X11; Linux x86_64) Gecko/20130501 Firefox/30.0 AppleWebKit/600.00 Chrome/30.0.0000.0 Trident/10.0 Safari/600.00\r\n"
-"Cookie: uid=12345678901234567890; __utma=1.1234567890.1234567890.1234567890.1234567890.12; wd=2560x1600\r\n"
-"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n"
-"Accept-Language: en-US,en;q=0.5\r\n"
-"Connection: keep-alive\r\n\r\nzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz           "
-"                                                                         "
-"                                                                         ";
-
   for (auto _ : state) {
     // This code gets timed
     parse_mine2(buf);
@@ -1076,15 +998,6 @@ static void BM_my2_header_parse(benchmark::State& state) {
 }
 static void BM_my3_header_parse(benchmark::State& state) {
   // Perform setup here
-  char buf[8096] = "Host: server\r\n"
-"User-Agent: Mozilla/5.0 (X11; Linux x86_64) Gecko/20130501 Firefox/30.0 AppleWebKit/600.00 Chrome/30.0.0000.0 Trident/10.0 Safari/600.00\r\n"
-"Cookie: uid=12345678901234567890; __utma=1.1234567890.1234567890.1234567890.1234567890.12; wd=2560x1600\r\n"
-"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n"
-"Accept-Language: en-US,en;q=0.5\r\n"
-"Connection: keep-alive\r\n\r\nzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz           "
-"                                                                         "
-"                                                                         ";
-
   for (auto _ : state) {
     // This code gets timed
     parse_mine3(buf);
@@ -1094,35 +1007,19 @@ static void BM_my3_header_parse(benchmark::State& state) {
 
 static void BM_old_header_parse(benchmark::State& state) {
   // Perform setup here
-  char buf[8096] = "Host: server\r\n"
-"User-Agent: Mozilla/5.0 (X11; Linux x86_64) Gecko/20130501 Firefox/30.0 AppleWebKit/600.00 Chrome/30.0.0000.0 Trident/10.0 Safari/600.00\r\n"
-"Cookie: uid=12345678901234567890; __utma=1.1234567890.1234567890.1234567890.1234567890.12; wd=2560x1600\r\n"
-"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n"
-"Accept-Language: en-US,en;q=0.5\r\n"
-"Connection: keep-alive\r\n\r\nzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz           "
-"                                                                         "
-"                                                                         ";
   int ret = 0;
   for (auto _ : state) {
     // This code gets timed
-    parse_headers(buf,buf+512,&ret);
+    parse_headers(buf,buf+2048,&ret);
   }
 }
 
 static void BM_avx2_header_parse(benchmark::State& state) {
   // Perform setup here
-  char buf[8096] = "Host: server\r\n"
-"User-Agent: Mozilla/5.0 (X11; Linux x86_64) Gecko/20130501 Firefox/30.0 AppleWebKit/600.00 Chrome/30.0.0000.0 Trident/10.0 Safari/600.00\r\n"
-"Cookie: uid=12345678901234567890; __utma=1.1234567890.1234567890.1234567890.1234567890.12; wd=2560x1600\r\n"
-"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n"
-"Accept-Language: en-US,en;q=0.5\r\n"
-"Connection: keep-alive\r\n\r\nzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz           "
-"                                                                         "
-"                                                                         ";
   int ret = 0;
   for (auto _ : state) {
     // This code gets timed
-    parse_headers_avx2(buf,buf+512,&ret);
+    parse_headers_avx2(buf,buf+2048,&ret);
   }
 }
 
@@ -1139,3 +1036,35 @@ BENCHMARK(BM_old_header_parse);
 BENCHMARK(BM_avx2_header_parse);
 BENCHMARK_MAIN();
 
+/*
+
+int main() {
+  char buf[8096] = "Host: server\r\n"
+"User-Agent: Mozilla/5.0 (X11; Linux x86_64) Gecko/20130501 Firefox/30.0 AppleWebKit/600.00 Chrome/30.0.0000.0 Trident/10.0 Safari/600.00\r\n"
+"Cookie: uid=12345678901234567890; __utma=1.1234567890.1234567890.1234567890.1234567890.12; wd=2560x1600\r\n"
+"Cookie: uid=12345678901234567890; __utma=1.1234567890.1234567890.1234567890.1234567890.12; wd=2560x1600\r\n"
+"Cookie: uid=12345678901234567890; __utma=1.1234567890.1234567890.1234567890.1234567890.12; wd=2560x1600\r\n"
+"Cookie: uid=12345678901234567890; __utma=1.1234567890.1234567890.1234567890.1234567890.12; wd=2560x1600\r\n"
+"Cookie: uid=12345678901234567890; __utma=1.1234567890.1234567890.1234567890.1234567890.12; wd=2560x1600\r\n"
+"Cookie: uid=12345678901234567890; __utma=1.1234567890.1234567890.1234567890.1234567890.12; wd=2560x1600\r\n"
+"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,* /*;q=0.8\r\n"
+"Accept-Language: en-US,en;q=0.5\r\n"
+"Connection: keep-alive\r\n\r\n";
+  //strcpy(buf,"Host: localhost:8080\r\nUser-Agent: curl/7.68.0\r\nAccept: * /*\r\n\r\n");
+  //strcpy(buf,"Host: localhost:8080\r\nUser-Agent: python-requests/2.31.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: * /*\r\nConnection: keep-alive\r\nCookie: foo=b=ar\r\nContent-Length: 0\r\n\r\n");
+
+
+  int ret = 0;
+  parse_headers_avx2(buf,buf+512,&ret);
+  //parse_headers(buf,buf+2048,&ret);
+  //parse_mine3(buf);
+  printf(" ret=%d\n",ret);
+
+  //unsigned long long l = 0x80008020ull;
+  //unsigned int s = 7;
+  //printf(" WTF %08x\n", l >> s );
+
+}
+
+
+*/
diff --git a/gbench/readme b/gbench/readme
index ccffad2..03d21d5 100644
--- a/gbench/readme
+++ b/gbench/readme
@@ -1,7 +1,7 @@
 
 Benches
  g++ tst.cpp -std=c++11 -lbenchmark -lpthread -o tst
- g++ parse.cpp -std=c++11 -lbenchmark -lpthread -o parse
+ g++ parse.cpp -msse4.2 -mavx2 -std=c++11 -lbenchmark -lpthread -o parse
 
 Test code first:
   g++ t.cpp -msse4.2 -mavx2 -std=c++11 -lpthread -o t
@@ -16,3 +16,8 @@ Install gbench
   cmake -E chdir "build" cmake -DBENCHMARK_DOWNLOAD_DEPENDENCIES=on -DCMAKE_BUILD_TYPE=Release ../
   cmake --build "build" --config Release
   sudo cmake --build "build" --config Release --target install
+
+
+sudo cpupower frequency-set --governor performance
+./mybench
+sudo cpupower frequency-set --governor powersave
diff --git a/gbench/run b/gbench/run
new file mode 100755
index 0000000..1c16508
--- /dev/null
+++ b/gbench/run
@@ -0,0 +1,4 @@
+
+sudo sh -c "echo performance | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor"
+./$1
+sudo sh -c "echo powersave | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor"
diff --git a/gbench/string.cpp b/gbench/string.cpp
index 8e7818e..a4eb0e0 100644
--- a/gbench/string.cpp
+++ b/gbench/string.cpp
@@ -19,28 +19,58 @@
 #endif
 
 static inline bool _isdigit(char c)  { return  c >= '0'  && c <= '9'; }
+static inline bool _isdigit2(unsigned char c)  { return  (c & 0xF0) == 0x30; }
+#define IS_DIGIT(c) ((c&0xF0) == 0x30)
+#define IS_DIGIT2(c) (c >= '0' && c <= '9')
 
-static void _strtol( char* buf ) {
+static long _strtol( char* buf ) {
   char * endptr = buf+4;
-  long n = strtol(buf, &endptr, 10);
+  return strtol(buf, &endptr, 10);
 }
-static void my_strtol( char* s ) {
+static long my_strtol( char* s ) {
   long l;
   while (_isdigit(*s)) {
     l = (l * 10) + (*s++ - '0');
   }
+  return l;
 }
+static long my_strtol2( char* s ) {
+  long l;
+  while (_isdigit2(*s)) {
+    l = (l * 10) + (*s++ - 0x30);
+  }
+  return l;
+}
+static long my_strtol3( char* s ) {
+  long l;
+  while (IS_DIGIT2(*s)) {
+    l = (l * 10) + (*s++ - '0');
+  }
+  return l;
+}
+
+
 
 static void BM_strtol(benchmark::State& state) {
-  char buf[8096] = "1234 ";
-  for (auto _ : state) { _strtol(buf); }
+  char buf[8096] = "123z4 ";
+  for (auto _ : state) { long x = _strtol(buf);  }
 }
 static void BM_my_strtol(benchmark::State& state) {
-  char buf[8096] = "1234 ";
-  for (auto _ : state) { my_strtol(buf); }
+  char buf[8096] = "123z4 ";
+  for (auto _ : state) { long x = my_strtol(buf); }
+}
+static void BM_my_strtol2(benchmark::State& state) {
+  char buf[8096] = "123z4 ";
+  for (auto _ : state) { long x = my_strtol2(buf); }
+}
+static void BM_my_strtol3(benchmark::State& state) {
+  char buf[8096] = "123z4 ";
+  for (auto _ : state) { long x = my_strtol3(buf); }
 }
 
 BENCHMARK(BM_strtol);
 BENCHMARK(BM_my_strtol);
+BENCHMARK(BM_my_strtol2);
+BENCHMARK(BM_my_strtol3);
 BENCHMARK_MAIN();
 
diff --git a/gbench/t.cpp b/gbench/t.cpp
index 479459e..b587e64 100644
--- a/gbench/t.cpp
+++ b/gbench/t.cpp
@@ -1,13 +1,14 @@
 
-
 #include <stdlib.h>
 #include <stdint.h>
 #include <string>
+#include <cstring>
 #include <x86intrin.h>
 #ifdef __AVX2__
 #include <immintrin.h>
 #endif
 
+#include <benchmark/benchmark.h>
 
 #if __GNUC__ >= 3
 #define likely(x) __builtin_expect(!!(x), 1)
@@ -35,6 +36,58 @@
     CHECK_EOF();                                                                                                                   \
     EXPECT_CHAR_NO_CHECK(ch);
 
+// Table for converting to lower case
+#define TOLC(c) __lct[(unsigned char)c]
+static const unsigned char __lct[] __attribute__((aligned(64))) = {
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+  0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+  0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+  0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+  0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+  0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+  0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+  0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+  0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+  0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+  0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+  0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+  0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+  0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+  0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+  0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+  0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+  0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+  0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+  0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+  0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+  0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+  0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+  0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+  0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+  0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+  0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+  0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+  0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
+};
+
+
+static const char *token_char_map = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+                                    "\0\1\0\1\1\1\1\1\0\0\1\1\0\1\1\0\1\1\1\1\1\1\1\1\1\1\0\0\0\0\0\0"
+                                    "\0\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\0\0\0\1\1"
+                                    "\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\0\1\0\1\0"
+                                    "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+                                    "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+                                    "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+                                    "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
+
+static unsigned long TZCNT(unsigned long long in) {
+  unsigned long res;
+  asm("tzcnt %1, %0\n\t" : "=r"(res) : "r"(in));
+  return res;
+}
 
 std::size_t slow_hparse(std::string &text) noexcept
 {
@@ -117,86 +170,539 @@ FOUND_CTL:
     return buf;
 }
 
-static unsigned long TZCNT(unsigned long long in) {
-  unsigned long res;
-  asm("tzcnt %1, %0\n\t" : "=r"(res) : "r"(in));
-  return res;
+static const char *parse_headers(const char *buf, const char *buf_end, int *ret)
+{
+    int num_headers = 0;
+    int max_headers = 20;
+    if ( buf_end <= buf ) {
+      *ret = -2;
+      return NULL;
+    }
+    for (;; ++num_headers) {
+        CHECK_EOF();
+        if (*buf == '\015') {
+            ++buf;
+            EXPECT_CHAR('\012');
+            break;
+        } else if (*buf == '\012') {
+            ++buf;
+            break;
+        }
+        if (num_headers == max_headers) {
+            *ret = -1;
+            return NULL;
+        }
+        //printf(">%.*s<", 10, buf);
+        // Listed small to larger - probably best as most used TODO check bounds
+        switch ( TOLC(*buf) ) {
+          case 'h': // Host
+            ////headers[*num_headers].name = buf;
+            //headers[*num_headers].name_len = 4;
+            buf += 6;
+            goto hvalue;
+          case 'c': 
+            if ( buf[6] == ':' ) { // Cookie:
+              //headers[*num_headers].name = buf;
+              //headers[*num_headers].name_len = 6;
+              buf += 8;
+              goto hvalue;
+            } 
+            if ( buf[10] == ':' ) { // Connection: 
+              //headers[*num_headers].name = buf;
+              //headers[*num_headers].name_len = 10;
+              buf += 12;
+              goto hvalue;
+            }
+            if ( buf[11] == ':' ) { // Content-MD5: 
+              //headers[*num_headers].name = buf;
+              //headers[*num_headers].name_len = 11;
+              buf += 13;
+              goto hvalue;
+            }
+            if ( buf[12] == ':' ) { // Content-Type: 
+              //headers[*num_headers].name = buf;
+              //headers[*num_headers].name_len = 12;
+              buf += 14;
+              //goto hvalue;
+              //if ( buf[0] == 'a' && buf[13] == 'r' ) { //"application/mrpacker"
+                //mrr->flags = 2;
+              //} 
+              buf = get_token_to_eol(buf, buf_end, ret); 
+              goto skipvalue;
+            }
+            if ( buf[13] == ':' ) { // Cache-Control:
+              //headers[*num_headers].name = buf;
+              //headers[*num_headers].name_len = 13;
+              buf += 15;
+              goto hvalue;
+            }
+            if ( buf[14] == ':' ) { // Content-Length:   
+              //headers[*num_headers].name = buf;
+              //headers[*num_headers].name_len = 14;
+              buf += 16;
+              goto hvalue;
+            }
+            if ( buf[16] == ':' ) { // CF-Connecting-IP
+              //headers[*num_headers].name = buf;
+              //headers[*num_headers].name_len = 16;
+              buf += 18;
+              //mrr->ip = buf;
+              buf = get_token_to_eol(buf, buf_end, ret); 
+              //mrr->ip_len = headers[*num_headers].value_len;
+              goto skipvalue;
+            }
+            break;
+            //printf( "%.*s\n" , 10, buf);
+            //printf( "Host: %08x == %08x\n" , MR_CHAR4_INT('o', 's', 't',':'), *((unsigned int *)(buf+1)));
+          case 'd':
+            if ( buf[4] == ':' ) { // Date:
+              //headers[*num_headers].name = buf;
+              //headers[*num_headers].name_len = 4;
+              buf += 6;
+              goto hvalue;
+            }
+            if ( buf[3] == ':' ) { // DNT:       
+              //headers[*num_headers].name = buf;
+              //headers[*num_headers].name_len = 3;
+              buf += 5;
+              goto hvalue;
+            }
+            break;
+          case 'x':
+            if ( buf[9] == ':' ) { // X-Real-IP
+              //headers[*num_headers].name = buf;
+              //headers[*num_headers].name_len = 9;
+              buf += 11;
+              //mrr->ip = buf;
+              buf = get_token_to_eol(buf, buf_end, ret); 
+              //mrr->ip_len = headers[*num_headers].value_len;
+              goto skipvalue;
+            }
+            if ( buf[15] == ':' ) { // X-Forwarded-For:       
+              //headers[*num_headers].name = buf;
+              //headers[*num_headers].name_len = 15;
+              buf += 17;
+              //mrr->ip = buf;
+              buf = get_token_to_eol(buf, buf_end, ret); 
+              //mrr->ip_len = headers[*num_headers].value_len;
+              goto skipvalue;
+              //goto hvalue;
+            }
+            if ( buf[16] == ':' ) { // X-Forwarded-Host:       
+              //headers[*num_headers].name = buf;
+              //headers[*num_headers].name_len = 16;
+              buf += 18;
+              goto hvalue;
+            }
+            break;
+          case 'f':
+            if ( buf[5] == ':' ) { // From:
+              //headers[*num_headers].name = buf;
+              //headers[*num_headers].name_len = 5;
+              buf += 7;
+              goto hvalue;
+            }
+            if ( buf[9] == ':' ) { // Forwarded:     
+              //headers[*num_headers].name = buf;
+              //headers[*num_headers].name_len = 9;
+              buf += 11;
+              goto hvalue;
+            }
+            break;
+          case 'i': 
+            if ( buf[13] == ':' ) { // If-None-Match:  
+              //headers[*num_headers].name = buf;
+              //headers[*num_headers].name_len = 13;
+              buf += 15;
+              goto hvalue;
+            }
+            if ( buf[17] == ':' ) { // If-Modified-Since:  
+              //headers[*num_headers].name = buf;
+              //headers[*num_headers].name_len = 17;
+              buf += 19;
+              goto hvalue;
+            }
+            break;
+          case 'o':
+            //headers[*num_headers].name = buf;
+            //headers[*num_headers].name_len = 6;
+            buf += 8;
+            goto hvalue;
+          case 'r':
+            //headers[*num_headers].name = buf;
+            //headers[*num_headers].name_len = 7;
+            buf += 9;
+            goto hvalue;
+          case 't': // Transfer-Encoding:
+            //headers[*num_headers].name = buf;
+            //headers[*num_headers].name_len = 17;
+            buf += 19;
+            goto hvalue;
+          case 'u':
+            if ( buf[10] == ':' ) { // User-Agent:     
+              //headers[*num_headers].name = buf;
+              //headers[*num_headers].name_len = 10;
+              buf += 12;
+              goto hvalue;
+            }
+            if ( buf[25] == ':' ) { // Upgrade-Insecure-Requests:     
+              //headers[*num_headers].name = buf;
+              //headers[*num_headers].name_len = 25;
+              buf += 27;
+              goto hvalue;
+            }
+            break;
+          case 'a':
+            if ( buf[6] == ':' ) { // Accept: 
+              //headers[*num_headers].name = buf;
+              //headers[*num_headers].name_len = 6;
+              buf += 8;
+              goto hvalue;
+            }
+            if ( buf[13] == ':' ) { // Authorization:   
+              //headers[*num_headers].name = buf;
+              //headers[*num_headers].name_len = 13;
+              buf += 15;
+              goto hvalue;
+            }
+            if ( buf[14] == ':' ) { // Accept-Charset:           
+              //headers[*num_headers].name = buf;
+              //headers[*num_headers].name_len = 14;
+              buf += 16;
+              goto hvalue;
+            }
+            if ( buf[15] == ':' ) { // Accept-Encoding: -Datetime
+              //headers[*num_headers].name = buf;
+              //headers[*num_headers].name_len = 15;
+              buf += 17;
+              goto hvalue;
+            }
+            if ( buf[16] == ':' ) { // Accept-Language:
+              //headers[*num_headers].name = buf;
+              //headers[*num_headers].name_len = 15;
+              buf += 17;
+              goto hvalue;
+            }
+            if ( buf[29] == ':' ) { // Access-Control-Request-Method:     
+              //headers[*num_headers].name = buf;
+              //headers[*num_headers].name_len = 29;
+              buf += 31;
+              goto hvalue;
+            }
+            if ( buf[30] == ':' ) { // Access-Control-Request-Headers:     
+              //headers[*num_headers].name = buf;
+              //headers[*num_headers].name_len = 30;
+              buf += 32;
+              goto hvalue;
+            }
+            break;
+
+        }
+        if (!(num_headers != 0 && (*buf == ' ' || *buf == '\t'))) {
+            /* parsing name, but do not discard SP before colon, see
+             * http://www.mozilla.org/security/announce/2006/mfsa2006-33.html */
+            //headers[*num_headers].name = buf;
+            static const char ranges1[] = "\x00 "  /* control chars and up to SP */
+                                                      "\"\""   /* 0x22 */
+                                                      "()"     /* 0x28,0x29 */
+                                                      ",,"     /* 0x2c */
+                                                      "//"     /* 0x2f */
+                                                      ":@"     /* 0x3a-0x40 */
+                                                      "[]"     /* 0x5b-0x5d */
+                                                      "{\377"; /* 0x7b-0xff */
+            int found;
+            buf = findchar(buf, buf_end, ranges1, sizeof(ranges1) - 1, &found);
+            if (!found) {
+                CHECK_EOF();
+            }
+            while (1) {
+                if (*buf == ':') {
+                    break;
+                } else if (!token_char_map[(unsigned char)*buf]) {
+                    *ret = -1;
+                    return NULL;
+                }
+                ++buf;
+                CHECK_EOF();
+            }
+            //if ((headers[*num_headers].name_len = buf - headers[*num_headers].name) == 0) {
+                //*ret = -1;
+                //return NULL;
+            //}
+            ++buf;
+            for (;; ++buf) {
+                CHECK_EOF();
+                if (!(*buf == ' ' || *buf == '\t')) {
+                    break;
+                }
+            }
+        } else {
+            //headers[*num_headers].name = NULL;
+            //headers[*num_headers].name_len = 0;
+        }
+hvalue:
+        if ((buf = get_token_to_eol(buf, buf_end, ret)) == NULL) {
+            return NULL;
+        }
+skipvalue:
+      ;
+    }
+    return buf;
 }
 
-__m256i m13 = _mm256_set1_epi8(13);
-static const char *my_get_eol(const char *buf) {
+static void find_ranges32(__m256i b0, unsigned long *range0, unsigned long *range1) {
+  const __m256i rr0 = _mm256_set1_epi8(0x00 - 1);
+  const __m256i rr1 = _mm256_set1_epi8(0x1f + 1);
+  const __m256i rr2 = _mm256_set1_epi8(0x3a);
+  const __m256i rr4 = _mm256_set1_epi8(0x7f);
+  const __m256i rr7 = _mm256_set1_epi8(0x09);
 
-  while (1)
-  {
-    __m256i v0 = _mm256_loadu_si256((const __m256i *)buf);
-    __m256i v1 = _mm256_cmpeq_epi8(v0, m13);
-    unsigned long vmask = _mm256_movemask_epi8(v1);
-    if (vmask != 0) {
-        buf += TZCNT(vmask) + 2;
-        break;
-    }
-    buf += 32; //pSrc1++;
-  }
-  return buf;
+  /* 0<=x */
+  __m256i gz0 = _mm256_cmpgt_epi8(b0, rr0);
+  /* 0=<x<=1f */
+  __m256i z_1f_0 = _mm256_and_si256(_mm256_cmpgt_epi8(rr1, b0), gz0);
+  /* 0<=x<=1f || x==3a */
+  __m256i range0_0 = _mm256_or_si256(_mm256_cmpeq_epi8(rr2, b0), z_1f_0);
+  /* 0<=x<9 || 9<x<=1f || x==7f */
+  __m256i range1_0 = _mm256_or_si256(_mm256_cmpeq_epi8(rr4, b0), _mm256_andnot_si256(_mm256_cmpeq_epi8(b0, rr7), z_1f_0));
+  /* Generate bit masks */
+  unsigned int r0 = _mm256_movemask_epi8(range0_0);
+  /* Combine 32bit masks into a single 64bit mask */
+  *range0 = r0;
+  r0 = _mm256_movemask_epi8(range1_0);
+  *range1 = r0;
 }
 
-static void parse_sse4( const char* buf ) {
-  int ret = 0;
-  while ( ret == 0 && buf != NULL && buf[0] != '\r' ) {
-    buf = get_token_to_eol( buf, buf+512, &ret); 
-    printf("%d - %.16s\n",(int)buf[0],buf);
-  }
+/* Parse only 64 bytes */
+static void find_ranges64(__m256i b0, __m256i b1, unsigned long *range0, unsigned long *range1) {
+  const __m256i rr0 = _mm256_set1_epi8(0x00 - 1);
+  const __m256i rr1 = _mm256_set1_epi8(0x1f + 1);
+  const __m256i rr2 = _mm256_set1_epi8(0x3a);
+  const __m256i rr4 = _mm256_set1_epi8(0x7f);
+  const __m256i rr7 = _mm256_set1_epi8(0x09);
+
+  /* 0<=x */
+  __m256i gz0 = _mm256_cmpgt_epi8(b0, rr0);
+  __m256i gz1 = _mm256_cmpgt_epi8(b1, rr0);
+  /* 0=<x<=1f */
+  __m256i z_1f_0 = _mm256_and_si256(_mm256_cmpgt_epi8(rr1, b0), gz0);
+  __m256i z_1f_1 = _mm256_and_si256(_mm256_cmpgt_epi8(rr1, b1), gz1);
+  /* 0<=x<=1f || x==3a */
+  __m256i range0_0 = _mm256_or_si256(_mm256_cmpeq_epi8(rr2, b0), z_1f_0);
+  __m256i range0_1 = _mm256_or_si256(_mm256_cmpeq_epi8(rr2, b1), z_1f_1);
+  /* 0<=x<9 || 9<x<=1f || x==7f */
+  __m256i range1_0 = _mm256_or_si256(_mm256_cmpeq_epi8(rr4, b0), _mm256_andnot_si256(_mm256_cmpeq_epi8(b0, rr7), z_1f_0));
+  __m256i range1_1 = _mm256_or_si256(_mm256_cmpeq_epi8(rr4, b1), _mm256_andnot_si256(_mm256_cmpeq_epi8(b1, rr7), z_1f_1));
+  /* Generate bit masks */
+  unsigned int r0 = _mm256_movemask_epi8(range0_0);
+  unsigned int r1 = _mm256_movemask_epi8(range0_1);
+  /* Combine 32bit masks into a single 64bit mask */
+  *range0 = r0 ^ ((unsigned long)r1 << 32);
+  r0 = _mm256_movemask_epi8(range1_0);
+  r1 = _mm256_movemask_epi8(range1_1);
+  *range1 = r0 ^ ((unsigned long)r1 << 32);     
 }
-static void parse_mysse4( const char* buf ) {
-  int ret = 0;
-  while ( ret == 0 && buf != NULL && buf[0] != '\r' ) {
-    buf = my_get_eol( buf );
-    printf("%d - %.16s\n",(int)buf[0],buf);
+
+/* This function parses 128 bytes at a time, creating bitmap of all interesting tokens */
+static void find_ranges(const char* buf, const char* buf_end, unsigned long *range0, unsigned long *range1) {
+  const __m256i rr0 = _mm256_set1_epi8(0x00 - 1);
+  const __m256i rr1 = _mm256_set1_epi8(0x1f + 1);
+  const __m256i rr2 = _mm256_set1_epi8(0x3a);
+  const __m256i rr4 = _mm256_set1_epi8(0x7f);
+  const __m256i rr7 = _mm256_set1_epi8(0x09);
+
+  __m256i b0, b1, b2, b3;
+  unsigned char tmpbuf[32];
+  int i;
+  int dist;
+
+  if((dist = buf_end - buf) < 128) {
+    //memcpy(tmpbuf, buf + (dist & (-32)), dist & 31);
+    for (i=0; i < (dist & 31); i++) tmpbuf[i] = buf[ (dist & (-32)) + i];
+    if (dist >= 96) {
+      b0 = _mm256_loadu_si256((const __m256i_u*) buf + 32*0);
+      b1 = _mm256_loadu_si256((const __m256i_u*) buf + 32*1);
+      b2 = _mm256_loadu_si256((const __m256i_u*) buf + 32*2);
+      b3 = _mm256_loadu_si256((const __m256i_u*) tmpbuf);
+    } else if (dist >= 64) {
+      b0 = _mm256_loadu_si256((const __m256i_u*) buf + 32*0);
+      b1 = _mm256_loadu_si256((const __m256i_u*) buf + 32*1);
+      b2 = _mm256_loadu_si256((const __m256i_u*) tmpbuf);
+      b3 = _mm256_setzero_si256();
+    } else {
+      if(dist < 32) {
+        b0 = _mm256_loadu_si256((const __m256i_u*)tmpbuf);
+        return find_ranges32(b0, range0, range1);
+      } else {
+        b0 = _mm256_loadu_si256((const __m256i_u*) buf + 32*0);
+        b1 = _mm256_loadu_si256((const __m256i_u*)tmpbuf);
+        return find_ranges64(b0, b1, range0, range1);
+      }
+    }
+  } else {
+    /* Load 128 bytes */
+    b0 = _mm256_loadu_si256((const __m256i_u*) buf + 32*0);
+    b1 = _mm256_loadu_si256((const __m256i_u*) buf + 32*1);
+    b2 = _mm256_loadu_si256((const __m256i_u*) buf + 32*2);
+    b3 = _mm256_loadu_si256((const __m256i_u*) buf + 32*3);
   }
+
+  /* 0<=x */
+  __m256i gz0 = _mm256_cmpgt_epi8(b0, rr0);
+  __m256i gz1 = _mm256_cmpgt_epi8(b1, rr0);
+  __m256i gz2 = _mm256_cmpgt_epi8(b2, rr0);
+  __m256i gz3 = _mm256_cmpgt_epi8(b3, rr0);
+  /* 0=<x<=1f */
+  __m256i z_1f_0 = _mm256_and_si256(_mm256_cmpgt_epi8(rr1, b0), gz0);
+  __m256i z_1f_1 = _mm256_and_si256(_mm256_cmpgt_epi8(rr1, b1), gz1);
+  __m256i z_1f_2 = _mm256_and_si256(_mm256_cmpgt_epi8(rr1, b2), gz2);
+  __m256i z_1f_3 = _mm256_and_si256(_mm256_cmpgt_epi8(rr1, b3), gz3);
+  /* 0<=x<=1f || x==3a */
+  __m256i range0_0 = _mm256_or_si256(_mm256_cmpeq_epi8(rr2, b0), z_1f_0);
+  __m256i range0_1 = _mm256_or_si256(_mm256_cmpeq_epi8(rr2, b1), z_1f_1);
+  __m256i range0_2 = _mm256_or_si256(_mm256_cmpeq_epi8(rr2, b2), z_1f_2);
+  __m256i range0_3 = _mm256_or_si256(_mm256_cmpeq_epi8(rr2, b3), z_1f_3);
+  /* 0<=x<9 || 9<x<=1f || x==7f */
+  __m256i range1_0 = _mm256_or_si256(_mm256_cmpeq_epi8(rr4, b0), _mm256_andnot_si256(_mm256_cmpeq_epi8(b0, rr7), z_1f_0));
+  __m256i range1_1 = _mm256_or_si256(_mm256_cmpeq_epi8(rr4, b1), _mm256_andnot_si256(_mm256_cmpeq_epi8(b1, rr7), z_1f_1));
+  __m256i range1_2 = _mm256_or_si256(_mm256_cmpeq_epi8(rr4, b2), _mm256_andnot_si256(_mm256_cmpeq_epi8(b2, rr7), z_1f_2));
+  __m256i range1_3 = _mm256_or_si256(_mm256_cmpeq_epi8(rr4, b3), _mm256_andnot_si256(_mm256_cmpeq_epi8(b3, rr7), z_1f_3));
+  /* Generate bit masks */
+  unsigned int r0 = _mm256_movemask_epi8(range0_0);
+  unsigned int r1 = _mm256_movemask_epi8(range0_1);
+  /* Combine 32bit masks into a single 64bit mask */
+  *range0 = r0 ^ ((unsigned long)r1 << 32);
+
+  r0 = _mm256_movemask_epi8(range0_2);
+  r1 = _mm256_movemask_epi8(range0_3);
+  range0[1] = r0 ^ ((unsigned long)r1 << 32);
+
+  r0 = _mm256_movemask_epi8(range1_0);
+  r1 = _mm256_movemask_epi8(range1_1);
+
+  *range1 = r0 ^ ((unsigned long)r1 << 32);     
+  r0 = _mm256_movemask_epi8(range1_2);
+  r1 = _mm256_movemask_epi8(range1_3);
+
+  range1[1] = r0 ^ ((unsigned long)r1 << 32);
 }
 
-//__m256i m13 = _mm256_set1_epi8(13);
-__m256i m58 = _mm256_set1_epi8(58);   //  0x1313131313131313...
-                                      //  0x32333435363713   //  abcdef\r
-                                      //  32 bit number 0x40
+static const char* parse_headers_avx2(const char* buf, const char* buf_end, int* ret)
+{
+  // 128 bit token mask
+  unsigned long bm[8] = {0};
+  // Pointer to the start of the currently parsed block of 128 bytes
+  const char* prep_start = buf;
+  const char *p = buf;
 
-static void parse_mine( const char* buf ) {
-  unsigned long msk;
-  int i=0,t; // 32B index
-  int cnt = 0;
-  const char *sbuf = buf;
-  const char *obuf = buf;
-  do {
-    int shifted = 0;
-    const char *block_start = obuf+32*i;
-    __m256i b0 = _mm256_loadu_si256((const __m256i *) block_start);
-    msk = _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b0, m13), _mm256_cmpeq_epi8(b0, m58) ) );
-    printf(" m%d : 0x%08x\n", i, msk);
+  // Load the \r and : mask into rr13 and rr58
+  // Load 512 bytes at a time into the bit mask bm[8]
+  //   Load 32 bytes from the buffer into each register and compare against the mask registers
 
-    while (1) {
+  __m256i b0, b1, b2, b3;
+  const __m256i rr13    = _mm256_set1_epi8(13);
+  const __m256i rr58    = _mm256_set1_epi8(58);
+  int state = 0;
 
-      shifted = buf-block_start;
-      t = TZCNT((msk >> shifted)); 
-      printf("DELME shifted %d tzcnt %d\n", shifted, t);
-      if ( t < 32 ) {
-        if ( t == 0 ) break;
-        buf += t+2;
-        printf("L=%d str=%.*s\n", buf-sbuf-2, buf-sbuf-2, sbuf);
-        sbuf = buf;
+  // Process 512b per loop
+  while(1) {
+    int i = 0;
+    // Load 512b into bm[0-7]
+    while ( i < 8 ) {
+      b0 = _mm256_loadu_si256((__m256i *)(buf + (64*i)+ 0));
+      b1 = _mm256_loadu_si256((__m256i *)(buf + (64*i)+32));
+      b2 = _mm256_loadu_si256((__m256i *)(buf + (64*i)+64));
+      b3 = _mm256_loadu_si256((__m256i *)(buf + (64*i)+96));
+      bm[i++]   =       _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(rr13, b0),_mm256_cmpeq_epi8(rr58, b0)) ) |
+        ((unsigned long)_mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(rr13, b1),_mm256_cmpeq_epi8(rr58, b1)) ) << 32);
+      bm[i++] =         _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(rr13, b2),_mm256_cmpeq_epi8(rr58, b2)) ) |
+        ((unsigned long)_mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(rr13, b3),_mm256_cmpeq_epi8(rr58, b3)) ) << 32);
+    }
+
+    // Each bit in the mask is either a : or a \r
+    int off = 0;
+    int shft = 0;
+    int bmOff = 0;
+    unsigned long bitmap, tz;
+    int slen = 0; //DELME
+
+    do {
+      bitmap = bm[ bmOff ] >> shft;
+      tz = TZCNT(bitmap);
+      if ( tz < 64 ) { // tz is 64 if not found
+        p += tz;
+        printf( " fnd >%.*s<\n", p-buf, buf );  
+        if ( state == 0 ) { // :
+          state = 1;
+          p += 2; buf = p;
+        } else { // \r
+          state = 0;
+          p += 2; buf = p;
+          if ( *p == '\r' ) goto wedone;
+        }
       } else {
-        buf = block_start + 32;
-        break;
+        p += 64 - shft;
+        //printf("DELMEZ %.*s\n", 3, p) ;
       }
+      off = p-prep_start;
+      //printf("DELME off=%d\n",off);
+      shft = off&0x3F;
+      bmOff = off/64;
+    } while ( bmOff < 8 ) ;
+    prep_start += 512;
+    buf = prep_start;
+  }
 
+wedone:
+// Host: server\r\n
+// User-Agent: Mozilla/5.0 (X11; Linux x86_64) Gecko/20130501 Firefox/30.0 AppleWebKit/600.00 Chrome/30.0.0000.0 Trident/10.0 Safari/600.00\r\n
+  //printf("%.*s\n", p - prep_start, prep_start);
+  return buf;
+}
+
+
+static const char *my_get_eol128(const char *buf) {
+  //__m128i* pSrc1 = (__m128i *)string;         // init pointer to start of string
+  __m128i m0 = _mm_set1_epi8(13);              // vector of 16 `\0` characters
+
+  while (1)
+  {
+    __m128i v0 = _mm_loadu_si128((const __m128i *)buf);
+    __m128i v1 = _mm_cmpeq_epi8(v0, m0);    // compare all 16 chars
+    unsigned int vmask = _mm_movemask_epi8(v1);      // get 16 comparison result bits
+    if (vmask != 0) {
+        buf += TZCNT(vmask) + 2;
+        break;                              // we found a `\0`, break out of loop
     }
-    i+=1;
-    cnt += 1;
-  } while ( cnt < 20 && buf[0] != '\r' );
+    buf += 16; //pSrc1++;                                // next 16 characters...
+  }
+  return buf;
+}
 
+ //64bits  256bits  bytes 8 * 32 
+__m256i m13 = _mm256_set1_epi8(13);             
 
+static const char *my_get_eol(const char *buf) {
+
+  while (1)
+  {
+    __m256i v0 = _mm256_loadu_si256((const __m256i *)buf);
+    __m256i v1 = _mm256_cmpeq_epi8(v0, m13);     
+    unsigned long vmask = _mm256_movemask_epi8(v1);  
+    if (vmask != 0) {
+        buf += TZCNT(vmask) + 2;
+        break;                             
+    }
+    buf += 32; //pSrc1++;                 
+  }
+  return buf;
 }
 
-static void parse_mine3( const char* buf ) {
+//__m256i m13 = _mm256_set1_epi8(13);
+__m256i m58 = _mm256_set1_epi8(58);   //  0x1313131313131313...
+                                      //  0x32333435363713   //  abcdef\r
+                                      //  32 bit number 0x40
+static void parse_mine( const char* buf ) {
 
   unsigned long long msk[8];  // 1 bit for each of 512 bytes matching  : or \r
 
@@ -240,49 +746,325 @@ static void parse_mine3( const char* buf ) {
   msk[7] =            _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b6, m13), _mm256_cmpeq_epi8(b6, m58) ) )  ^
      ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b7, m13), _mm256_cmpeq_epi8(b7, m58) ) ) << 32);
 
-  const char *obuf = buf;
-  const char *sbuf = buf;
+  //for ( int i = 0; i < 8; i++ ) {
+    //printf(" m%d : 0x%016llx\n", i, msk[i]);
+  //}
+
+  //  uint64 msk[8]  -- 512 bits
+  //  Loop until crlfcrlf or 0xA
+  //    Name =  string(buf, tzcnt(msk[i]) )
+  //    msk >>= len,  buf += len // TODO increment i for each 64bits
+  //    Value = string(buf, tzcnt(msk[i]))
+
   // "Host: server\r\n"
   int i = 0;  // msk[i] 
-  int cnt = 0, t;
+  int l, dist, t;
   //int cnt = 0;
+  while (1) {
+
+    // msk[0] is only 64 bits 
+    l = 0;
+    while(1) {
+      t = TZCNT(msk[i]); // tz is 6,  'server\r\n'
+                             // msk[0] is all 0s and I get 64+2
+      if ( t == 64 ) {
+        l += t-dist; 
+        dist = 0;
+        i += 1;
+        if ( i > 7 ) break; 
+      } else {
+        l += t;
+        dist += t+2;
+        buf += l+2;  
+        if ( t+2 > 64 ) {
+          msk[i] = 0;
+          i += 1;
+          if ( i > 7 ) break; 
+          dist = t+2-64;
+          msk[i] >>= (t+2-64);
+        } else {
+          msk[i] >>= t+2;
+        }
+    
+        break;
+      }
+  
+    }
+   
+    if ( i > 7 ) break; 
+    if ( buf[0] == '\r' ) break;
+  }
+  
+  
+}
+static void parse_mine3( const char* buf ) {
+
+  unsigned long long msk[8];  // 1 bit for each of 512 bytes matching  : or \r
+
+  //__m256i b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,b10,b11,b12,b13,b14,b15;
+  __m256i b0,b1,b2,b3,b4,b5,b6,b7;
+
+  const char *obuf = buf;
+  const char *sbuf = buf;
+
+  int i;  // msk[i] 
+  int t;
+  unsigned int s = 0;
+  int name_or_value = 0;
+
+  const char *block_start = obuf;
+
+new512:
+  i = 0;
+  buf = obuf;
+
+  b0 = _mm256_loadu_si256((const __m256i *) (buf + 32*0)); // buf[0]
+  b1 = _mm256_loadu_si256((const __m256i *) (buf + 32*1)); // buf[32]
+  b2 = _mm256_loadu_si256((const __m256i *) (buf + 32*2)); // buf[64]
+  b3 = _mm256_loadu_si256((const __m256i *) (buf + 32*3)); // buf[96]
+  b4 = _mm256_loadu_si256((const __m256i *) (buf + 32*4)); // buf[128]
+  b5 = _mm256_loadu_si256((const __m256i *) (buf + 32*5));
+  b6 = _mm256_loadu_si256((const __m256i *) (buf + 32*6));
+  b7 = _mm256_loadu_si256((const __m256i *) (buf + 32*7)); // 256 bytes
+
+  msk[0] = (unsigned int)_mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b0, m13), _mm256_cmpeq_epi8(b0, m58) ) )  |
+        ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b1, m13), _mm256_cmpeq_epi8(b1, m58) ) ) << 32);
+  msk[1] = (unsigned int)_mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b2, m13), _mm256_cmpeq_epi8(b2, m58) ) )  |
+        ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b3, m13), _mm256_cmpeq_epi8(b3, m58) ) ) << 32);
+  msk[2] = (unsigned int)_mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b4, m13), _mm256_cmpeq_epi8(b4, m58) ) )  |
+        ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b5, m13), _mm256_cmpeq_epi8(b5, m58) ) ) << 32);
+  msk[3] = (unsigned int)_mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b6, m13), _mm256_cmpeq_epi8(b6, m58) ) )  |
+        ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b7, m13), _mm256_cmpeq_epi8(b7, m58) ) ) << 32);
+
+  b0 = _mm256_loadu_si256((const __m256i *) (buf + 32*8));
+  b1 = _mm256_loadu_si256((const __m256i *) (buf + 32*9));
+  b2 = _mm256_loadu_si256((const __m256i *) (buf + 32*10));
+  b3 = _mm256_loadu_si256((const __m256i *) (buf + 32*11));
+  b4 = _mm256_loadu_si256((const __m256i *) (buf + 32*12));
+  b5 = _mm256_loadu_si256((const __m256i *) (buf + 32*13));
+  b6 = _mm256_loadu_si256((const __m256i *) (buf + 32*14));
+  b7 = _mm256_loadu_si256((const __m256i *) (buf + 32*15));
+
+  msk[4] = (unsigned int)_mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b0, m13), _mm256_cmpeq_epi8(b0, m58) ) )  ^
+        ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b1, m13), _mm256_cmpeq_epi8(b1, m58) ) ) << 32);
+  msk[5] = (unsigned int)_mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b2, m13), _mm256_cmpeq_epi8(b2, m58) ) )  ^
+        ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b3, m13), _mm256_cmpeq_epi8(b3, m58) ) ) << 32);
+  msk[6] = (unsigned int)_mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b4, m13), _mm256_cmpeq_epi8(b4, m58) ) )  ^
+        ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b5, m13), _mm256_cmpeq_epi8(b5, m58) ) ) << 32);
+  msk[7] = (unsigned int)_mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b6, m13), _mm256_cmpeq_epi8(b6, m58) ) )  ^
+        ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b7, m13), _mm256_cmpeq_epi8(b7, m58) ) ) << 32);
+
+
+  // "Host: server\r\n"
   do {
 
-    const char *block_start = obuf+64*i;
+    block_start = obuf+64*i;
 
     while(1) {
-      t = TZCNT((msk[i]>>(buf-block_start))); 
-      printf("DELME shifted %d tzcnt %d msk %016llx\n", buf-block_start, t, (msk[i]>>(buf-block_start)));
+      s = buf-block_start;
+      t = TZCNT((msk[i]>>s));
+      //printf("DELME mski %016llx shift %d\n", msk[i], s );
+      //printf("DELME shft %016llx\n", msk[i]>>s );
       if ( t < 64 ) {
-        buf += t+2;  
-        printf("L=%d str=%.*s\n", buf-sbuf-2, buf-sbuf-2, sbuf);
+        buf += t;
+        if ( name_or_value == 1 ) {
+          if ( *buf == ':' ) { buf += 1; continue; } // : in value field
+          name_or_value = 0;
+        } else {
+          name_or_value = 1;
+        }
+        printf( " fnd >%.*s<\n", buf-sbuf, sbuf );  
+        buf += 2; if ( *buf == '\r' ) break; // \r\n\r\n marks the end
         sbuf = buf;
         if ( (buf-block_start)> 64 ) break; // TODO?
       } else {
         buf = block_start + 64;
         break;
       }
-  
+
     }
-  
-    cnt += 1; 
-    i+=1; if ( i > 7 ) break; // TODO
-  } while ( buf[0] != '\r' );
-  
-  
+
+    i+=1;
+    if ( buf[0] == '\r' ) goto done;
+  } while ( i < 8 && buf[0] != '\r' );
+
+  obuf += 512; 
+  goto new512;
+done: 
+  i += 1;
 }
 
+static void parse_mine2( const char* buf ) {
+  unsigned int msk;
+  int i=0,tz; // 32B index
+  int cnt = 0;
+  unsigned int shifted;
+  const char *sbuf = buf;
+  const char *obuf = buf;
+  int name_or_value = 0;
 
-int main() { 
+  do {
+    const char *block_start = obuf+32*i;
+    __m256i b0 = _mm256_loadu_si256((const __m256i *) block_start);
+    msk = _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b0, m13), _mm256_cmpeq_epi8(b0, m58) ) );
+    while (1) {
+
+      shifted = buf-block_start;
+      if ( shifted >= 32 ) break;
+      tz = TZCNT((msk >> shifted));
+      if ( tz < 32 ) {
+        buf += tz;
+        if ( name_or_value == 1 ) {
+          if ( *buf == ':' ) { buf += 1; continue; } // : in value field
+          name_or_value = 0;
+        } else {
+          name_or_value = 1;
+        }
+        printf( " fnd >%.*s<\n", buf-sbuf, sbuf );  
+        buf += 2; if ( *buf == '\r' ) break; // \r\n\r\n marks the end
+        sbuf = buf;
+      } else {
+        buf += 32 - shifted;
+        break;
+      }
+
+    }
+    i+=1;
+  } while ( *buf != '\r' );
+}
+
+static void parse_sse4( const char* buf ) {
+  int ret = 0;
+  while ( ret == 0 && buf != NULL && buf[0] != '\r' ) {
+    buf = get_token_to_eol( buf, buf+512, &ret);
+  }
+}
+static void parse_mysse4( const char* buf ) {
+  int ret = 0;
+  while ( ret == 0 && buf != NULL && buf[0] != '\r' ) {
+    buf = my_get_eol( buf );
+  }
+}
+/*
+static char buf[8096] = "Host: server\r\n"
+"User-Agent: Mozilla/5.0 (X11; Linux x86_64) Gecko/20130501 Firefox/30.0 AppleWebKit/600.00 Chrome/30.0.0000.0 Trident/10.0 Safari/600.00\r\n"
+"Cookie: uid=12345678901234567890; __utma=1.1234567890.1234567890.1234567890.1234567890.12; wd=2560x1600\r\n"
+"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,* /*;q=0.8\r\n"
+"Accept-Language: en-US,en;q=0.5\r\n"
+"Connection: keep-alive\r\n\r\n";
+static char buf2[8096] = "Host: localhost:8080\r\nUser-Agent: python-requests/2.31.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: * /*\r\nConnection: keep-alive\r\nCookie: foo=b=ar\r\nContent-Length: 0\r\n\r\n";
+
+static void BM_SlowParse(benchmark::State& state) {
+  // Perform setup here
+  std::string text = "Host: server\n"
+"User-Agent: Mozilla/5.0 (X11; Linux x86_64) Gecko/20130501 Firefox/30.0 AppleWebKit/600.00 Chrome/30.0.0000.0 Trident/10.0 Safari/600.00\n"
+"Cookie: uid=12345678901234567890; __utma=1.1234567890.1234567890.1234567890.1234567890.12; wd=2560x1600\n"
+"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,* /*;q=0.8\n"
+"Accept-Language: en-US,en;q=0.5\n"
+"Connection: keep-alive\n";
+
+  for (auto _ : state) {
+    // This code gets timed
+    slow_hparse(text);
+  }
+}
+static void BM_sse4_get_eol(benchmark::State& state) {
+  // Perform setup here
+  for (auto _ : state) {
+    // This code gets timed
+    parse_sse4(buf);
+  }
+}
+static void BM_my_get_eol(benchmark::State& state) {
+  // Perform setup here
+  for (auto _ : state) {
+    // This code gets timed
+    parse_mysse4(buf);
+  }
+}
+
+static void BM_my_header_parse(benchmark::State& state) {
+  // Perform setup here
+  for (auto _ : state) {
+    // This code gets timed
+    parse_mine(buf);
+  }
+}
+static void BM_my2_header_parse(benchmark::State& state) {
+  // Perform setup here
+  for (auto _ : state) {
+    // This code gets timed
+    parse_mine2(buf);
+  }
+}
+static void BM_my3_header_parse(benchmark::State& state) {
+  // Perform setup here
+  for (auto _ : state) {
+    // This code gets timed
+    parse_mine3(buf);
+  }
+}
+
+
+static void BM_old_header_parse(benchmark::State& state) {
+  // Perform setup here
+  int ret = 0;
+  for (auto _ : state) {
+    // This code gets timed
+    parse_headers(buf,buf+512,&ret);
+  }
+}
+
+static void BM_avx2_header_parse(benchmark::State& state) {
+  // Perform setup here
+  int ret = 0;
+  for (auto _ : state) {
+    // This code gets timed
+    parse_headers_avx2(buf,buf+512,&ret);
+  }
+}
+
+
+
+
+//BENCHMARK(BM_SlowParse);
+BENCHMARK(BM_sse4_get_eol);
+BENCHMARK(BM_my_get_eol);
+BENCHMARK(BM_my3_header_parse);
+BENCHMARK(BM_my2_header_parse);
+//BENCHMARK(BM_my_header_parse);
+BENCHMARK(BM_old_header_parse);
+BENCHMARK(BM_avx2_header_parse);
+BENCHMARK_MAIN();
+
+*/
+
+int main() {
   char buf[8096] = "Host: server\r\n"
 "User-Agent: Mozilla/5.0 (X11; Linux x86_64) Gecko/20130501 Firefox/30.0 AppleWebKit/600.00 Chrome/30.0.0000.0 Trident/10.0 Safari/600.00\r\n"
 "Cookie: uid=12345678901234567890; __utma=1.1234567890.1234567890.1234567890.1234567890.12; wd=2560x1600\r\n"
-"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n"
+"Cookie: uid=12345678901234567890; __utma=1.1234567890.1234567890.1234567890.1234567890.12; wd=2560x1600\r\n"
+"Cookie: uid=12345678901234567890; __utma=1.1234567890.1234567890.1234567890.1234567890.12; wd=2560x1600\r\n"
+"Cookie: uid=12345678901234567890; __utma=1.1234567890.1234567890.1234567890.1234567890.12; wd=2560x1600\r\n"
+"Cookie: uid=12345678901234567890; __utma=1.1234567890.1234567890.1234567890.1234567890.12; wd=2560x1600\r\n"
+"Cookie: uid=12345678901234567890; __utma=1.1234567890.1234567890.1234567890.1234567890.12; wd=2560x1600\r\n"
+"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,* /*;q=0.8\r\n"
 "Accept-Language: en-US,en;q=0.5\r\n"
-"Connection: keep-alive\r\n\r\nzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz           "
-"                                                                         "
-"                                                                         ";
+"Connection: keep-alive\r\n\r\n";
+  //strcpy(buf,"Host: localhost:8080\r\nUser-Agent: curl/7.68.0\r\nAccept: * /*\r\n\r\n");
+  //strcpy(buf,"Host: localhost:8080\r\nUser-Agent: python-requests/2.31.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: * /*\r\nConnection: keep-alive\r\nCookie: foo=b=ar\r\nContent-Length: 0\r\n\r\n");
+
 
-  parse_mine3(buf);
+  int ret = 0;
+  parse_headers_avx2(buf,buf+512,&ret);
+  //parse_headers(buf,buf+2048,&ret);
+  //parse_mine3(buf);
+  printf(" ret=%d\n",ret);
+
+  //unsigned long long l = 0x80008020ull;
+  //unsigned int s = 7;
+  //printf(" WTF %08x\n", l >> s );
 
 }
+
+
diff --git a/gbench/tst b/gbench/tst
index 332d9ac..df15d88 100755
Binary files a/gbench/tst and b/gbench/tst differ
diff --git a/src/mrhttp/internals/mrhttpparser.c b/src/mrhttp/internals/mrhttpparser.c
index 26ea77c..f10f3d2 100644
--- a/src/mrhttp/internals/mrhttpparser.c
+++ b/src/mrhttp/internals/mrhttpparser.c
@@ -1,3 +1,4 @@
+
 /*
  * Copyright (c) 2013-2018 Mark Reed
  *
@@ -254,7 +255,7 @@ static const char *is_complete(const char *buf, const char *buf_end, size_t last
     } while (0)
 
 
-#ifdef __DELMEAVX2__
+#ifdef __AVX2__
 static unsigned long TZCNT(unsigned long long in) {
   unsigned long res;
   asm("tzcnt %1, %0\n\t" : "=r"(res) : "r"(in));
@@ -264,96 +265,164 @@ static const char *parse_headers_avx2(const char *buf, const char *buf_end, stru
                                  size_t max_headers, int *ret, struct mr_request *mrr)
 {
   unsigned long long msk[8];  // 1 bit for each of 512 bytes matching  : or \r
+
+  //__m256i b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,b10,b11,b12,b13,b14,b15;
+  __m256i b0,b1,b2,b3,b4,b5,b6,b7;
+
+  __m256i m13 = _mm256_set1_epi8(13); // \r
+  __m256i m58 = _mm256_set1_epi8(58); // :
+
+  const char *obuf = buf;
   const char *sbuf = buf;
-  int cnt = 0;
+
+  int i;  // msk[i] 
+  int t;
+  unsigned int s = 0;
   int name_or_value = 0;
 
-__m256i m13 = _mm256_set1_epi8(13);
-__m256i m58 = _mm256_set1_epi8(58);
+  const char *block_start = obuf;
+
+av_new512:
+  i = 0;
+  buf = obuf;
+
+  b0 = _mm256_loadu_si256((const __m256i *) (buf + 32*0)); // buf[0]
+  b1 = _mm256_loadu_si256((const __m256i *) (buf + 32*1)); // buf[32]
+  b2 = _mm256_loadu_si256((const __m256i *) (buf + 32*2)); // buf[64]
+  b3 = _mm256_loadu_si256((const __m256i *) (buf + 32*3)); // buf[96]
+  b4 = _mm256_loadu_si256((const __m256i *) (buf + 32*4)); // buf[128]
+  b5 = _mm256_loadu_si256((const __m256i *) (buf + 32*5));
+  b6 = _mm256_loadu_si256((const __m256i *) (buf + 32*6));
+  b7 = _mm256_loadu_si256((const __m256i *) (buf + 32*7)); // 256 bytes
+
+  msk[0] = (unsigned int)_mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b0, m13), _mm256_cmpeq_epi8(b0, m58) ) )  |
+        ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b1, m13), _mm256_cmpeq_epi8(b1, m58) ) ) << 32);
+  msk[1] = (unsigned int)_mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b2, m13), _mm256_cmpeq_epi8(b2, m58) ) )  |
+        ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b3, m13), _mm256_cmpeq_epi8(b3, m58) ) ) << 32);
+  msk[2] = (unsigned int)_mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b4, m13), _mm256_cmpeq_epi8(b4, m58) ) )  |
+        ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b5, m13), _mm256_cmpeq_epi8(b5, m58) ) ) << 32);
+  msk[3] = (unsigned int)_mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b6, m13), _mm256_cmpeq_epi8(b6, m58) ) )  |
+        ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b7, m13), _mm256_cmpeq_epi8(b7, m58) ) ) << 32);
+
+  b0 = _mm256_loadu_si256((const __m256i *) (buf + 32*8));
+  b1 = _mm256_loadu_si256((const __m256i *) (buf + 32*9));
+  b2 = _mm256_loadu_si256((const __m256i *) (buf + 32*10));
+  b3 = _mm256_loadu_si256((const __m256i *) (buf + 32*11));
+  b4 = _mm256_loadu_si256((const __m256i *) (buf + 32*12));
+  b5 = _mm256_loadu_si256((const __m256i *) (buf + 32*13));
+  b6 = _mm256_loadu_si256((const __m256i *) (buf + 32*14));
+  b7 = _mm256_loadu_si256((const __m256i *) (buf + 32*15));
+
+  msk[4] = (unsigned int)_mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b0, m13), _mm256_cmpeq_epi8(b0, m58) ) )  ^
+        ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b1, m13), _mm256_cmpeq_epi8(b1, m58) ) ) << 32);
+  msk[5] = (unsigned int)_mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b2, m13), _mm256_cmpeq_epi8(b2, m58) ) )  ^
+        ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b3, m13), _mm256_cmpeq_epi8(b3, m58) ) ) << 32);
+  msk[6] = (unsigned int)_mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b4, m13), _mm256_cmpeq_epi8(b4, m58) ) )  ^
+        ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b5, m13), _mm256_cmpeq_epi8(b5, m58) ) ) << 32);
+  msk[7] = (unsigned int)_mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b6, m13), _mm256_cmpeq_epi8(b6, m58) ) )  ^
+        ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b7, m13), _mm256_cmpeq_epi8(b7, m58) ) ) << 32);
+
+
+  // "Host: server\r\n"
+  do {
+
+    block_start = obuf+64*i;
+
+    while(1) {
+      s = buf-block_start;
+      t = TZCNT((msk[i]>>s));
+      if ( t < 64 ) {
+        buf += t;
+        if ( name_or_value == 1 ) {
+          if ( *buf == ':' ) { buf += 1; continue; } // : in value field
+          headers[*num_headers].value = sbuf;
+          headers[*num_headers].value_len = buf-sbuf;
+          ++*num_headers;
+          if (*num_headers >= max_headers) { printf("DELME hdr too many\n"); *ret = -1; return NULL; }
+          name_or_value = 0;
+          buf += 2; if ( *buf == '\r' ) { goto av_done; } // \r\n\r\n marks the end
+        } else {
+          headers[*num_headers].name = sbuf;
+          headers[*num_headers].name_len = buf-sbuf;
+          name_or_value = 1;
+          buf += 2;
+        }
+        sbuf = buf;
+        if ( (buf-block_start)> 64 ) break; // TODO?
+      } else {
+        buf = block_start + 64;
+        break;
+      }
+
+    }
+
+    i+=1;
+    if ( buf[0] == '\r' ) goto av_done;
+  } while ( i < 8 && buf[0] != '\r' );
+
+  obuf += 512;
+  goto av_new512;
+av_done:
+  buf += 2;
+  *ret = 0;
+  return buf;
+}
+
+
+
+static const char *parse_headers_avx2_old(const char *buf, const char *buf_end, struct mr_header *headers, size_t *num_headers,
+                                 size_t max_headers, int *ret, struct mr_request *mrr)
+{
+  unsigned long msk;
+  int i=0,tz; // 32B index
+  int shifted;
+  const char *sbuf = buf;
+  const char *obuf = buf;
+  int name_or_value = 0;
+
+  __m256i m13 = _mm256_set1_epi8(13); // \r
+  __m256i m58 = _mm256_set1_epi8(58); // :
 
-  // Parse in 512B chunks with avx2 instructions
   do {
+    const char *block_start = obuf+32*i; i += 1;
+    if ( block_start > buf_end ) { printf("DELME hdr too big\n"); *ret = -1; return NULL; }
+    __m256i b0 = _mm256_loadu_si256((const __m256i *) block_start);
+    msk = _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b0, m13), _mm256_cmpeq_epi8(b0, m58) ) );
 
-    const char *block_start = buf; // Start of each 64B block
-    __m256i b0,b1,b2,b3,b4,b5,b6,b7;
-  
-    b0 = _mm256_loadu_si256((const __m256i *) (buf + 32*0)); // buf[0]
-    b1 = _mm256_loadu_si256((const __m256i *) (buf + 32*1)); // buf[32]
-    b2 = _mm256_loadu_si256((const __m256i *) (buf + 32*2)); // buf[64]
-    b3 = _mm256_loadu_si256((const __m256i *) (buf + 32*3)); // buf[96]
-    b4 = _mm256_loadu_si256((const __m256i *) (buf + 32*4)); // buf[128]
-    b5 = _mm256_loadu_si256((const __m256i *) (buf + 32*5));
-    b6 = _mm256_loadu_si256((const __m256i *) (buf + 32*6));
-    b7 = _mm256_loadu_si256((const __m256i *) (buf + 32*7)); // 256 bytes
-  
-    msk[0] =            _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b0, m13), _mm256_cmpeq_epi8(b0, m58) ) )  |
-       ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b1, m13), _mm256_cmpeq_epi8(b1, m58) ) ) << 32);
-    msk[1] =            _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b2, m13), _mm256_cmpeq_epi8(b2, m58) ) )  |
-       ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b3, m13), _mm256_cmpeq_epi8(b3, m58) ) ) << 32);
-    msk[2] =            _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b4, m13), _mm256_cmpeq_epi8(b4, m58) ) )  |
-       ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b5, m13), _mm256_cmpeq_epi8(b5, m58) ) ) << 32);
-    msk[3] =            _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b6, m13), _mm256_cmpeq_epi8(b6, m58) ) )  |
-       ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b7, m13), _mm256_cmpeq_epi8(b7, m58) ) ) << 32);
-  
-    b0 = _mm256_loadu_si256((const __m256i *) (buf + 32*8));
-    b1 = _mm256_loadu_si256((const __m256i *) (buf + 32*9));
-    b2 = _mm256_loadu_si256((const __m256i *) (buf + 32*10));
-    b3 = _mm256_loadu_si256((const __m256i *) (buf + 32*11));
-    b4 = _mm256_loadu_si256((const __m256i *) (buf + 32*12));
-    b5 = _mm256_loadu_si256((const __m256i *) (buf + 32*13));
-    b6 = _mm256_loadu_si256((const __m256i *) (buf + 32*14));
-    b7 = _mm256_loadu_si256((const __m256i *) (buf + 32*15));
-  
-    msk[4] =            _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b0, m13), _mm256_cmpeq_epi8(b0, m58) ) )  |
-       ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b1, m13), _mm256_cmpeq_epi8(b1, m58) ) ) << 32);
-    msk[5] =            _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b2, m13), _mm256_cmpeq_epi8(b2, m58) ) )  |
-       ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b3, m13), _mm256_cmpeq_epi8(b3, m58) ) ) << 32);
-    msk[6] =            _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b4, m13), _mm256_cmpeq_epi8(b4, m58) ) )  |
-       ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b5, m13), _mm256_cmpeq_epi8(b5, m58) ) ) << 32);
-    msk[7] =            _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b6, m13), _mm256_cmpeq_epi8(b6, m58) ) )  |
-       ((unsigned long) _mm256_movemask_epi8( _mm256_or_si256(_mm256_cmpeq_epi8(b7, m13), _mm256_cmpeq_epi8(b7, m58) ) ) << 32);
-  
+    while (1) {
+    
     // "Host: server\r\n"
     // Headers end on \r\n\r\n
-    int i = 0;  // msk[i]
-    int t;
-    do {
-  
-      while(1) {
-        t = TZCNT((msk[i]>>(buf-block_start)));
-        if ( t < 64 ) {
-          buf += t;
-          //printf(">%.*s<\n", 16, sbuf);
-          if ( name_or_value == 1 ) {
-            if ( buf[0] != '\r' ) { buf++; continue; } // Handle : in the value
-            headers[*num_headers].value = sbuf;
-            headers[*num_headers].value_len = buf-sbuf;
-            ++*num_headers;
-            name_or_value = 0;
-          } else {
-            headers[*num_headers].name = sbuf;
-            headers[*num_headers].name_len = buf-sbuf;
-            name_or_value = 1;
-          }
-          buf += 2; if ( buf[0] == '\r' ) { return buf+2; } // End of headers
-          sbuf = buf;
-          if ( (buf-block_start)> 64 ) break; 
+      shifted = buf-block_start;
+      tz = TZCNT((msk >> shifted));
+      if ( tz < 32 ) {
+        buf += tz;
+
+        if ( name_or_value == 1 ) {
+          if ( *buf == ':' ) { buf += 1; continue; } // : in value field
+          headers[*num_headers].value = sbuf;
+          headers[*num_headers].value_len = buf-sbuf;
+          ++*num_headers;
+          if (*num_headers >= max_headers) { printf("DELME hdr too many\n"); *ret = -1; return NULL; }
+          name_or_value = 0;
+          buf += 2; if ( *buf == '\r' ) { break; } // \r\n\r\n marks the end
         } else {
-          buf = block_start + 64;
-          break;
+          headers[*num_headers].name = sbuf;
+          headers[*num_headers].name_len = buf-sbuf;
+          name_or_value = 1;
+          buf += 2;
         }
-  
+        sbuf = buf;
+      } else {
+        buf += 32 - shifted;
+        break;
       }
-  
-      block_start += 64;
-      //if ( buf[0] == '\r' ) { return buf+2; } // End of headers
-    } while ( ++i < 8 );
-    
-  } while (++cnt < 32);
 
-  // If we get here the header was too large so abort. TODO abort with appropriate error code
-  *ret = -1;
-  return NULL;
+    }
+  } while ( *buf != '\r' );
+  buf += 2;
+  *ret = 0;
+  return buf;
 }
 #endif
 
@@ -684,8 +753,7 @@ static const char *parse_request(const char *buf, const char *buf_end, const cha
         *ret = -2;
         return NULL;
     }
-
-#ifdef __DELMEAVX2__
+#ifdef __AVX2__
     return parse_headers_avx2(buf, buf_end, headers, num_headers, max_headers, ret, mrr);
 #else
     return parse_headers(buf, buf_end, headers, num_headers, max_headers, ret, mrr);
@@ -715,7 +783,6 @@ int mr_parse_request(const char *buf_start, size_t len, const char **method, siz
     *minor_version = -1;
     *num_headers = 0;
 
-
     /* if last_len != 0, check if the request is complete (a fast countermeasure
        againt slowloris */
     if (last_len != 0 && is_complete(buf, buf_end, last_len, &r) == NULL) {
diff --git a/src/mrhttp/internals/parser.c b/src/mrhttp/internals/parser.c
index f840630..afff786 100644
--- a/src/mrhttp/internals/parser.c
+++ b/src/mrhttp/internals/parser.c
@@ -1,4 +1,5 @@
 
+
 #include <strings.h>
 #include <sys/param.h>
 #include <immintrin.h>
@@ -115,7 +116,11 @@ parse_headers:
       header < request->headers + request->num_headers;
       header++) {
 
-
+    if(header_name_equal("Content-Type")) {
+      if ( header->value[0] == 'a' && header->value[13] == 'r' ) { //"application/mrpacker"
+        request->hreq.flags = 2;
+      } 
+    }
     if(header_name_equal("Content-Length")) {
       char * endptr = (char *)header->value + header->value_len;
       self->body_length = strtol(header->value, &endptr, 10);
diff --git a/tests/test_requests.py b/tests/test_requests.py
index 0a9cc1c..36d13a6 100644
--- a/tests/test_requests.py
+++ b/tests/test_requests.py
@@ -107,7 +107,7 @@ def test_one():
   eq(r.text, '{"key": "value"}')
   r = requests.post('http://localhost:8080/json', json={"name": "value"})
   eq(r.text, 'value')
-  headers = {'Content-type': 'application/mrpacker'}
+  headers = {'Content-Type': 'application/mrpacker'}
   o = { "typ":"post", "s":2, "t": 'Blonde: "What does IDK stand for?"', "l":"localhost/sub/3", "txt": 'Brunette: "I don’t know."\nBlonde: "OMG, nobody does!"' }
   r = requests.post('http://localhost:8080/mrp', data=mrpacker.pack(o), headers=headers)
   if eq(r.text, 'post') != 0:
diff --git a/tests/tst.py b/tests/tst.py
index ab51ce4..fd4d8a0 100644
--- a/tests/tst.py
+++ b/tests/tst.py
@@ -4,6 +4,24 @@ import requests
 from common import eq,contains,stop_server
 import mrpacker
 
+if 1:
+
+  headers = {'Content-Type': 'application/mrpacker'}
+  o = { "typ":"post", "s":2, "t": 'Blonde: "What does IDK stand for?"', "l":"localhost/sub/3", "txt": 'Brunette: "I don’t know."\nBlonde: "OMG, nobody does!"' }
+  r = requests.post('http://localhost:8080/mrp', data=mrpacker.pack(o), headers=headers)
+  if eq(r.text, 'post') != 0:
+    print( r.raw.headers )
+    print( "text is ", r.text )
+
+  cookie = {'foo': 'bar','baz':'3'}
+  r = requests.post('http://localhost:8080/printCookies', cookies=cookie)
+  eq(r.text, "{'baz': '3', 'foo': 'bar'}")
+
+  cookie = {'foo': 'b=ar'}
+  r = requests.post('http://localhost:8080/printCookies', cookies=cookie)
+  eq(r.text, "{'foo': 'b=ar'}")
+  
+
 if 0:
   data = {}
   s = "lo(ng"*10000
@@ -11,7 +29,7 @@ if 0:
   r = requests.post('http://localhost:8080/form',data)
   eq(r.status_code, 200)
 
-if 1:
+if 0:
   headers = {'Content-type': 'application/mrpacker'}
   o = { "typ":"post", "s":4, "t": 'Blonde: "What does IDK stand for?"', "l":"", "txt": 'Brunette: "I don’t know."\nBlonde: "OMG, nobody does!"' }
   o = { "typ": "post", "l": "", "t": "Blonde: \"What does IDK stand for?\"", "txt": "Brunette: \"I don’t know.\"\n\nBlonde: \"OMG, nobody does!\"", "s": 3}