Puppy Linux Discussion Forum Forum Index Puppy Linux Discussion Forum
Puppy HOME page : puppylinux.com
"THE" alternative forum : puppylinux.info
 
 FAQFAQ   SearchSearch   MemberlistMemberlist   UsergroupsUsergroups   RegisterRegister 
 ProfileProfile   Log in to check your private messagesLog in to check your private messages   Log inLog in 

The time now is Wed 17 Dec 2014, 19:33
All times are UTC - 4
 Forum index » Off-Topic Area » Programming
Attempting to parse a tab-based database in C
Post_new_topic   Reply_to_topic View_previous_topic :: View_next_topic
Page 1 of 1 Posts_count  
Author Message
Ibidem

Joined: 25 May 2010
Posts: 527
Location: State of Jefferson

PostPosted: Fri 26 Jul 2013, 02:14    Post_subject:  Attempting to parse a tab-based database in C  

I'm trying to parse a tab-based database in this format:
Code:
001a  Some long text here
\t03af  Subfield of 001a
\t040d  And another subfield
00c4  New field

Every entry is separated by a newline, and consists of a leading 16-bit hex value in plain text, two spaces, and an arbitrarily long string (the longest line is 108 chars, though). Subentries begin with a tab (C \t) preceding the hex value.
Entries are sorted.
I'm trying to get that string out, given the hex value.
(If you're curious, this is the pci-id database, which I'd like to parse so toybox lspci can output text. But this parser would also apply to the usb id database. Of course, this means any code I use must be PD/CC0/other permissive license that does not stipulate preservation of the copyright notice; I intend to indicate the source, but Rob's policy is to not require a notice if you copy from toybox.)

What I have is buggy and segfaults at line 38 (if curr[match] != id[match]), but I don't know why:
Code:
/* Test of reading the PCI-id database
 * Written in 2013 and released under CC0, Isaac Dunham
 */
#define _XOPEN_SOURCE 600
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>


/*
 * Scan the database open on `fd`, one `bufsiz`-sized chunk at a time, for a
 * line whose first four characters equal the first four characters of `id`.
 *
 *   fd       - descriptor the chunks are read() from
 *   buf      - scratch buffer that receives each chunk
 *   bufsiz   - number of bytes requested from read() per chunk
 *   id       - four-character ID to look for at the start of a line
 *   string   - must be NULL on entry: a non-NULL value skips the scan loop
 *   subfield - when non-NULL, its first byte is compared with the first byte
 *              of each candidate line
 *
 * NOTE(review): `string` is passed by value, so nothing assigned to it here is
 * visible to the caller, and no path returns a usable description.
 * NOTE(review): the function is declared int but falls off the end without a
 * return statement once the outer loop terminates.
 */
int get_db_entry(int fd, char *buf, ssize_t bufsiz, char *id, char *string, char *subfield)
{
  int overboard;
  off_t offset = 0;
  int bytes, match;
  char *curr, *eol;
  errno = 0;
  /* NOTE(review): the only exits are errno becoming nonzero or the return
   * below. read() reports end-of-file by returning 0 without setting errno,
   * so reaching EOF does not end this loop by itself.
   */
  while (!errno)
  {
    overboard = 0;
    /* NOTE(review): the result is not checked for 0 (EOF) or -1 (error). */
    bytes = read(fd, buf, bufsiz);
    /* Both cursors restart at the beginning of the freshly read chunk. */
    curr = buf; eol = buf;
    if (string || overboard)
      printf("Something's wrong! string is %s, overboard is %d\n",string, overboard);
    /* Walk the chunk line by line until something is assigned to `string`
     * or the cursor gets too close to the end of the chunk.
     */
    while (!string && !overboard) {
   /* Yes, this is where this should be.
    * If we aren't in a "subfield" any longer,
    * we cannot match the device within its class.
    */
      /* NOTE(review): curr[0]++ is a post-increment of the byte stored in
       * buf, so this comparison also modifies the buffer contents.
       */
      if (subfield && curr[0]++ !=subfield[0])
        string=id;
      /* Compare up to four characters at the start of the current line. */
      for (match=0; match<4;) {
        if (curr[match] != id[match]) {
          /* Mismatch: look for the newline that ends this line.
           * NOTE(review): strstr() returns NULL when no newline is found;
           * eol then stays NULL, and a later pass through this branch would
           * hand that NULL back to strstr().
           */
          eol=strstr(eol, "\n");
          break;
        } else {
          match++; printf("Match: %d\n", match);
        }
      }
      /* NOTE(review): %jd expects intmax_t, but these are pointer differences. */
      printf("string loop: match: %d curr: %jd eol: %jd\n", match, curr-buf, eol-buf);
      if (match == 4) {
        /* NOTE(review): curr[match + 3] is a single char, which is stored in
         * a char* here and then returned from an int function. Presumably
         * the address of the text after the ID was intended - confirm.
         */
        string = curr[match + 3];
        return string;
      } else if ( eol && eol != buf) {
        /* Step past the newline to the start of the next line. */
        curr=++eol;
      }
      /* Fewer than 8 bytes left in the chunk (or no progress at all): record
       * how many bytes remain unprocessed so they can be read again.
       */
      if (((curr - buf) > (bytes - 8)) || (eol == buf)) {
        overboard = (buf + bytes) - curr;
      }
    } //answer or overboard
    /* Seek backwards by the unprocessed remainder so that the next read()
     * starts at the line that was cut off.
     */
    overboard = 0 - overboard;
    offset = lseek(fd, overboard, SEEK_CUR);
    /* NOTE(review): %d is used for pointer differences here as well. */
    printf("Read loop: off: %jd curr_off: %d eol_off: %d overboard: %d\n", (intmax_t)offset, curr-buf, eol-buf, overboard );
   
  } //errno
}

/*
 * Test driver: run get_db_entry() over the system pci.ids file, looking for
 * one hard-coded ID.
 *
 * Returns 0 after the lookup has run, 1 if the database cannot be opened or
 * the read buffer cannot be allocated.
 */
int main(int argc, char *argv[])
{
  const char *path = "/usr/share/misc/pci.ids";
  char *class = "168c";
  char *clnam = NULL;
  char *buff;
  int fd;

  (void)argc;
  (void)argv;

  /* Without this check a failed open() hands -1 to read() inside the parser. */
  fd = open(path, O_RDONLY);
  if (fd == -1) {
    perror(path);
    return 1;
  }

  /* 4096 bytes are read per chunk; the buffer is allocated 8 bytes larger
   * and zero-filled, presumably so that each chunk stays NUL-terminated for
   * the string functions used by the parser.
   */
  buff = calloc(1, 4104);
  if (!buff) {
    perror("calloc");
    close(fd);
    return 1;
  }

  get_db_entry(fd, buff, 4096, class, clnam, 0);

  free(buff);
  close(fd);
  return 0;
}

Any ideas?
Back to top
View user's profile Send_private_message 
vovchik


Joined: 23 Oct 2006
Posts: 1287
Location: Ukraine

PostPosted: Fri 26 Jul 2013, 20:16    Post_subject:  

Dear Ibidem,

I don't want to spoil all your fun with C, but it is much simpler with BaCon, and the resulting binary is only about 7600 bytes (after UPX compression). And you can avoid a lot of unnecessary typing, too... Smile

Code:

' *****************************************************
' PROGRAM:   pcidb.bac
' PURPOSE:   parse pci database
' AUTHOR:      vovchik (Puppy Linux forum)
' MODDED:      
' DEPENDS:   gcc, bacon
' PLATFORM:   Puppy Linux (actually, any *nix)
' DATE:      27-07-2013
' NOTES:      Use following compile line and UPX binary afterwards (~7600 bytes):
' bacon -o -s -o -Os -o -fdata-sections -o -ffunction-sections -o -Wl,--gc-sections pcidb
' *****************************************************


' *********************
' SUBS & FUNCTIONS
' *********************

' ------------------
SUB PARSE_DATA(STRING myfile$)
' ------------------
' Read the text file named by myfile$ line by line and print every entry.
' A line whose first character has a code above 32 (i.e. it does not start
' with a tab or a space) is printed as a manufacturer entry; any other line
' is printed as a device entry.
' NOTE(review): a line starting with "#" also has a first character above 32,
' so such lines would be printed as manufacturer entries - confirm whether
' the input file contains them.
   LOCAL txt$ TYPE STRING
   OPEN myfile$ FOR READING AS myfile
   WHILE NOT(ENDFILE(myfile)) DO
      READLN txt$ FROM myfile
      ' Ignore lines whose chopped length is 4 characters or fewer.
      IF LEN(CHOP$(txt$)) > 4 THEN
         IF ASC(LEFT$(txt$, 1)) > 32 THEN
            PRINT "------"
            ' The ID is the first four characters; the name is taken from
            ' the sixth character onwards and then chopped.
            PRINT "Manufacturer ID: ", LEFT$(txt$, 4)
            PRINT "Manufacturer name: ", CHOP$(MID$(txt$, 6))
         ELSE
            ' Chop the line first, then split it at the first occurrence of
            ' two consecutive spaces: ID before it, description after it.
            txt$ = CHOP$(txt$)
            PRINT TAB$(1), "Device ID:   ", LEFT$(txt$, INSTR(txt$, "  ") - 1)
            PRINT TAB$(1), "Device desc: ", MID$(txt$, INSTR(txt$, "  ") + 2)
         END IF
      END IF
   WEND
   CLOSE FILE myfile
   PRINT "------"
   PRINT "All done."
END SUB

' *********************
' END SUBS & FUNCTIONS
' *********************
 

' *********************
' MAIN
' *********************

' Program entry point: print every entry found in the pci.ids file.
PARSE_DATA("/usr/share/misc/pci.ids")

' *********************
' END MAIN
' *********************


With kind regards,
vovchik
Back to top
View user's profile Send_private_message 
jamesbond

Joined: 26 Feb 2007
Posts: 2232
Location: The Blue Marble

PostPosted: Sat 27 Jul 2013, 01:45    Post_subject:  

Code:

/******************
 * Author: jamesbond, July 2013
 * Released under CC0
 * ****************/
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>

#define HANDLE_ERROR(msg, label) { set_error(error, msg); goto label; }
// Store a heap-allocated copy of an error description in *error.
//   error - where to store the copy; nothing happens when it is NULL
//   msg   - text to copy; when NULL, strerror(errno) is copied instead
// The copy comes from strdup(), so the caller releases it with free().
void set_error (char **error, char *msg) {
   const char *text;

   if (!error)
      return;

   text = msg ? msg : strerror(errno);
   *error = strdup(text);
}

#define SEPARATOR_LEN  2 //two spaces
#define DEVICE_PREFIX_LEN 1 //one tab for the device

// return: 0 - failed, 1 - success
int lookup_vendor_device(char *filename, char *vendor_id, char *device_id,
                         char **vendor_text, char **device_text, char **error)
{
   struct stat st;
   int fd;
   char *data;
   
   // validation & setup
   if (error) *error = 0;   
   if (! (filename && vendor_id && device_id && vendor_text && device_text))
      HANDLE_ERROR("params cannot be null", err);

   int vlen, dlen;
   vlen = strlen (vendor_id);
   dlen = strlen (device_id);
   *vendor_text = *device_text = NULL;
   
   // open and map
   if ((fd = open (filename, O_RDONLY)) == -1)
      HANDLE_ERROR(NULL, err);
   if (fstat(fd, &st) == -1)
      HANDLE_ERROR(NULL, close_err)
   if ((data = mmap(NULL, st.st_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0)) == MAP_FAILED)
      HANDLE_ERROR(NULL, close_err)
   
   //look for vendor
   char *next;
   next = data;
   
   char *v_start=NULL;   
   while (next) {
      if (strncasecmp (next, vendor_id, vlen) == 0) {
         v_start = next;
         break;
      }
      if ((next = strchr (next, '\n'))) next++;
   }
   if (v_start) {
      next = strchr (next, '\n');
      if (next) {
         *next=0;
         next++;
      }
      *vendor_text = strdup ( v_start + strlen(vendor_id) + SEPARATOR_LEN );
   } else HANDLE_ERROR("vendor not found",unmap_err);
   
   //look for device
   char *d_start=NULL;
   if (v_start) {
      while (next && next[0]=='\t') {
         if (strncasecmp (next + DEVICE_PREFIX_LEN, device_id, dlen) == 0) {
            d_start = next;
            break;
         }
         if ((next = strchr (next, '\n'))) next++;
      }
      if (d_start) {
         next = strchr (next, '\n');
         if (next) {
            *next=0;
            next++;
         }
         *device_text = strdup ( d_start + strlen(device_id) + SEPARATOR_LEN + DEVICE_PREFIX_LEN );
      }
   }

   // clean up
   munmap (data, st.st_size);
   close (fd);
   return 1;
   
unmap_err:
   munmap (data, st.st_size);
close_err:
   close (fd);
err:
   return 0;
}


////////////////
// test
////////////////
// Command-line test driver: prog <vendor> <device>
// Prints "<vendor> (<device>) - <vendor text> (<device text>)" and exits 0,
// or prints the error and exits 1. A missing text is printed as "(null)"
// explicitly, because passing a NULL pointer to %s is undefined behaviour.
int main (int argc, char *argv[]) {

   char *pci_ids="/usr/share/pci.ids";
   char *vendor_text=NULL, *device_text=NULL, *error=NULL;

   if (argc < 3) { printf ("Usage: prog vendor device\n"); exit(1); }
   if (lookup_vendor_device (pci_ids, argv[1], argv[2],
                             &vendor_text, &device_text, &error)) {
      // device_text stays NULL when the vendor has no such device
      printf ("%s (%s) - %s (%s)\n", argv[1], argv[2],
              vendor_text ? vendor_text : "(null)",
              device_text ? device_text : "(null)");
      free (vendor_text);
      free (device_text);
   } else {
      // error can be NULL if the message itself could not be allocated
      printf ("Error: %s\n", error ? error : "(null)");
      free (error);
      exit (1);
   }
   exit(0);
}


Compiles well with musl.

It uses linear search so it's a bit slow, but since pci.ids only have about 17000 lines, it isn't so bad (2ms to find the first entry, 5ms to find the last entry).

Test cases (obviously with my copy of pci.ids which may not be the same as yours):
Code:

./lookup 1002 4144 --> radeon (radeon)
./lookup 1002 4143 --> radeon (null - can't find device)
./lookup 1002 0000 --> radeon (null - can't find device, 0000 device code belongs to another vendor)
./lookup 2002 4144 --> can't find vendor
./lookup 0000 xxxx --> first entry in pci.ids (no device)
./lookup "C 11" 10 --> those funny vendor/device entries near end of pci.ids
./lookup "C 11" 80 --> vendor/device of last entry of pci.ids
./lookup "C 11" 81 --> beyond last entry of pci.ids - vendor only, no device


EDIT: actually not too shabby, considering that sed does the same job in 14ms (looking for 1002 4144; I have skipped the pretty processing and just print the entire matched line):
Code:
sed -n '/^1002/ {p; :next ; n; /\t0000/ p; /^\t/!q; b next;}' /usr/share/pci.ids

The time taken by the routine for this particular entry is 2ms. Of course, the price you pay is 600KB of memory mapping, but I believe that's a small price to pay even on 32-bit systems.

More EDIT: Oh, and don't pass garbage as the pci.ids file. The expected format is: an optional leading tab, a hex code, two spaces, a description, and a newline. Garbage In Segfaults Out (tm) Smile

_________________
Fatdog64, Slacko and Puppeee user. Puppy user since 2.13.
Contributed Fatdog64 packages thread
Back to top
View user's profile Send_private_message 
technosaurus


Joined: 18 May 2008
Posts: 4423

PostPosted: Thu 01 Aug 2013, 22:42    Post_subject:  

I havent played around with it, but to me the obvious way to handle formatted files is fscanf, the formatting options are similar to those used by printf and you can read the data into a specialized struct for each entry or handle them on the fly.
_________________
Web Programming - Pet Packaging 100 & 101
Back to top
View user's profile Send_private_message 
Ibidem

Joined: 25 May 2010
Posts: 527
Location: State of Jefferson

PostPosted: Fri 02 Aug 2013, 02:23    Post_subject:  

Thanks for all the replies.
@jamesbond:
I'd been assuming that mmap would be out of question, with the potential number of devices. But it's much lower than I expected.
(I'd expect that around 4-8 MB it would make sense to avoid mmap...)
Also, 2 ms is very reasonable for a full search. But let's see...
An emulated Versatile board (qemu-system-arm) might be ~200 MHz effective speed, or ~1/10 - 1/15 of a standard X86 system...
I'd expect well under 100 ms on anything that's likely to run lspci, so it won't be noticeable.

@technosaurus:
As far as I can tell (man 3p), fscanf checks for whitespace in general, so checking for \t is going to break...also the oddball vendor IDs ("C 11") will really screw it up.
fgets could work if I did it that way.
I think it would look vaguely like this:
Code:

int match;
char *vname = 0L, *dname = 0L;
while(!errno){
  str=fgets(buf, sizeof(buf), fil);
  if (str[0]=='\t' && vname) str++;
  for(match=0; ; match < 4) {
    if str[match] == vendor[match] {
     match++;
    } else {
     break;
    }
  }
  if (match  > 3 )  vname = str + match + 4;
  }
  ....


@vovchik:
C is the language of the project I'm working on.
Back to top
View user's profile Send_private_message 
technosaurus


Joined: 18 May 2008
Posts: 4423

PostPosted: Fri 02 Aug 2013, 17:40    Post_subject:  

Yes, I wasnt thinking of that. It would be fairly straightforward (but possibly tedious) to adapt musl libc's fscanf code by replacing the isspace parts with a strncmp where the substring points to the next chunk of fmt string and n is the distance to the next %* ... Honestly its hard to believe such a thing doesnt already exist.
_________________
Web Programming - Pet Packaging 100 & 101
Back to top
View user's profile Send_private_message 
Ibidem

Joined: 25 May 2010
Posts: 527
Location: State of Jefferson

PostPosted: Sat 03 Aug 2013, 13:42    Post_subject:  

Here's an fgets-based example:
Code:
/* Lookup IDs using fgets in tab-delimited DB (pci.ids or usb.ids)
 * Written 2013 AD by Isaac Dunham, released under CC0
 */
#include <stdio.h>
#include <string.h>

/*
 * Check whether the line in buf starts with the four-character ID in id,
 * followed by the two-space separator.
 * Returns a pointer to the text after the separator, or NULL when the line
 * does not match. Neither string is read past its terminating NUL, so IDs
 * and lines shorter than expected simply fail to match.
 */
char * checkmatch(char * id, char * buf)
{
   enum { ID_LEN = 4, SEP_LEN = 2 };
   int i;

   for (i = 0; i < ID_LEN; i++) {
      /* A NUL in id ends the comparison; a NUL in buf is a mismatch. */
      if (id[i] == '\0' || id[i] != buf[i])
         return NULL;
   }
   /* buf[ID_LEN - 1] was not NUL, so buf[ID_LEN] is readable; the second
    * separator byte is only read when the first one is a space.
    */
   if (buf[ID_LEN] != ' ' || buf[ID_LEN + 1] != ' ')
      return NULL;
   return buf + ID_LEN + SEP_LEN;
}

/*
 * Copy the description starting at text, which points into the line held
 * in line, to dst without its line ending.
 * At most 255 characters are copied and dst is always NUL-terminated.
 * Nothing is copied when text lies beyond the end of line, which guards
 * against a match pointer computed on a line that was too short.
 */
static void copy_name(char * dst, const char * text, const char * line)
{
   size_t len;

   if ((size_t)(text - line) > strlen(line))
      return;
   len = strcspn(text, "\r\n");
   if (len > 255)
      len = 255;
   memcpy(dst, text, len);
   dst[len] = '\0';
}

/*
 * In: vendid, devid, fil
 * Out: vname, devname
 * Out must be zeroed before use.
 * vname and devname must be char[256], zeroed out
 * Returns (2 - number of IDs matched): vendor must be matched for
 * dev to be matched
 *
 * fil is read line by line from its current position. The device is only
 * searched for in the tab-indented lines that follow the vendor line.
 * Lines starting with '#' inside that list are skipped on the assumption
 * that they are comments - TODO confirm.
 */
int find_in_db(char * vendid, char * devid, FILE * fil,
      char * vname, char * devname)
{
   char buf[256], *text;

   /* Vendor: keep reading until a line yields a non-empty name. */
   while (!(vname[0])) {
      if (fgets(buf, sizeof buf, fil) == NULL)
         return 2;
      text = checkmatch(vendid, buf);
      if (text)
         copy_name(vname, text, buf);
   }
   /* Device: stop at the first line that is neither indented nor a comment. */
   while (!(devname[0])) {
      if (fgets(buf, sizeof buf, fil) == NULL)
         return 1;
      if (buf[0] == '#')
         continue;
      if (buf[0] != '\t')
         return 1;
      text = checkmatch(devid, buf + 1);
      if (text)
         copy_name(devname, text, buf);
   }
   return 0; /* Succeeded in matching both */
}

/*
 * Usage: prog <vendor-id> <device-id> <database-file>
 * Prints "<result>: <vendor> : <device>", where a name is replaced by the
 * ID given on the command line when it was not found.
 * Returns 1 when an argument is missing or the file cannot be opened,
 * 0 otherwise.
 */
int main(int argc, char **argv)
{
   char vendorname[256], devname[256];
   char * vendor, * device, * data;
   int result;
   FILE * file;

   /* All three arguments are required; argv[3] would otherwise be NULL. */
   if (argc < 4) {
      fprintf(stderr, "Usage: %s vendor device file\n",
            (argc > 0 && argv[0]) ? argv[0] : "lookup");
      return 1;
   }
   vendor = argv[1];
   device = argv[2];
   data = argv[3];

   file = fopen(data, "r");
   if (file == NULL) {
      perror(data);
      return 1;
   }

   /* find_in_db() requires zeroed output buffers. */
   memset(vendorname, 0x00, 256);
   memset(devname, 0x00, 256);

   result = find_in_db(vendor, device, file, vendorname, devname);
   printf("%d: %s : %s\n", result, (result < 2) ? vendorname : vendor,
       (result < 1) ? devname : device);
   fclose(file);
   return 0;
}

In this case, you specify the file to use as the 3rd argument.

It's a lot faster than I'd expected...
Back to top
View user's profile Send_private_message 
Display_posts:   Sort by:   
Page 1 of 1 Posts_count  
Post_new_topic   Reply_to_topic View_previous_topic :: View_next_topic
 Forum index » Off-Topic Area » Programming
Jump to:  

Rules_post_cannot
Rules_reply_cannot
Rules_edit_cannot
Rules_delete_cannot
Rules_vote_cannot
You cannot attach files in this forum
You can download files in this forum


Powered by phpBB © 2001, 2005 phpBB Group
[ Time: 0.1047s ][ Queries: 11 (0.0049s) ][ GZIP on ]