Attempting to parse a tab-based database in C

For discussions about programming, programming questions/advice, and projects that don't really have anything to do with Puppy.
Post Reply
Message
Author
Ibidem
Posts: 549
Joined: Wed 26 May 2010, 03:31
Location: State of Jefferson

Attempting to parse a tab-based database in C

#1 Post by Ibidem »

I'm trying to parse a tab-based database in this format:

Code: Select all

001a  Some long text here
\t03af  Subfield of 001a
\t040d  And another subfield
00c4  New field
Every entry is separated by a newline, and consists of a leading 16-bit hex value in plain text, two spaces, and an arbitrarily long string (the longest line is 108 chars, though). Subentries begin with a tab (C \t) preceding the hex value.
Entries are sorted.
I'm trying to get that string out, given the hex value.
(If you're curious, this is the pci-id database, which I'd like to parse so toybox lspci can output text. But this parser would also apply to the usb id database. Of course, this means any code I use must be PD/CC0/other permissive license that does not stipulate preservation of the copyright notice; I intend to indicate the source, but Rob's policy is to not require a notice if you copy from toybox.)

What I have is buggy and segfaults at line 38 (if curr[match] != id[match]), but I don't know why:

Code: Select all

/* Test of reading the PCI-id database
 * Written in 2013 and released under CC0, Isaac Dunham
 */
#define _XOPEN_SOURCE 600
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>


/*
 * Work-in-progress lookup: reads the database behind `fd` into `buf` in
 * chunks of up to `bufsiz` bytes and scans each chunk line by line for a
 * line whose first four characters equal the first four characters of `id`.
 *
 * `string` doubles as the "finished" flag of the inner loop and as the
 * would-be result; `subfield`, when non-NULL, is compared against the first
 * character of each line.
 *
 * NOTE(review): this draft is known to crash; the problems visible here are
 *  - `string` is a by-value parameter, so nothing assigned to it reaches the
 *    caller, and the function is declared to return int, not char *;
 *  - the result of read() is never checked (it may be 0 at end of file or
 *    -1 on error), and a short read leaves older data behind it in `buf`;
 *  - strstr() returns NULL once no newline is left in the buffer, and the
 *    next pass through the loop hands that NULL back to strstr();
 *  - control can fall off the end of the function without a return value.
 */
int get_db_entry(int fd, char *buf, ssize_t bufsiz, char *id, char *string, char *subfield)
{
  int overboard;
  off_t offset = 0;
  int bytes, match;
  char *curr, *eol;
  errno = 0;
  /* Outer loop: one read() per pass; it only ends when errno becomes non-zero. */
  while (!errno)
  {
    overboard = 0;
    bytes = read(fd, buf, bufsiz);
    /* curr = start of the line being tested, eol = where the newline search resumes. */
    curr = buf; eol = buf;
    if (string || overboard)
      printf("Something's wrong! string is %s, overboard is %d\n",string, overboard);
    /* Inner loop: one line per pass, until a result is set or the chunk is used up. */
    while (!string && !overboard) {
   /* Yes, this is where this should be.
    * If we aren't in a "subfield" any longer,
    * we cannot match the device within its class.
    */
      /* NOTE(review): curr[0]++ increments the byte stored in buf (the old
       * value is what gets compared), so this test also modifies the data. */
      if (subfield && curr[0]++ !=subfield[0])
        string=id;
      /* Compare up to four ID characters; on the first mismatch look for the
       * end of the line instead. */
      for (match=0; match<4;) {
        if (curr[match] != id[match]) {
          /* NOTE(review): eol becomes NULL when no newline remains; nothing
           * below stops the loop in that case, so the next mismatch calls
           * strstr(NULL, ...) - presumably the reported segfault; confirm. */
          eol=strstr(eol, "\n");
          break;
        } else {
          match++; printf("Match: %d\n", match);
        }
      }
      /* NOTE(review): %jd expects intmax_t but these arguments are ptrdiff_t. */
      printf("string loop: match: %d curr: %jd eol: %jd\n", match, curr-buf, eol-buf);
      if (match == 4) {
        /* NOTE(review): this stores a single char in a pointer and returns it
         * as int. With a 4-character ID plus two spaces the text would start
         * at curr + 6, so `curr + match + 2` was presumably intended. */
        string = curr[match + 3];
        return string;
      } else if ( eol && eol != buf) {
        /* Step past the newline to the start of the next line. */
        curr=++eol;
      }
      /* Fewer than 8 bytes left in the chunk (or no progress made): remember
       * how many bytes were left unprocessed. */
      if (((curr - buf) > (bytes - 8)) || (eol == buf)) {
        overboard = (buf + bytes) - curr;
      }
    } //answer or overboard
    /* Seek backwards by the unprocessed byte count so the next read() starts
     * at the beginning of the line that was cut off. */
    overboard = 0 - overboard;
    offset = lseek(fd, overboard, SEEK_CUR);
    /* NOTE(review): %d is used here for ptrdiff_t arguments. */
    printf("Read loop: off: %jd curr_off: %d eol_off: %d overboard: %d\n", (intmax_t)offset, curr-buf, eol-buf, overboard );
    
  } //errno
}

/*
 * Test driver: opens the system PCI ID database and asks get_db_entry()
 * for vendor "168c".  Only the vendor lookup is exercised.
 *
 * Returns 0 after the lookup has run, 1 if the database could not be opened
 * or the read buffer could not be allocated.
 */
int main(int argc, char *argv[])
{
  char *class = "168c";
  char *clnam = NULL;
  char *buff;
  int fd;

  (void)argc;
  (void)argv;

  fd = open("/usr/share/misc/pci.ids", O_RDONLY);
  if (fd == -1) {
    perror("/usr/share/misc/pci.ids");
    return 1;
  }

  /* 4096 bytes are read at a time; the 8 extra zeroed bytes keep the buffer
   * NUL-terminated for the string functions used by get_db_entry(). */
  buff = calloc(1, 4104);
  if (!buff) {
    perror("calloc");
    close(fd);
    return 1;
  }

  get_db_entry(fd, buff, 4096, class, clnam, 0);

  free(buff);
  close(fd);
  return 0;
}
Any ideas?

User avatar
vovchik
Posts: 1507
Joined: Tue 24 Oct 2006, 00:02
Location: Ukraine

#2 Post by vovchik »

Dear Ibidem,

I don't want to spoil all your fun with C, but it is much simpler with BaCon and the resulting binary is about 7600 bytes. And you can avoid a lot of unnecessary typing, too... :)

Code: Select all

' *****************************************************
' PROGRAM:	pcidb.bac
' PURPOSE:	parse pci database
' AUTHOR:		vovchik (Puppy Linux forum)
' MODDED:		
' DEPENDS:	gcc, bacon
' PLATFORM:	Puppy Linux (actually, any *nix)
' DATE:		27-07-2013
' NOTES:		Use following compile line and UPX binary afterwards (~7600 bytes):
' bacon -o -s -o -Os -o -fdata-sections -o -ffunction-sections -o -Wl,--gc-sections pcidb
' *****************************************************


' *********************
' SUBS & FUNCTIONS
' *********************

' ------------------
' Dump every entry of the ID database file myfile$ to standard output.
' Each line read is printed either as a manufacturer (ID + name) or as a
' device (ID + description); lines whose trimmed length is 4 or less are
' skipped. Prints "All done." after the last line.
SUB PARSE_DATA(STRING myfile$)
' ------------------
	LOCAL txt$ TYPE STRING
	OPEN myfile$ FOR READING AS myfile
	WHILE NOT(ENDFILE(myfile)) DO
		READLN txt$ FROM myfile
		' Ignore blank lines and anything too short to hold an ID plus text.
		IF LEN(CHOP$(txt$)) > 4 THEN
			' First character above the space code (32): a top-level line.
			' NOTE(review): '#' has code 35, so comment lines presumably take
			' this branch as well - confirm whether that is intended.
			IF ASC(LEFT$(txt$, 1)) > 32 THEN
				PRINT "------"
				PRINT "Manufacturer ID: ", LEFT$(txt$, 4)
				' Position 6 is presumably the second separator space after a
				' 4-character ID; CHOP$ trims it off the name.
				PRINT "Manufacturer name: ", CHOP$(MID$(txt$, 6))
			ELSE
				' Line starts with whitespace (e.g. a tab): trim it, then split
				' at the first double space into ID and description.
				txt$ = CHOP$(txt$)
				PRINT TAB$(1), "Device ID:   ", LEFT$(txt$, INSTR(txt$, "  ") - 1)
				PRINT TAB$(1), "Device desc: ", MID$(txt$, INSTR(txt$, "  ") + 2)
			END IF
		END IF
	WEND
	CLOSE FILE myfile
	PRINT "------"
	PRINT "All done."
END SUB

' *********************
' END SUBS & FUNCTIONS
' *********************
  

' *********************
' MAIN
' *********************

' Program entry point: dump the PCI ID database from its fixed path.
PARSE_DATA("/usr/share/misc/pci.ids")

' *********************
' END MAIN
' *********************
With kind regards,
vovchik

jamesbond
Posts: 3433
Joined: Mon 26 Feb 2007, 05:02
Location: The Blue Marble

#3 Post by jamesbond »

Code: Select all

/******************
 * Author: jamesbond, July 2013
 * Released under CC0
 * ****************/
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>

/* Record an error message and jump to a cleanup label.  Expects a
 * `char **error` out-parameter to be in scope at the point of use. */
#define HANDLE_ERROR(msg, label) do { set_error(error, msg); goto label; } while (0)

/* Heap-allocate a NUL-terminated copy of the `len` bytes starting at `start`.
 * Returns NULL if malloc() fails; the caller owns the result. */
static char *dup_range (const char *start, size_t len) {
	char *copy = malloc (len + 1);
	if (copy) {
		memcpy (copy, start, len);
		copy[len] = '\0';
	}
	return copy;
}

/* Store a malloc'd copy of `msg` in *error, or of strerror(errno) when `msg`
 * is NULL.  Does nothing when `error` is NULL.  *error is NULL afterwards if
 * the copy could not be allocated; the caller frees it. */
void set_error (char **error, char *msg) {
	const char *text;

	if (!error) return;
	text = msg ? msg : strerror (errno);
	*error = dup_range (text, strlen (text));
}

#define SEPARATOR_LEN  2 //two spaces
#define DEVICE_PREFIX_LEN 1 //one tab for the device

/* Start of the line following `line`, or NULL when no further line begins
 * before `end`.  Never reads at or beyond `end`. */
static const char *next_line (const char *line, const char *end) {
	const char *nl = memchr (line, '\n', (size_t)(end - line));
	return (nl && nl + 1 < end) ? nl + 1 : NULL;
}

/* If the bytes at `pos` are `id` (compared case-insensitively) immediately
 * followed by the two-space separator, return the start of the description;
 * otherwise NULL.  Never reads at or beyond `end`. */
static const char *match_id (const char *pos, const char *end,
                             const char *id, size_t id_len) {
	if ((size_t)(end - pos) < id_len + SEPARATOR_LEN) return NULL;
	if (strncasecmp (pos, id, id_len) != 0) return NULL;
	if (memcmp (pos + id_len, "  ", SEPARATOR_LEN) != 0) return NULL;
	return pos + id_len + SEPARATOR_LEN;
}

/* Copy the text from `text` up to (not including) the next newline or `end`. */
static char *dup_to_eol (const char *text, const char *end) {
	const char *nl = memchr (text, '\n', (size_t)(end - text));
	return dup_range (text, (size_t)((nl ? nl : end) - text));
}

/*
 * Look up a vendor, and a device belonging to it, in a pci.ids style file.
 *
 * filename     path of the database
 * vendor_id    ID expected at the start of a line, followed by two spaces
 * device_id    ID expected after one tab on the lines that directly follow
 *              the vendor line, followed by two spaces
 * vendor_text  out: malloc'd description of the vendor (caller frees)
 * device_text  out: malloc'd description of the device, or NULL when the
 *              vendor has no such device (still counts as success)
 * error        out, optional: malloc'd message on failure (caller frees)
 *
 * IDs are compared case-insensitively and must match the whole ID field: the
 * separator has to follow directly, so "100" does not match "1002".  The file
 * is mapped read-only and every scan is bounded by the mapping length, so no
 * terminating NUL after the file contents is needed.
 *
 * return: 0 - failed, 1 - success
 */
int lookup_vendor_device(char *filename, char *vendor_id, char *device_id, 
                         char **vendor_text, char **device_text, char **error)
{
	struct stat st;
	int fd;
	char *data;
	const char *end, *line, *text = NULL;
	size_t vlen, dlen, map_len;

	// validation & setup
	if (error) *error = NULL;
	if (! (filename && vendor_id && device_id && vendor_text && device_text))
		HANDLE_ERROR("params cannot be null", err);
	vlen = strlen (vendor_id);
	dlen = strlen (device_id);
	*vendor_text = *device_text = NULL;

	// open and map
	if ((fd = open (filename, O_RDONLY)) == -1)
		HANDLE_ERROR(NULL, err);
	if (fstat (fd, &st) == -1)
		HANDLE_ERROR(NULL, close_err);
	// a zero-length mapping is invalid, and an empty file has no vendors anyway
	if (st.st_size <= 0)
		HANDLE_ERROR("vendor not found", close_err);
	map_len = (size_t) st.st_size;
	if ((data = mmap (NULL, map_len, PROT_READ, MAP_PRIVATE, fd, 0)) == MAP_FAILED)
		HANDLE_ERROR(NULL, close_err);
	end = data + map_len;

	//look for vendor
	for (line = data; line; line = next_line (line, end)) {
		if ((text = match_id (line, end, vendor_id, vlen)))
			break;
	}
	if (!line)
		HANDLE_ERROR("vendor not found", unmap_err);
	if (!(*vendor_text = dup_to_eol (text, end)))
		HANDLE_ERROR(NULL, unmap_err);

	//look for device: only the run of tab-prefixed lines right after the vendor
	for (line = next_line (line, end); line && line[0] == '\t'; line = next_line (line, end)) {
		text = match_id (line + DEVICE_PREFIX_LEN, end, device_id, dlen);
		if (!text)
			continue;
		if (!(*device_text = dup_to_eol (text, end))) {
			// do not hand back half a result on allocation failure
			free (*vendor_text);
			*vendor_text = NULL;
			HANDLE_ERROR(NULL, unmap_err);
		}
		break;
	}

	// clean up
	munmap (data, map_len);
	close (fd);
	return 1;

unmap_err:
	munmap (data, map_len);
close_err:
	close (fd);
err:
	return 0;
}


////////////////
// test
////////////////
/*
 * Command-line test: prog vendor device
 * Prints "vendor (device) - vendor text (device text)" and exits 0, or prints
 * the error and exits 1.  A vendor without the requested device is still a
 * success; its missing device text is shown as "(null)".
 */
int main (int argc, char *argv[]) {
	
	char *pci_ids="/usr/share/pci.ids";
	char *vendor_text=NULL, *device_text=NULL, *error=NULL;
	
	if (argc < 3) { printf ("Usage: prog vendor device\n"); exit(1); }
	if (!lookup_vendor_device (pci_ids, argv[1], argv[2], 
	                           &vendor_text, &device_text, &error)) {
		// error stays NULL if the message itself could not be allocated
		printf ("Error: %s\n", error ? error : "unknown error");
		free (error);
		exit (1);
	}

	// passing NULL for %s is undefined behaviour, so substitute the text explicitly
	printf ("%s (%s) - %s (%s)\n", argv[1], argv[2],
	        vendor_text ? vendor_text : "(null)",
	        device_text ? device_text : "(null)");
	free (vendor_text);
	free (device_text);
	exit(0);
}
Compiles well with musl.

It uses linear search so it's a bit slow, but since pci.ids only has about 17000 lines, it isn't so bad (2ms to find the first entry, 5ms to find the last entry).

Test cases (obviously with my copy of pci.ids which may not be the same as yours):

Code: Select all

./lookup 1002 4144 --> radeon (radeon)
./lookup 1002 4143 --> radeon (null - can't find device)
./lookup 1002 0000 --> radeon (null - can't find device, 0000 device code belongs to another vendor)
./lookup 2002 4144 --> can't find vendor
./lookup 0000 xxxx --> first entry in pci.ids (no device)
./lookup "C 11" 10 --> those funny vendor/device entries near end of pci.ids
./lookup "C 11" 80 --> vendor/device of last entry of pci.ids
./lookup "C 11" 81 --> beyond last entry of pci.ids - vendor only, no device
EDIT: actually not too shabby considering that the sed version of the same lookup takes 14ms (looking for 1002 4144; I have skipped the pretty processing and just print the entire matched line):

Code: Select all

sed -n '/^1002/ {p; :next ; n; /\t0000/ p; /^\t/!q; b next;}' /usr/share/pci.ids
The time taken by the routine for this particular entry is 2ms. Of course, the price you pay is 600KB of memory mapping, but I believe that's a small price to pay even on 32-bit systems.

More EDIT: Oh, and don't pass garbage as the pci.ids file. The expected format is: an optional tab, a hex code, two spaces, then the description and a newline. Garbage In Segfaults Out (tm) :)
Fatdog64 forum links: [url=http://murga-linux.com/puppy/viewtopic.php?t=117546]Latest version[/url] | [url=https://cutt.ly/ke8sn5H]Contributed packages[/url] | [url=https://cutt.ly/se8scrb]ISO builder[/url]

User avatar
technosaurus
Posts: 4853
Joined: Mon 19 May 2008, 01:24
Location: Blue Springs, MO
Contact:

#4 Post by technosaurus »

I haven't played around with it, but to me the obvious way to handle formatted files is fscanf: the formatting options are similar to those used by printf, and you can read the data into a specialized struct for each entry or handle them on the fly.
Check out my [url=https://github.com/technosaurus]github repositories[/url]. I may eventually get around to updating my [url=http://bashismal.blogspot.com]blogspot[/url].

Ibidem
Posts: 549
Joined: Wed 26 May 2010, 03:31
Location: State of Jefferson

#5 Post by Ibidem »

Thanks for all the replies.
@jamesbond:
I'd been assuming that mmap would be out of question, with the potential number of devices. But it's much lower than I expected.
(I'd expect that around 4-8 MB it would make sense to avoid mmap...)
Also, 2 ms is very reasonable for a full search. But let's see...
An emulated Versatile board (qemu-system-arm) might be ~200 MHz effective speed, or ~1/10 - 1/15 of a standard X86 system...
I'd expect well under 100 ms on anything that's likely to run lspci, so it won't be noticeable.

@technosaurus:
As far as I can tell (man 3p), fscanf checks for whitespace in general, so checking for \t is going to break...also the oddball vendor IDs ("C 11") will really screw it up.
fgets could work if I did it that way.
I think it would look vaguely like this:

Code: Select all

int match;
char *vname = NULL, *dname = NULL;
while(!errno){
  /* fgets() returns NULL at end of file, which need not set errno */
  str=fgets(buf, sizeof(buf), fil);
  if (!str) break;
  /* once the vendor is known, step over the tab of its sub-entries */
  if (str[0]=='\t' && vname) str++;
  /* count how many of the four ID characters match */
  for(match=0; match < 4; match++) {
    if (str[match] != vendor[match]) {
     break; 
    }
  } 
  /* the text starts after the 4-character ID and the two-space separator */
  if (match  > 3 )  vname = str + match + 2;
  } 
  ....
@vovchik:
C is the language of the project I'm working on.

User avatar
technosaurus
Posts: 4853
Joined: Mon 19 May 2008, 01:24
Location: Blue Springs, MO
Contact:

#6 Post by technosaurus »

Yes, I wasn't thinking of that. It would be fairly straightforward (but possibly tedious) to adapt musl libc's fscanf code by replacing the isspace parts with a strncmp where the substring points to the next chunk of fmt string and n is the distance to the next %* ... Honestly, it's hard to believe such a thing doesn't already exist.
Check out my [url=https://github.com/technosaurus]github repositories[/url]. I may eventually get around to updating my [url=http://bashismal.blogspot.com]blogspot[/url].

Ibidem
Posts: 549
Joined: Wed 26 May 2010, 03:31
Location: State of Jefferson

#7 Post by Ibidem »

Here's an fgets-based example:

Code: Select all

/* Lookup IDs using fgets in tab-delimited DB (pci.ids or usb.ids)
 * Written 2013 AD by Isaac Dunham, released under CC0
 */
#include <stdio.h>
#include <string.h>

/*
 * Test whether buf starts with the 4-character ID `id` followed by the
 * two-space separator.
 * Returns a pointer to the text after the separator, or NULL if the ID does
 * not match, the separator is missing, or either string is shorter than an ID.
 * Never reads past the NUL terminator of buf or id.
 */
char * checkmatch(char * id, char * buf)
{
	enum { ID_LEN = 4, SEP_LEN = 2 };
	int i;

	for (i = 0; i < ID_LEN; i++) {
		/* stop at the end of buf; a NUL in id fails the comparison */
		if (buf[i] == '\0' || id[i] != buf[i])
			return NULL;
	}
	/* buf[ID_LEN] is at worst the terminator; && keeps the next read in bounds */
	if (buf[ID_LEN] != ' ' || buf[ID_LEN + 1] != ' ')
		return NULL;
	return buf + ID_LEN + SEP_LEN;
}

/* Copy text into the 256-byte buffer out, leaving off the newline that
 * fgets() keeps at the end of a line.  Always NUL-terminates. */
static void copy_name(char * out, const char * text)
{
	size_t len = strcspn(text, "\n");

	if (len > 255)
		len = 255;
	memcpy(out, text, len);
	out[len] = '\0';
}

/*
 * In: vendid, devid, fil
 * Out: vname, devname
 * vname and devname must be char[256], zeroed out before the call; a name
 * that is already non-empty is taken as found and its search is skipped.
 * The vendor is searched from the current position of fil; the device is
 * then searched among the tab-prefixed lines that directly follow it.
 * Returns (2 - number of IDs matched): vendor must be matched for 
 * dev to be matched
 */
int find_in_db(char * vendid, char * devid, FILE * fil,
		char * vname, char * devname)
{
	char buf[256], *text;

	if (!vname[0]) {
		do {
			if (fgets(buf, sizeof buf, fil) == NULL)
				return 2;
			text = checkmatch(vendid, buf);
		} while (text == NULL);
		copy_name(vname, text);
	}
	if (!devname[0]) {
		do {
			/* the first line without a leading tab ends this vendor's entries */
			if (fgets(buf, sizeof buf, fil) == NULL || buf[0] != '\t')
				return 1;
			text = checkmatch(devid, buf + 1);
		} while (text == NULL);
		copy_name(devname, text);
	}
	return 0; /* Succeeded in matching both */
}

/*
 * Usage: prog vendor device database
 * Looks the vendor and device IDs up in the given database file and prints
 * "<result>: <vendor> : <device>", showing the raw ID for whichever name
 * was not found.  Returns 1 on a usage error or if the file cannot be opened.
 */
int main(int argc, char **argv)
{
	char vendorname[256], devname[256];
	char * vendor, * device, * data;
	int result;
	FILE * file;

	if (argc < 4) {
		fprintf(stderr, "Usage: %s vendor device database\n",
			(argc > 0) ? argv[0] : "lookup");
		return 1;
	}
	vendor = argv[1];
	device = argv[2];
	data = argv[3];

	file = fopen(data, "r");
	if (file == NULL) {
		perror(data);
		return 1;
	}

	/* find_in_db() requires both output buffers to start out zeroed */
	memset(vendorname, 0x00, 256);
	memset(devname, 0x00, 256);
	
	result = find_in_db(vendor, device, file, vendorname, devname);
	printf("%d: %s : %s\n", result, (result < 2) ? vendorname : vendor,
		 (result < 1) ? devname : device);
	fclose(file);
	return 0;
}	
In this case, you specify the file to use as the 3rd argument.

It's a lot faster than I'd expected...

Post Reply