read line by line in the most efficient way *platform specific*
Doing read(2) directly is not a good fit because lines get fragmented across buffer boundaries (e.g. if you read 1000 chars, the last line may start at offset 990 and need 50 chars beyond the buffer).
It is better to use fgets [recommended] or mmap [YMMV].
Here are examples of each. Caveat: the code compiles, but it is not tested and doesn't do much error checking.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/mman.h>
// one line of a text file, as produced by the line readers in this file
typedef struct {
// value of the reader's running line counter when the line was stored
// (the first line gets 0)
unsigned int lidx;
// malloc'd, NUL-terminated copy of the line text
char *lncontent;
} FileL;
// read in file lines using fgets
//
// Opens "file" as text and stores every line as a malloc'd, NUL-terminated
// string with its trailing newline stripped.  A line longer than the local
// read buffer arrives from fgets in several chunks; the chunks are joined
// back into a single entry instead of being stored as separate lines.
//
// returns: malloc'd array with one FileL per line, in file order, followed
// by one terminating entry whose lncontent is NULL (its lidx holds the line
// count).  Returns NULL if the file cannot be opened or read, or if memory
// runs out.  The caller owns the array and every lncontent string.
FileL *
getfileline_fgets(char *file)
{
    FILE *xf;
    char *cp;
    size_t len;
    size_t curlen;
    int gotnl;
    int partial;
    int failed;
    int linecnt;
    int linemax;
    FileL *linelist;
    FileL *newlist;
    FileL *line;
    char lbuf[50000];

    xf = fopen(file,"r");
    if (xf == NULL)
        return NULL;

    linecnt = 0;
    linemax = 0;
    linelist = NULL;
    curlen = 0;
    partial = 0;
    failed = 0;

    while (fgets(lbuf,sizeof(lbuf),xf) != NULL) {
        len = strlen(lbuf);

        // no newline means fgets filled the buffer (or hit EOF) mid-line
        gotnl = (len > 0) && (lbuf[len - 1] == '\n');

        // strip newline from string [or not]
#if 1
        if (gotnl)
            lbuf[--len] = 0;
#endif

        if (partial) {
            // continuation chunk: append to the line already being built
            line = &linelist[linecnt - 1];
            cp = realloc(line->lncontent,curlen + len + 1);
            if (cp == NULL) {
                failed = 1;
                break;
            }
            memcpy(cp + curlen,lbuf,len + 1);
            line->lncontent = cp;
            curlen += len;
        }
        else {
            // "+ 2": room for this line plus the terminating entry
            if ((linecnt + 2) > linemax) {
                linemax += 100;
                newlist = realloc(linelist,linemax * sizeof(FileL));
                if (newlist == NULL) {
                    failed = 1;
                    break;
                }
                linelist = newlist;
            }

            cp = malloc(len + 1);
            if (cp == NULL) {
                failed = 1;
                break;
            }
            memcpy(cp,lbuf,len + 1);

            line = &linelist[linecnt];
            line->lidx = linecnt++;
            line->lncontent = cp;
            curlen = len;
        }

        partial = ! gotnl;
    }

    // fgets returns NULL for both EOF and a read error
    if (ferror(xf))
        failed = 1;
    fclose(xf);

    if (! failed) {
        // trim to maximum used, keeping one slot for the terminating entry
        newlist = realloc(linelist,(linecnt + 1) * sizeof(FileL));
        if (newlist == NULL)
            failed = 1;
        else {
            linelist = newlist;
            linelist[linecnt].lidx = linecnt;
            linelist[linecnt].lncontent = NULL;
        }
    }

    if (failed) {
        // release everything stored so far
        while (linecnt > 0)
            free(linelist[--linecnt].lncontent);
        free(linelist);
        return NULL;
    }

    return linelist;
}
// read in file lines by mmap to entire file
//
// Maps "file" read-only and stores every line as a malloc'd, NUL-terminated
// string with its trailing newline stripped.  Line ends are located with
// memchr bounded by the bytes remaining in the mapping, because the mapping
// is not NUL-terminated.  A final line without a newline is stored too.
//
// returns: malloc'd array with one FileL per line, in file order, followed
// by one terminating entry whose lncontent is NULL (its lidx holds the line
// count).  Returns NULL if the file cannot be opened, examined or mapped, or
// if memory runs out.  The caller owns the array and every lncontent string.
FileL *
getfilelines_mmap(char *file)
{
    int fd;
    char *lhs;
    char *rhs;
    char *cp;
    size_t size;
    size_t off;
    size_t len;
    size_t cpylen;
    int failed;
    int linecnt;
    int linemax;
    FileL *linelist;
    FileL *newlist;
    FileL *line;
    struct stat st;
    char *fbuf;

    fd = open(file,O_RDONLY);
    if (fd < 0)
        return NULL;

    if (fstat(fd,&st) < 0) {
        close(fd);
        return NULL;
    }
    size = st.st_size;

    // mmap rejects a zero-length mapping, so an empty file is never mapped
    fbuf = NULL;
    if (size > 0) {
        fbuf = mmap(NULL,size,PROT_READ,MAP_PRIVATE,fd,0);
        if (fbuf == MAP_FAILED) {
            close(fd);
            return NULL;
        }
    }

    linecnt = 0;
    linemax = 0;
    linelist = NULL;
    failed = 0;

    // "+ 1" steps over the newline that ended the line
    for (off = 0; off < size; off += len + 1) {
        lhs = fbuf + off;
        rhs = memchr(lhs,'\n',size - off);

        // len counts the line's characters, excluding any newline; a last
        // line with no newline runs to the end of the file
        if (rhs != NULL)
            len = rhs - lhs;
        else
            len = size - off;

        // strip newline from string [or not]
#if 1
        cpylen = len;
#else
        cpylen = (rhs != NULL) ? len + 1 : len;
#endif

        // "+ 2": room for this line plus the terminating entry
        if ((linecnt + 2) > linemax) {
            linemax += 100;
            newlist = realloc(linelist,linemax * sizeof(FileL));
            if (newlist == NULL) {
                failed = 1;
                break;
            }
            linelist = newlist;
        }

        cp = malloc(cpylen + 1);
        if (cp == NULL) {
            failed = 1;
            break;
        }
        memcpy(cp,lhs,cpylen);
        cp[cpylen] = 0;

        line = &linelist[linecnt];
        line->lidx = linecnt++;
        line->lncontent = cp;
    }

    if (fbuf != NULL)
        munmap(fbuf,size);
    close(fd);

    if (! failed) {
        // trim to maximum used, keeping one slot for the terminating entry
        newlist = realloc(linelist,(linecnt + 1) * sizeof(FileL));
        if (newlist == NULL)
            failed = 1;
        else {
            linelist = newlist;
            linelist[linecnt].lidx = linecnt;
            linelist[linecnt].lncontent = NULL;
        }
    }

    if (failed) {
        // release everything stored so far
        while (linecnt > 0)
            free(linelist[--linecnt].lncontent);
        free(linelist);
        return NULL;
    }

    return linelist;
}
UPDATE
You wanted benchmarks. Well, you're gonna get 'em. I generated 167GB of random text data, spanning 140 files. In the output below, L: is the number of lines, W: is the maximum line width, and the last field on each header line is the file size in MB. Times are in nanoseconds. Note that the speedup factor varies from file to file, but it looks like mmap wins.
23:39:35.528333425 NEWDAY 11/09/15
23:39:35.528333425 ph: starting 23107 ...
23:39:35.528868198 ph: ARGV fastreadgo ...
F001: L:324255 W:2097 324.086MB
368297556 fgets
189180143 mmap
F002: L:329608 W:2822 443.649MB
475989122 fgets
248517335 mmap
F003: L:401476 W:6186 1185.270MB
1206999411 fgets
657703847 mmap
F004: L:729379 W:9350 3253.185MB
3199692871 fgets
1776602082 mmap
F005: L:85857 W:5185 212.599MB
223489564 fgets
122404608 mmap
F006: L:62871 W:5418 162.384MB
167640768 fgets
93127042 mmap
F007: L:298836 W:1083 154.481MB
196584474 fgets
100582134 mmap
F008: L:221513 W:2732 288.694MB
322105867 fgets
164965547 mmap
F009: L:420815 W:8906 1789.672MB
1801309998 fgets
961136893 mmap
F010: L:126712 W:8251 498.905MB
499274233 fgets
275901635 mmap
F011: L:443166 W:8822 1865.753MB
1839816883 fgets
1001882651 mmap
F012: L:385632 W:2162 398.467MB
467223648 fgets
248126909 mmap
F013: L:629448 W:4413 1324.616MB
1432284339 fgets
777593198 mmap
F014: L:510357 W:7313 1779.348MB
1919309671 fgets
1079111734 mmap
F015: L:188434 W:1254 112.922MB
152367682 fgets
78959769 mmap
F016: L:82139 W:4355 170.586MB
193117015 fgets
105417805 mmap
F017: L:389499 W:9063 1681.805MB
1730894028 fgets
913789253 mmap
F018: L:992849 W:3265 1547.875MB
1685006767 fgets
875256226 mmap
F019: L:931502 W:9647 4285.883MB
11181005402 fgets
2361255543 mmap
F020: L:266047 W:7454 946.298MB
955772708 fgets
537059554 mmap
F021: L:572709 W:67 18.835MB
86539501 fgets
43437303 mmap
F022: L:68373 W:3042 98.684MB
110325296 fgets
57538963 mmap
F023: L:651839 W:2006 624.153MB
706094723 fgets
369122560 mmap
F024: L:414658 W:6482 1284.202MB
1294352248 fgets
700279769 mmap
F025: L:984554 W:3441 1616.269MB
1742233370 fgets
903755131 mmap
F026: L:527629 W:3214 808.812MB
872660092 fgets
465403685 mmap
F027: L:572103 W:6219 1696.582MB
1758562312 fgets
933024466 mmap
F028: L:793354 W:5967 2255.653MB
2341754885 fgets
1251633414 mmap
F029: L:690669 W:389 128.888MB
230036016 fgets
119381427 mmap
F030: L:902519 W:8182 3523.415MB
6665490426 fgets
1930049511 mmap
F031: L:179482 W:2361 201.850MB
225333697 fgets
120424715 mmap
F032: L:342396 W:4135 675.885MB
706219203 fgets
379974402 mmap
F033: L:762237 W:4000 1455.780MB
1535236381 fgets
805977762 mmap
F034: L:421947 W:8289 1669.038MB
1686877811 fgets
900813641 mmap
F035: L:367349 W:5829 1022.373MB
1051584165 fgets
566680706 mmap
F036: L:433973 W:5064 1049.724MB
1097920811 fgets
584855289 mmap
F037: L:615918 W:9152 2686.372MB
2743719787 fgets
1468536802 mmap
F038: L:365187 W:1564 272.829MB
326368364 fgets
171071840 mmap
F039: L:61305 W:477 14.002MB
22945438 fgets
11949833 mmap
F040: L:396788 W:8576 1622.049MB
1633217001 fgets
884460205 mmap
F041: L:245326 W:5068 592.450MB
610530077 fgets
328366102 mmap
F042: L:986409 W:9174 4313.608MB
17048484450 fgets
2413375121 mmap
F043: L:367968 W:9703 1703.785MB
1677764299 fgets
922735827 mmap
F044: L:630679 W:9763 2942.911MB
4742195305 fgets
1585438052 mmap
F045: L:397072 W:7717 1459.554MB
1533634531 fgets
860518182 mmap
F046: L:918129 W:9127 3996.179MB
10259712214 fgets
2171550789 mmap
F047: L:770706 W:2720 999.584MB
1097599308 fgets
604894013 mmap
F048: L:472462 W:5011 1127.896MB
1164186449 fgets
621979909 mmap
F049: L:301834 W:4456 642.703MB
664420452 fgets
354255131 mmap
F050: L:213878 W:2913 297.159MB
321396955 fgets
168664579 mmap
F051: L:549950 W:1681 441.842MB
510553173 fgets
260455948 mmap
F052: L:63502 W:8785 267.074MB
265697002 fgets
142457939 mmap
F053: L:880396 W:6821 2864.595MB
3769485430 fgets
1591318886 mmap
F054: L:180543 W:9055 779.566MB
773462618 fgets
428627500 mmap
F055: L:964409 W:8454 3884.437MB
9085108760 fgets
2149540695 mmap
F056: L:675120 W:8912 2872.781MB
2885159527 fgets
1559580604 mmap
F057: L:345151 W:4157 684.052MB
724456228 fgets
387170980 mmap
F058: L:69114 W:4585 150.535MB
157447952 fgets
84782951 mmap
F059: L:304627 W:9441 1370.777MB
1376517664 fgets
739170571 mmap
F060: L:799770 W:3145 1200.762MB
1304001986 fgets
679163462 mmap
F061: L:808699 W:6544 2523.949MB
2590924710 fgets
1385627164 mmap
F062: L:270082 W:313 40.592MB
78777863 fgets
40733146 mmap
F063: L:308883 W:333 49.262MB
93696361 fgets
48580067 mmap
F064: L:237002 W:2618 296.446MB
347315129 fgets
178078149 mmap
F065: L:279040 W:1341 178.685MB
217230537 fgets
113291912 mmap
F066: L:809386 W:2808 1085.734MB
1177480987 fgets
615248653 mmap
F067: L:279448 W:8560 1140.280MB
1151044788 fgets
614662533 mmap
F068: L:80012 W:7441 283.334MB
286915203 fgets
158077955 mmap
F069: L:366808 W:7197 1260.521MB
1292679736 fgets
696686301 mmap
F070: L:272693 W:9275 1206.527MB
1220763889 fgets
658383413 mmap
F071: L:792609 W:1419 537.088MB
645760162 fgets
334886975 mmap
F072: L:742523 W:8640 3059.604MB
5711688133 fgets
1665879727 mmap
F073: L:583753 W:2992 833.759MB
910037328 fgets
483847376 mmap
F074: L:252560 W:7178 864.593MB
868625985 fgets
471770777 mmap
F075: L:154327 W:7026 515.619MB
516135586 fgets
277690063 mmap
F076: L:121839 W:7131 414.684MB
424518600 fgets
230518357 mmap
F077: L:760327 W:1421 515.475MB
622630358 fgets
314592959 mmap
F078: L:907033 W:3485 1508.042MB
1622356297 fgets
845695719 mmap
F079: L:884787 W:7491 3162.774MB
4932864122 fgets
1749065509 mmap
F080: L:432556 W:6039 1245.779MB
1281231973 fgets
693532807 mmap
F081: L:639804 W:6419 1957.747MB
2107303517 fgets
1130299002 mmap
F082: L:388669 W:283 52.804MB
111686630 fgets
57177517 mmap
F083: L:300542 W:1943 278.825MB
336538347 fgets
177494803 mmap
F084: L:941 W:3878 1.770MB
2047540 fgets
1347230 mmap
F085: L:85747 W:1841 75.417MB
92274672 fgets
49362653 mmap
F086: L:935559 W:5950 2656.411MB
2734326131 fgets
1487147776 mmap
F087: L:936993 W:1197 535.727MB
672872562 fgets
348765250 mmap
F088: L:409671 W:5235 1023.358MB
1099320520 fgets
606047909 mmap
F089: L:362220 W:5434 938.805MB
991448256 fgets
529093412 mmap
F090: L:628156 W:3682 1103.909MB
1185317812 fgets
637902310 mmap
F091: L:655456 W:6051 1892.574MB
1978859918 fgets
1066368241 mmap
F092: L:356309 W:5946 1012.893MB
1046818030 fgets
562463577 mmap
F093: L:878726 W:2946 1236.162MB
1368885560 fgets
701514499 mmap
F094: L:583863 W:747 208.701MB
293177923 fgets
148045230 mmap
F095: L:51374 W:3752 91.670MB
98830853 fgets
52715699 mmap
F096: L:757271 W:4698 1698.664MB
1790811621 fgets
946452098 mmap
F097: L:665420 W:1814 575.369MB
664290848 fgets
347346293 mmap
F098: L:152806 W:4480 326.336MB
338683910 fgets
185037896 mmap
F099: L:39027 W:2368 44.104MB
49144948 fgets
26701307 mmap
F100: L:896926 W:8209 3513.328MB
7460727008 fgets
1900543480 mmap
F101: L:796628 W:5663 2149.888MB
2207899454 fgets
1187397751 mmap
F102: L:876500 W:1986 831.161MB
934850175 fgets
486626065 mmap
F103: L:188682 W:773 69.722MB
97285527 fgets
48765985 mmap
F104: L:648920 W:9590 2969.446MB
5021968784 fgets
1622268239 mmap
F105: L:827850 W:2123 837.892MB
946063144 fgets
498163978 mmap
F106: L:879828 W:2867 1205.021MB
1304295176 fgets
682155187 mmap
F107: L:970674 W:3830 1771.667MB
1883664162 fgets
989569477 mmap
F108: L:4461 W:5634 11.840MB
12680659 fgets
7159011 mmap
F109: L:477207 W:1067 243.224MB
315370392 fgets
162708299 mmap
F110: L:140308 W:5817 389.132MB
397510757 fgets
216204659 mmap
F111: L:253358 W:4425 534.937MB
559943651 fgets
297109524 mmap
F112: L:903292 W:7989 3441.851MB
7327033977 fgets
1906200470 mmap
F113: L:555989 W:620 164.835MB
245638038 fgets
126559933 mmap
F114: L:596425 W:2330 664.143MB
739017073 fgets
391237002 mmap
F115: L:298147 W:9741 1387.530MB
1363229979 fgets
744420477 mmap
F116: L:180269 W:4522 389.175MB
402702977 fgets
213875684 mmap
F117: L:238597 W:9021 1029.314MB
1033070395 fgets
550442036 mmap
F118: L:183723 W:8705 764.555MB
765959712 fgets
413667801 mmap
F119: L:174802 W:549 45.896MB
70635625 fgets
35721310 mmap
F120: L:883013 W:4666 1963.677MB
2062197751 fgets
1092583730 mmap
F121: L:858995 W:9218 3776.896MB
9278222413 fgets
2309240152 mmap
F122: L:368895 W:5862 1030.174MB
1076473726 fgets
582127460 mmap
F123: L:208043 W:5672 563.889MB
579427255 fgets
310321934 mmap
F124: L:768482 W:4953 1816.657MB
1888233155 fgets
997797932 mmap
F125: L:905425 W:2812 1214.882MB
1394928053 fgets
724059403 mmap
F126: L:54137 W:4690 121.066MB
125124760 fgets
67811537 mmap
F127: L:448100 W:9643 2061.624MB
2066282543 fgets
1126488038 mmap
F128: L:748979 W:2111 754.038MB
854095589 fgets
447406977 mmap
F129: L:611388 W:6954 2026.306MB
2074219917 fgets
1118353849 mmap
F130: L:782834 W:9946 3715.067MB
7338500374 fgets
2029571615 mmap
F131: L:52630 W:7858 197.495MB
200711062 fgets
110759659 mmap
F132: L:930983 W:7363 3270.546MB
3376813502 fgets
1776365395 mmap
F133: L:73216 W:2127 74.344MB
85854537 fgets
46756335 mmap
F134: L:583306 W:2495 694.192MB
766430638 fgets
408095226 mmap
F135: L:877424 W:2964 1241.342MB
1339005805 fgets
702659289 mmap
F136: L:414854 W:5104 1010.006MB
1057372341 fgets
556583887 mmap
F137: L:333176 W:4912 781.109MB
820007572 fgets
435433956 mmap
F138: L:564006 W:6933 1863.574MB
1905024687 fgets
1030574213 mmap
F139: L:829571 W:9152 3622.399MB
7338698902 fgets
2002428493 mmap
F140: L:560210 W:7443 1990.047MB
2012670010 fgets
1098720143 mmap
00:00:58.770988225 NEWDAY 11/10/15
00:00:58.770988225 ph: complete (ELAPSED: 00:21:23.190149545)
Here's the perl script I used to generate the files:
#!/usr/bin/perl
# grpcntgen -- generate test data for fastread algorithms
#
# arguments:
# "-W" - maximum line width
# "-L" - maximum number of lines
# "-T" - number of test files to generate
# "-O" - output file (e.g. foo%.txt)
#
# NOTE: with no arguments or missing arguments will prompt
#pragma pgmlns
# tstgen -- test generation help routines
# gengetstr -- get a string/number
# gengetstr -- get a string/number
#
# arguments:
#   numflg -- 0: string (a prompted answer must be non-empty)
#             1: flag, prompted as "(0/1)?" (any string of digits is taken)
#             2: number > 0, and <= lim when lim is given; an empty prompted
#                answer selects lim when lim is given
#   opt    -- option prefix to look for in @argv (e.g. "-W")
#   prompt -- text shown when asking, and when echoing the value
#   lim    -- optional maximum / default
#
# The value is taken from the first @argv element that starts with opt.  A
# numeric option given with no value becomes lim, or 1 when lim is not given.
# When @argv has no such element the user is prompted on STDIN until the
# answer is acceptable.  A value that came from @argv is echoed to STDOUT.
#
# returns: the value
# dies:    if STDIN reaches end of input while an answer is still needed
sub gengetstr
{
    my($numflg,$opt,$prompt,$lim) = @_;
    my($arg);
    my($askflg);
    my($val);

    # prompts carry no newline, so they must not sit in the output buffer
    select(STDOUT);
    $| = 1;

    {
        # search command line for -whatever
        foreach $arg (@argv) {
            if ($arg =~ /^$opt(.*)$/) {
                $val = $1;
                if ($numflg && ($val eq "")) {
                    $val = $lim;
                    $val //= 1;
                }
                last;
            }
        }
        last if (defined($val));

        $askflg = 1;

        while (1) {
            printf("Enter ")
                if ($numflg != 1);

            printf("%s",$prompt);

            if ($numflg == 1) {
                printf(" (0/1)? ");
            }
            else {
                printf(": ");
            }

            $val = <STDIN>;

            # without this check, end of input would re-prompt forever
            die("gengetstr: unexpected end of input while reading $prompt\n")
                unless (defined($val));

            chomp($val);

            if ($numflg == 0) {
                last if ($val ne "");
                next;
            }

            # an empty response for a number with a maximum means use it
            if (($numflg == 2) && ($val eq "") && defined($lim)) {
                $val = $lim;
                last;
            }

            next unless ($val =~ /^\d+$/);
            $val += 0;

            last if ($numflg == 1);

            next if ($val <= 0);
            last unless (defined($lim));
            last if ($val <= $lim);
        }
    }

    # show what was picked up from the command line
    unless ($askflg) {
        printf("%s: %s\n",$prompt,$val);
    }

    $val;
}
# genrun -- generate all tests
#
# arguments (after any leading "-name" / "-name=value" options, which are
# handed to genvbq):
#   output file name -- a "%" in it selects one output file per test; the
#                       "%" is replaced by "_" plus the zero-padded test number
#   number of tests
#   code reference called once per test, with no arguments
#
# NOTE: the working state is local() rather than my() because the routines
# called from here read it by name
sub genrun
{
    local(@argv) = @_;
    local($ofile,$tstmax,$tstproc);
    local($tstcur);
    local($splitflg);
    local($genvbq);
    my($width);
    my($numfmt);
    my($xfile);

    $genvbq = genvbq(\@argv);

    ($ofile,$tstmax,$tstproc) = splice(@argv,0,3);

    # split each test into separate file
    if (index($ofile,"%") >= 0) {
        $splitflg = 1;

        # pad the test number to as many digits as the largest one needs
        $width = length(sprintf("%d",$tstmax));
        $numfmt = sprintf("_%%%d.%dd",$width,$width);
        $ofile =~ s/%/$numfmt/;
        ###die("genrun: DEBUG_CAE numfmt='$numfmt' ofile='$ofile'\n");
    }

    # single output file: opened once, before the first test
    genopen($ofile)
        unless ($splitflg);

    for ($tstcur = 1; $tstcur <= $tstmax; ++$tstcur) {
        if ($splitflg) {
            $xfile = sprintf($ofile,$tstcur);
            genopen($xfile);
        }

        $tstproc->();

        genclose()
            if ($splitflg);
    }

    # single output file: closed once, after the last test
    genclose()
        unless ($splitflg);
}
# genvbq -- get options
#
# arguments:
#   argv -- reference to an argument array; leading options are removed
#           from it in place
#
# Consumes elements from the front of the array for as long as they start
# with "-".  "-name=value" stores value under name; any other option is
# stored with the value 1 under whatever follows the "-".
#
# returns: hash reference holding the options found (empty if none)
sub genvbq
{
    my($argv) = @_;
    my($sym,$val);
    my($env);

    $env = {};

    while (1) {
        $sym = $argv->[0];

        # stop at the end of the array or at the first non-option
        last unless (defined($sym) && ($sym =~ s/^-//));
        shift(@$argv);

        if ($sym =~ /^([^=]+)=(.+)$/) {
            ($sym,$val) = ($1,$2);
        }
        else {
            $val = 1;
        }

        $env->{$sym} = $val;
    }

    $env;
}
# genopen -- open output
#
# arguments:
#   ofile -- name of the file to create/truncate
#
# Records the name in $gen_ofile and opens $xfdst for writing to it.  When
# the "n" option is set in $genvbq nothing is opened.
#
# The three-argument form of open is used so that the name is taken
# literally: the two-argument form trims whitespace from it and lets
# characters in it act as mode/pipe syntax.
#
# dies: if the file cannot be opened
sub genopen
{
    my($ofile) = @_;

    $gen_ofile = $ofile;

    # "n" option: record the name only
    return if ($genvbq->{"n"});

    open($xfdst,">",$ofile)
        or die("genopen: unable to open '$ofile' -- $!\n");
}
# genclose -- close output
#
# Closes $xfdst.  Buffered output is only written out at close, so a failed
# close means data was lost and is reported.  Does nothing when $xfdst is
# not an open handle (e.g. genopen skipped the open, or it is already closed).
#
# dies: if the close fails
sub genclose
{
    return unless (defined($xfdst) && defined(fileno($xfdst)));

    close($xfdst)
        or die("genclose: error closing '" .
            (defined($gen_ofile) ? $gen_ofile : "") . "' -- $!\n");
}
# geninit -- initialize for single test
#
# Resets the two pieces of state that genout works with: the pending line
# text ($genout_lhs) and the line prefix ($genout_pre).
sub geninit
{
    $genout_lhs = $genout_pre = undef;
}
# genout -- output data
#
# arguments:
#   rhs -- text to add to the pending line; undef flushes the pending line
#
# Text is collected in $genout_lhs.  When adding rhs would bring prefix +
# pending text + rhs to 78 characters or more, the pending line is first
# written to $xfdst as $genout_pre, the pending text and a newline, and rhs
# then starts a new pending line.  Nothing is written while the pending
# text is empty.
sub genout
{
    my($rhs) = @_;
    my($fits);

    # only a defined rhs can "fit"; undef always asks for a flush
    $fits = defined($rhs) &&
        ((length($genout_pre) + length($genout_lhs) + length($rhs)) < 78);

    if ((! $fits) && ($genout_lhs ne "")) {
        print {$xfdst} $genout_pre,$genout_lhs,"\n";
        undef($genout_lhs);
    }

    $genout_lhs .= $rhs
        if (defined($rhs));
}
# genrand -- get random number
#
# arguments:
#   lim -- upper limit
#
# returns: rand(lim) truncated to an integer, plus one
sub genrand
{
    my($lim) = @_;

    return 1 + int(rand($lim));
}
# genfmtof -- get number format
#
# arguments:
#   num -- number whose printed width sizes the format
#
# returns: a printf format made of a space and "%Nd", where N is the number
# of characters num occupies when printed with "%d"
sub genfmtof
{
    my($num) = @_;
    my($width);

    $width = length(sprintf("%d",$num));

    return sprintf(" %%%dd",$width);
}
# true value; presumably the end marker of the helper-routine section that
# was pulled into this script -- TODO confirm
1;
# program entry: run the generator on the command-line arguments, then exit
# with a success status
master(@ARGV);
exit(0);
# master -- master control
#
# arguments: the command-line arguments
#
# Collects the maximum line width ($Wmax), the maximum number of lines per
# file ($Lmax), the number of tests ($tstmax) and the output file name
# ($ofile) through gengetstr, then runs dotest once per test via genrun.
# The output file name must contain a "%".
#
# NOTE: @argv is local() rather than my() because gengetstr reads it by name
sub master
{
    local(@argv) = @_;

    $Wmax = gengetstr(2,"-W","maximum line width");
    $Lmax = gengetstr(2,"-L","maximum number of lines / file");
    $tstmax = gengetstr(2,"-T","number of tests");

    while (1) {
        $ofile = gengetstr(0,"-O","output file name");
        last if ($ofile =~ /%/);
        printf("fastreadgen: filename must have %% in it (e.g. foo%%.txt)\n");

        # a rejected name that came from the command line would be picked up
        # again on every pass; drop it so the next pass prompts instead
        @argv = grep { $_ !~ /^-O/ } @argv;
    }

    genrun($ofile,$tstmax,\&dotest);
}
# dotest -- generate a test
#
# Draws this test's limits with genrand -- $Wlim from $Wmax and $Llim from
# $Lmax -- and reports them on STDOUT together with $tstcur / $tstmax.  It
# then writes $Llim lines to $xfdst.  Each line gets a length $Wcur of
# genrand($Wlim) - 1 and consists of a single randomly drawn character
# (code genrand(0x7E - 0x20) + 0x20 - 1) repeated $Wcur times; a character
# is only drawn when $Wcur is above zero.
sub dotest
{
    my($lno);
    my($fill);

    $Wlim = genrand($Wmax);
    $Llim = genrand($Lmax);

    printf("dotest: T=%d/%d W=%d L=%d\n",$tstcur,$tstmax,$Wlim,$Llim);

    $lno = 0;
    while (++$lno <= $Llim) {
        $Wcur = genrand($Wlim) - 1;

        # one random draw per non-empty line, not one per character
        if ($Wcur > 0) {
            $fill = chr(genrand(0x7E - 0x20) + 0x20 - 1);
        }

        print {$xfdst} $fill x $Wcur,"\n";
    }
}