From a877083e96aed13360a0e60fb36222efac58124e Mon Sep 17 00:00:00 2001 From: Augists Date: Fri, 11 Feb 2022 11:10:58 +0800 Subject: [PATCH] add support for importing docx file --- controllers/BookController.go | 12 +- models/BookModel.go | 47 ++- utils/docx2md.go | 551 ++++++++++++++++++++++++++++++++++ views/book/index.tpl | 2 +- 4 files changed, 606 insertions(+), 6 deletions(-) create mode 100644 utils/docx2md.go diff --git a/controllers/BookController.go b/controllers/BookController.go index 6fd9d4af..2e26708d 100644 --- a/controllers/BookController.go +++ b/controllers/BookController.go @@ -340,7 +340,7 @@ func (c *BookController) UploadCover() { fileName := "cover_" + strconv.FormatInt(time.Now().UnixNano(), 16) //附件路径按照项目组织 -// filePath := filepath.Join("uploads", book.Identify, "images", fileName+ext) + // filePath := filepath.Join("uploads", book.Identify, "images", fileName+ext) filePath := filepath.Join(conf.WorkingDirectory, "uploads", book.Identify, "images", fileName+ext) path := filepath.Dir(filePath) @@ -571,7 +571,7 @@ func (c *BookController) Copy() { } } -//导入zip压缩包 +// 导入zip压缩包或docx func (c *BookController) Import() { file, moreFile, err := c.GetFile("import-file") @@ -608,7 +608,7 @@ func (c *BookController) Import() { ext := filepath.Ext(moreFile.Filename) - if !strings.EqualFold(ext, ".zip") { + if !strings.EqualFold(ext, ".zip") && !strings.EqualFold(ext, ".docx") { c.JsonResult(6004, "不支持的文件类型") } @@ -643,7 +643,11 @@ func (c *BookController) Import() { book.Editor = "markdown" book.Theme = "default" - go book.ImportBook(tempPath, c.Lang) + if strings.EqualFold(ext, ".zip") { + go book.ImportBook(tempPath, c.Lang) + } else if strings.EqualFold(ext, ".docx") { + go book.ImportWordBook(tempPath, c.Lang) + } logs.Info("用户[", c.Member.Account, "]导入了项目 ->", book) diff --git a/models/BookModel.go b/models/BookModel.go index d78f6ca2..77e441d2 100644 --- a/models/BookModel.go +++ b/models/BookModel.go @@ -680,7 +680,7 @@ func (book *Book) ResetDocumentNumber(bookId int) { } } -//导入项目 +// 导入zip项目 func (book *Book) ImportBook(zipPath string, lang string) error { if !filetil.FileExists(zipPath) { return errors.New("文件不存在 => " + zipPath) @@ -978,6 +978,51 @@ func (book *Book) ImportBook(zipPath string, lang string) error { return err } +// 导入docx项目 +func (book *Book) ImportWordBook(docxPath string, lang string) error { + if !filetil.FileExists(docxPath) { + return errors.New("文件不存在") + } + docxPath = strings.Replace(docxPath, "\\", "/", -1) + + o := orm.NewOrm() + + o.Insert(book) + relationship := NewRelationship() + relationship.BookId = book.BookId + relationship.RoldId = 0 + relationship.MemberId = book.MemberId + relationship.Insert() + + doc := NewDocument() + doc.BookId = book.BookId + doc.MemberId = book.MemberId + docIdentify := strings.Replace(strings.TrimPrefix(docxPath, os.TempDir()+"/"), "/", "-", -1) + + if ok, err := regexp.MatchString(`[a-z]+[a-zA-Z0-9_.\-]*$`, docIdentify); !ok || err != nil { + docIdentify = "import-" + docIdentify + } + + doc.Identify = docIdentify + + if doc.Markdown, err := util.Docx2md(docxPath, false); err != nil { + logs.Error("导入doc项目转换异常 => ", err) + } + + doc.Content = string(blackfriday.Run([]byte(doc.Markdown))) + + doc.Version = time.Now().Unix() + + for _, line := range strings.Split(doc.Markdown, "\n") { + if strings.HasPrefix(line, "#") { + docName := strings.TrimLeft(line, "#") + break + } + } + + doc.DocumentName = strings.TrimSpace(docName) +} + func (book *Book) FindForRoleId(bookId, memberId int) (conf.BookRole, error) { o := orm.NewOrm() diff --git a/utils/docx2md.go b/utils/docx2md.go new file mode 100644 index 00000000..be0d2e12 --- /dev/null +++ b/utils/docx2md.go @@ -0,0 +1,551 @@ +// https://github.com/mattn/docx2md +// License MIT +package util + +import ( + "archive/zip" + "bytes" + "encoding/base64" + "encoding/xml" + "errors" + "flag" + "fmt" + "io" + "io/ioutil" + "log" + "os" + "path" + "path/filepath" + "runtime" + "strconv" + "strings" + + "github.com/mattn/go-runewidth" +) + +// Relationship is +type Relationship struct { + Text string `xml:",chardata"` + ID string `xml:"Id,attr"` + Type string `xml:"Type,attr"` + Target string `xml:"Target,attr"` + TargetMode string `xml:"TargetMode,attr"` +} + +// Relationships is +type Relationships struct { + XMLName xml.Name `xml:"Relationships"` + Text string `xml:",chardata"` + Xmlns string `xml:"xmlns,attr"` + Relationship []Relationship `xml:"Relationship"` +} + +// TextVal is +type TextVal struct { + Text string `xml:",chardata"` + Val string `xml:"val,attr"` +} + +// NumberingLvl is +type NumberingLvl struct { + Text string `xml:",chardata"` + Ilvl string `xml:"ilvl,attr"` + Tplc string `xml:"tplc,attr"` + Tentative string `xml:"tentative,attr"` + Start TextVal `xml:"start"` + NumFmt TextVal `xml:"numFmt"` + LvlText TextVal `xml:"lvlText"` + LvlJc TextVal `xml:"lvlJc"` + PPr struct { + Text string `xml:",chardata"` + Ind struct { + Text string `xml:",chardata"` + Left string `xml:"left,attr"` + Hanging string `xml:"hanging,attr"` + } `xml:"ind"` + } `xml:"pPr"` + RPr struct { + Text string `xml:",chardata"` + U struct { + Text string `xml:",chardata"` + Val string `xml:"val,attr"` + } `xml:"u"` + RFonts struct { + Text string `xml:",chardata"` + Hint string `xml:"hint,attr"` + } `xml:"rFonts"` + } `xml:"rPr"` +} + +// Numbering is +type Numbering struct { + XMLName xml.Name `xml:"numbering"` + Text string `xml:",chardata"` + Wpc string `xml:"wpc,attr"` + Cx string `xml:"cx,attr"` + Cx1 string `xml:"cx1,attr"` + Mc string `xml:"mc,attr"` + O string `xml:"o,attr"` + R string `xml:"r,attr"` + M string `xml:"m,attr"` + V string `xml:"v,attr"` + Wp14 string `xml:"wp14,attr"` + Wp string `xml:"wp,attr"` + W10 string `xml:"w10,attr"` + W string `xml:"w,attr"` + W14 string `xml:"w14,attr"` + W15 string `xml:"w15,attr"` + W16se string `xml:"w16se,attr"` + Wpg string `xml:"wpg,attr"` + Wpi string `xml:"wpi,attr"` + Wne string `xml:"wne,attr"` + Wps string `xml:"wps,attr"` + Ignorable string `xml:"Ignorable,attr"` + AbstractNum []struct { + Text string `xml:",chardata"` + AbstractNumID string `xml:"abstractNumId,attr"` + RestartNumberingAfterBreak string `xml:"restartNumberingAfterBreak,attr"` + Nsid TextVal `xml:"nsid"` + MultiLevelType TextVal `xml:"multiLevelType"` + Tmpl TextVal `xml:"tmpl"` + Lvl []NumberingLvl `xml:"lvl"` + } `xml:"abstractNum"` + Num []struct { + Text string `xml:",chardata"` + NumID string `xml:"numId,attr"` + AbstractNumID TextVal `xml:"abstractNumId"` + } `xml:"num"` +} + +type file struct { + rels Relationships + num Numbering + r *zip.ReadCloser + embed bool + list map[string]int +} + +// Node is +type Node struct { + XMLName xml.Name + Attrs []xml.Attr `xml:"-"` + Content []byte `xml:",innerxml"` + Nodes []Node `xml:",any"` +} + +// UnmarshalXML is +func (n *Node) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { + n.Attrs = start.Attr + type node Node + + return d.DecodeElement((*node)(n), &start) +} + +func escape(s, set string) string { + replacer := []string{} + for _, r := range []rune(set) { + rs := string(r) + replacer = append(replacer, rs, `\`+rs) + } + return strings.NewReplacer(replacer...).Replace(s) +} + +func (zf *file) extract(rel *Relationship, w io.Writer) error { + err := os.MkdirAll(filepath.Dir(rel.Target), 0755) + if err != nil { + return err + } + for _, f := range zf.r.File { + if f.Name != "word/"+rel.Target { + continue + } + rc, err := f.Open() + if err != nil { + return err + } + defer rc.Close() + + b := make([]byte, f.UncompressedSize64) + n, err := rc.Read(b) + if err != nil && err != io.EOF { + return err + } + if zf.embed { + fmt.Fprintf(w, "![](data:image/png;base64,%s)", + base64.StdEncoding.EncodeToString(b[:n])) + } else { + err = ioutil.WriteFile(rel.Target, b, 0644) + if err != nil { + return err + } + fmt.Fprintf(w, "![](%s)", escape(rel.Target, "()")) + } + break + } + return nil +} + +func attr(attrs []xml.Attr, name string) (string, bool) { + for _, attr := range attrs { + if attr.Name.Local == name { + return attr.Value, true + } + } + return "", false +} + +func (zf *file) walk(node *Node, w io.Writer) error { + switch node.XMLName.Local { + case "hyperlink": + fmt.Fprint(w, "[") + var cbuf bytes.Buffer + for _, n := range node.Nodes { + if err := zf.walk(&n, &cbuf); err != nil { + return err + } + } + fmt.Fprint(w, escape(cbuf.String(), "[]")) + fmt.Fprint(w, "]") + + fmt.Fprint(w, "(") + if id, ok := attr(node.Attrs, "id"); ok { + for _, rel := range zf.rels.Relationship { + if id == rel.ID { + fmt.Fprint(w, escape(rel.Target, "()")) + break + } + } + } + fmt.Fprint(w, ")") + case "t": + fmt.Fprint(w, string(node.Content)) + case "pPr": + code := false + for _, n := range node.Nodes { + switch n.XMLName.Local { + case "ind": + if left, ok := attr(n.Attrs, "left"); ok { + if i, err := strconv.Atoi(left); err == nil && i > 0 { + fmt.Fprint(w, strings.Repeat(" ", i/360)) + } + } + case "pStyle": + if val, ok := attr(n.Attrs, "val"); ok { + if strings.HasPrefix(val, "Heading") { + if i, err := strconv.Atoi(val[7:]); err == nil && i > 0 { + fmt.Fprint(w, strings.Repeat("#", i)+" ") + } + } else if val == "Code" { + code = true + } else { + if i, err := strconv.Atoi(val); err == nil && i > 0 { + fmt.Fprint(w, strings.Repeat("#", i)+" ") + } + } + } + case "numPr": + numID := "" + ilvl := "" + numFmt := "" + start := 1 + ind := 0 + for _, nn := range n.Nodes { + if nn.XMLName.Local == "numId" { + if val, ok := attr(nn.Attrs, "val"); ok { + numID = val + } + } + if nn.XMLName.Local == "ilvl" { + if val, ok := attr(nn.Attrs, "val"); ok { + ilvl = val + } + } + } + for _, num := range zf.num.Num { + if numID != num.NumID { + continue + } + for _, abnum := range zf.num.AbstractNum { + if abnum.AbstractNumID != num.AbstractNumID.Val { + continue + } + for _, ablvl := range abnum.Lvl { + if ablvl.Ilvl != ilvl { + continue + } + if i, err := strconv.Atoi(ablvl.Start.Val); err == nil { + start = i + } + if i, err := strconv.Atoi(ablvl.PPr.Ind.Left); err == nil { + ind = i / 360 + } + numFmt = ablvl.NumFmt.Val + break + } + break + } + break + } + + fmt.Fprint(w, strings.Repeat(" ", ind)) + switch numFmt { + case "decimal", "aiueoFullWidth": + key := fmt.Sprintf("%s:%d", numID, ind) + cur, ok := zf.list[key] + if !ok { + zf.list[key] = start + } else { + zf.list[key] = cur + 1 + } + fmt.Fprintf(w, "%d. ", zf.list[key]) + case "bullet": + fmt.Fprint(w, "* ") + } + } + } + if code { + fmt.Fprint(w, "`") + } + for _, n := range node.Nodes { + if err := zf.walk(&n, w); err != nil { + return err + } + } + if code { + fmt.Fprint(w, "`") + } + case "tbl": + var rows [][]string + for _, tr := range node.Nodes { + if tr.XMLName.Local != "tr" { + continue + } + var cols []string + for _, tc := range tr.Nodes { + if tc.XMLName.Local != "tc" { + continue + } + var cbuf bytes.Buffer + if err := zf.walk(&tc, &cbuf); err != nil { + return err + } + cols = append(cols, strings.Replace(cbuf.String(), "\n", "", -1)) + } + rows = append(rows, cols) + } + maxcol := 0 + for _, cols := range rows { + if len(cols) > maxcol { + maxcol = len(cols) + } + } + widths := make([]int, maxcol) + for _, row := range rows { + for i := 0; i < maxcol; i++ { + if i < len(row) { + width := runewidth.StringWidth(row[i]) + if widths[i] < width { + widths[i] = width + } + } + } + } + for i, row := range rows { + if i == 0 { + for j := 0; j < maxcol; j++ { + fmt.Fprint(w, "|") + fmt.Fprint(w, strings.Repeat(" ", widths[j])) + } + fmt.Fprint(w, "|\n") + for j := 0; j < maxcol; j++ { + fmt.Fprint(w, "|") + fmt.Fprint(w, strings.Repeat("-", widths[j])) + } + fmt.Fprint(w, "|\n") + } + for j := 0; j < maxcol; j++ { + fmt.Fprint(w, "|") + if j < len(row) { + width := runewidth.StringWidth(row[j]) + fmt.Fprint(w, escape(row[j], "|")) + fmt.Fprint(w, strings.Repeat(" ", widths[j]-width)) + } else { + fmt.Fprint(w, strings.Repeat(" ", widths[j])) + } + } + fmt.Fprint(w, "|\n") + } + fmt.Fprint(w, "\n") + case "r": + bold := false + italic := false + strike := false + for _, n := range node.Nodes { + if n.XMLName.Local != "rPr" { + continue + } + for _, nn := range n.Nodes { + switch nn.XMLName.Local { + case "b": + bold = true + case "i": + italic = true + case "strike": + strike = true + } + } + } + if strike { + fmt.Fprint(w, "~~") + } + if bold { + fmt.Fprint(w, "**") + } + if italic { + fmt.Fprint(w, "*") + } + var cbuf bytes.Buffer + for _, n := range node.Nodes { + if err := zf.walk(&n, &cbuf); err != nil { + return err + } + } + fmt.Fprint(w, escape(cbuf.String(), `*~\`)) + if italic { + fmt.Fprint(w, "*") + } + if bold { + fmt.Fprint(w, "**") + } + if strike { + fmt.Fprint(w, "~~") + } + case "p": + for _, n := range node.Nodes { + if err := zf.walk(&n, w); err != nil { + return err + } + } + fmt.Fprintln(w) + case "blip": + if id, ok := attr(node.Attrs, "embed"); ok { + for _, rel := range zf.rels.Relationship { + if id != rel.ID { + continue + } + if err := zf.extract(&rel, w); err != nil { + return err + } + } + } + case "Fallback": + case "txbxContent": + var cbuf bytes.Buffer + for _, n := range node.Nodes { + if err := zf.walk(&n, &cbuf); err != nil { + return err + } + } + fmt.Fprintln(w, "\n```\n"+cbuf.String()+"```") + default: + for _, n := range node.Nodes { + if err := zf.walk(&n, w); err != nil { + return err + } + } + } + + return nil +} + +func readFile(f *zip.File) (*Node, error) { + rc, err := f.Open() + defer rc.Close() + + b, _ := ioutil.ReadAll(rc) + if err != nil { + return nil, err + } + + var node Node + err = xml.Unmarshal(b, &node) + if err != nil { + return nil, err + } + return &node, nil +} + +func findFile(files []*zip.File, target string) *zip.File { + for _, f := range files { + if ok, _ := path.Match(target, f.Name); ok { + return f + } + } + return nil +} + +func Docx2md(arg string, embed bool) (string, error) { + r, err := zip.OpenReader(arg) + if err != nil { + return err + } + defer r.Close() + + var rels Relationships + var num Numbering + + for _, f := range r.File { + switch f.Name { + case "word/_rels/document.xml.rels": + rc, err := f.Open() + defer rc.Close() + + b, _ := ioutil.ReadAll(rc) + if err != nil { + return err + } + + err = xml.Unmarshal(b, &rels) + if err != nil { + return err + } + case "word/numbering.xml": + rc, err := f.Open() + defer rc.Close() + + b, _ := ioutil.ReadAll(rc) + if err != nil { + return err + } + + err = xml.Unmarshal(b, &num) + if err != nil { + return err + } + } + } + + f := findFile(r.File, "word/document*.xml") + if f == nil { + return errors.New("incorrect document") + } + node, err := readFile(f) + if err != nil { + return err + } + + var buf bytes.Buffer + zf := &file{ + r: r, + rels: rels, + num: num, + embed: embed, + list: make(map[string]int), + } + err = zf.walk(node, &buf) + if err != nil { + return nil, err + } + + return buf.String(), nil +} diff --git a/views/book/index.tpl b/views/book/index.tpl index 6c6d819c..c04f8b08 100644 --- a/views/book/index.tpl +++ b/views/book/index.tpl @@ -465,7 +465,7 @@ 'required': true, 'validateInitialCount': true, "language" : "{{i18n $.Lang "common.upload_lang"}}", - 'allowedFileExtensions': ['zip'], + 'allowedFileExtensions': ['zip', 'docx'], 'msgPlaceholder' : '{{i18n $.Lang "message.file_type_placeholder"}}', 'elErrorContainer' : "#import-book-form-error-message", 'uploadExtraData' : function () {